feat: re-implement fetch-python-requirements...

in Python. This canonicalizes package names with the official
`packaging` package and handles TLS CA certificates, so that
pip does not need --trusted-host args anymore.
This commit is contained in:
phaer 2023-02-27 18:46:22 +01:00 committed by DavHau
parent d3a15ccda7
commit 9ed740937e
4 changed files with 151 additions and 78 deletions

View File

@ -96,7 +96,7 @@ let
# we use mitmproxy to filter the pypi responses
pythonWithMitmproxy =
python3.withPackages (ps: [ ps.mitmproxy ps.python-dateutil ]);
python3.withPackages (ps: [ ps.mitmproxy ps.python-dateutil ps.packaging]);
# fixed output derivation containing downloaded packages,
# each being symlinked from its normalized name
@ -165,7 +165,7 @@ let
pythonBin = python.interpreter;
filterPypiResponsesScript = ./filter-pypi-responses.py;
buildScript = ./fetch-python-requirements.sh;
buildScript = ./fetch-python-requirements.py;
inherit
pythonWithMitmproxy
pipVersion
@ -183,7 +183,7 @@ let
'' -r ${lib.concatStringsSep " -r " (map toString finalAttrs.requirementsFiles)}'';
buildPhase = ''
bash $buildScript
$pythonWithMitmproxy/bin/python $buildScript
'';
});
in self;

View File

@ -0,0 +1,138 @@
import os
import socket
import ssl
import subprocess
import time
import dateutil.parser
import urllib.request
from pathlib import Path
import certifi
from packaging.utils import canonicalize_name, parse_sdist_filename, parse_wheel_filename
# Build configuration. Apart from HOME (the current working directory), every
# value is read from the process environment; a variable that is not set
# yields None (and makes the Path() call for "out" fail).
HOME = Path.cwd()
OUT = Path(os.environ.get("out"))
PYTHON_BIN = os.environ.get("pythonBin")
PYTHON_WITH_MITM_PROXY = os.environ.get("pythonWithMitmproxy")
FILTER_PYPI_RESPONSE_SCRIPTS = os.environ.get("filterPypiResponsesScript")
PIP_VERSION = os.environ.get("pipVersion")
PIP_FLAGS = os.environ.get("pipFlags")
ONLY_BINARY_FLAGS = os.environ.get("onlyBinaryFlags")
REQUIREMENTS_LIST = os.environ.get("requirementsList")
REQUIREMENTS_FLAGS = os.environ.get("requirementsFlags")
def get_max_date():
    """Return the value of the MAX_DATE environment variable in parsed form.

    Returns:
        An ``int`` when MAX_DATE is an integer literal, otherwise whatever
        ``dateutil.parser.parse`` produces for the string.
    """
    raw_value = os.getenv("MAX_DATE")
    try:
        parsed = int(raw_value)
    except ValueError:
        # not an integer literal, so let dateutil interpret it as a date string
        parsed = dateutil.parser.parse(raw_value)
    return parsed
def get_free_port():
    """Ask the operating system for a currently unused TCP port.

    Binds a throw-away IPv4 socket to port 0 so the OS picks a free port,
    reads the chosen port number back and releases the socket again.

    Returns:
        The port number as an ``int``.

    Note:
        The port is released before it is returned, so another process could
        claim it before the caller binds to it.
    """
    # the context manager closes the socket even if bind()/getsockname() raises
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]
def start_mitmproxy(port):
    """Launch ``mitmdump`` with the pypi response filter script loaded.

    Args:
        port: TCP port the proxy should listen on.

    Returns:
        The ``subprocess.Popen`` handle of the started proxy process.
    """
    command = [
        f"{PYTHON_WITH_MITM_PROXY}/bin/mitmdump",
        "--listen-port", str(port),
        "--ignore-hosts", ".*files.pythonhosted.org.*",
        "--script", FILTER_PYPI_RESPONSE_SCRIPTS,
    ]
    # The child process gets exactly these two variables and nothing else from
    # the parent environment.
    proxy_env = {"MAX_DATE": os.getenv('MAX_DATE'), "HOME": HOME}
    return subprocess.Popen(command, env=proxy_env)
def wait_for_proxy(proxy_port, cafile):
    """Block until https://pypi.org can be fetched through the local proxy.

    Polls once per second for up to five minutes.

    Args:
        proxy_port: Port on 127.0.0.1 the proxy listens on.
        cafile: Path to the CA bundle used to verify the TLS connection.

    Raises:
        TimeoutError: if no request succeeded before the deadline. Without
            this the caller would carry on against a proxy that never came up.
    """
    # monotonic clock: the deadline must not move if the wall clock is adjusted
    deadline = time.monotonic() + 60 * 5
    req = urllib.request.Request('https://pypi.org')
    req.set_proxy(f'127.0.0.1:{proxy_port}', 'http')
    req.set_proxy(f'127.0.0.1:{proxy_port}', 'https')
    context = ssl.create_default_context(cafile=cafile)
    while time.monotonic() < deadline:
        try:
            # "with" closes the response (and its socket) after every attempt
            with urllib.request.urlopen(req, None, 5, context=context) as res:
                if res.status < 400:
                    return
        except OSError:
            # URLError, HTTPError, socket timeouts, connection resets and TLS
            # errors all derive from OSError; the proxy is simply not ready yet.
            pass
        # only reached after a failed attempt, so success returns immediately
        time.sleep(1)
    raise TimeoutError(
        f"proxy on port {proxy_port} did not become ready within 5 minutes"
    )
# Only *some* calls go through the proxy, so the bundle has to contain both
# the upstream CA certificates and the certificate of mitmproxy.
def generate_ca_bundle(path):
    """Write a combined CA bundle to ``path`` and return ``path``.

    The bundle is the mitmproxy CA certificate found below HOME, a newline,
    and then the contents of the certifi CA file.
    """
    mitmproxy_pem_path = HOME / ".mitmproxy/mitmproxy-ca-cert.pem"
    with open(mitmproxy_pem_path, "r") as pem_file:
        mitmproxy_pem = pem_file.read()
    with open(certifi.where(), "r") as pem_file:
        upstream_pem = pem_file.read()

    with open(path, "w") as bundle:
        bundle.write("\n".join((mitmproxy_pem, upstream_pem)))
    return path
def create_venv(path):
    """Create a virtual environment at ``path`` with the PYTHON_BIN interpreter.

    Raises:
        subprocess.CalledProcessError: if ``python -m venv`` exits non-zero.
    """
    command = [PYTHON_BIN, '-m', 'venv', path]
    subprocess.run(command, check=True)
def pip(venv_path, *args):
    """Run the ``pip`` executable of the given virtual environment.

    Args:
        venv_path: Root directory of the virtual environment.
        *args: Command line arguments passed to pip unchanged.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_executable = f"{venv_path}/bin/pip"
    subprocess.run([pip_executable, *args], check=True)
if __name__ == '__main__':
    # Output layout:
    #   $out/dist/<file>                    downloaded wheels and sdists
    #   $out/names/<canonical name>/<file>  relative symlink to ../../dist/<file>
    OUT.mkdir()
    dist_path = OUT / "dist"
    names_path = OUT / "names"
    dist_path.mkdir()
    names_path.mkdir()

    print(f"selected maximum release date for python packages: {get_max_date()}")

    proxy_port = get_free_port()
    proxy = start_mitmproxy(proxy_port)
    try:
        # install the pinned pip version into a fresh venv before downloading
        venv_path = Path('.venv').absolute()
        create_venv(venv_path)
        pip(venv_path, 'install', '--upgrade', f'pip=={PIP_VERSION}')

        cafile = generate_ca_bundle(HOME / ".ca-cert.pem")
        wait_for_proxy(proxy_port, cafile)

        # Each variable may be unset, empty, or hold several space separated
        # arguments. split() without a separator drops empty fields, so pip is
        # never handed an empty-string argument (split(" ") would produce one
        # for an empty string or for repeated spaces).
        optional_flags = [PIP_FLAGS, ONLY_BINARY_FLAGS, REQUIREMENTS_LIST, REQUIREMENTS_FLAGS]
        optional_flags = " ".join(filter(None, optional_flags)).split()

        pip(
            venv_path,
            'download',
            '--no-cache',
            '--dest', dist_path,
            '--progress-bar', 'off',
            '--proxy', f'https://localhost:{proxy_port}',
            '--cert', cafile,
            *optional_flags
        )
    finally:
        # stop the proxy even when a step above failed, and reap the process
        proxy.kill()
        proxy.wait()

    # sorted only to make the log output deterministic
    for dist_file in sorted(dist_path.iterdir()):
        if dist_file.suffix == '.whl':
            name = parse_wheel_filename(dist_file.name)[0]
        else:
            name = parse_sdist_filename(dist_file.name)[0]
        pname = canonicalize_name(name)
        name_path = names_path / pname
        print(f'creating link {name_path} -> {dist_file}')
        # several files can map to the same canonical name
        name_path.mkdir(exist_ok=True)
        (name_path / dist_file.name).symlink_to(f"../../dist/{dist_file.name}")

View File

@ -1,74 +0,0 @@
#!/usr/bin/env bash
# Downloads python requirements with pip through a filtering mitmproxy into
# "$out/dist" and creates per-package symlink directories in "$out/names".
# Variables used below: MAX_DATE, pythonWithMitmproxy,
# filterPypiResponsesScript, pythonBin, pipVersion, pipFlags, onlyBinaryFlags,
# requirementsList, requirementsFlags and out.
set -Eeuo pipefail
# the script.py will read this date
# (presumably "script.py" means $filterPypiResponsesScript — confirm)
# Here MAX_DATE is only converted for the log line below: printed as an
# integer if it parses as one, otherwise as parsed by dateutil.
pretty=$(python -c '
import os; import dateutil.parser;
try:
print(int(os.getenv("MAX_DATE")))
except ValueError:
print(dateutil.parser.parse(os.getenv("MAX_DATE")))
')
echo "selected maximum release date for python packages: $pretty"
# find free port for proxy
# (bind to port 0, print the port that was assigned, close the socket again)
proxyPort=$(python -c '\
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("", 0))
print(s.getsockname()[1])
s.close()')
# start proxy to filter pypi responses
# mitmproxy wants HOME set
# mitmdump == mitmproxy without GUI
# The proxy runs in the background; its PID is kept to kill it later.
HOME=$(pwd) $pythonWithMitmproxy/bin/mitmdump \
--listen-port "$proxyPort" \
--ignore-hosts '.*files.pythonhosted.org.*' \
--script $filterPypiResponsesScript &
proxyPID=$!
# install specified version of pip first to ensure reproducible resolver logic
$pythonBin -m venv .venv
.venv/bin/pip install --upgrade pip==$pipVersion
fetcherPip=.venv/bin/pip
# wait for proxy to come up
# Retries every 0.5s until a request through the proxy succeeds; there is no
# upper bound on the number of attempts.
while sleep 0.5; do
timeout 5 curl -fs --proxy http://localhost:$proxyPort http://pypi.org && break
done
# make pip query pypi through the filtering proxy
# FIXME: pip does not return if it crashes. The build will freeze indefinitely.
mkdir "$out"
mkdir "$out/dist"
$fetcherPip download \
--no-cache \
--dest "$out/dist" \
--progress-bar off \
--proxy http://localhost:$proxyPort \
--trusted-host pypi.org \
--trusted-host files.pythonhosted.org \
$pipFlags \
$onlyBinaryFlags \
$(printf " %s" "${requirementsList[@]}") \
$requirementsFlags
# terminate proxy
echo "killing proxy with PID: $proxyPID"
kill $proxyPID
# create symlinks to allow files being referenced via their normalized package names
# Example:
# "$out/names/werkzeug" will point to "$out/dist/Werkzeug-0.14.1-py2.py3-none-any.whl"
# NOTE(review): the sed expressions below have no "g" flag, so only the first
# "_" and the first "." of a name are replaced by "-" before lowercasing.
# NOTE(review): iterating over $(ls ...) word-splits the file names.
cd "$out/dist"
mkdir "$out/names"
for f in $(ls "$out/dist"); do
# wheels: the package name is the part before the first "-"
if [[ "$f" == *.whl ]]; then
pname=$(echo "$f" | cut -d "-" -f 1 | sed -e 's/_/-/' -e 's/\./-/' -e 's/\(.*\)/\L\1/')
else
# everything else: the package name is the part before the last "-"
pname=$(echo "${f%-*}" | sed -e 's/_/-/' -e 's/\./-/' -e 's/\(.*\)/\L\1/')
fi
echo "creating link $out/names/$pname"
mkdir "$out/names/$pname"
ln -s "../../dist/$f" "$out/names/$pname/$f"
done

View File

@ -9,7 +9,10 @@ It has to do one extra api request for each queried package name
"""
import json
import os
import sys
import ssl
from urllib.request import Request, urlopen
from pathlib import Path
import dateutil.parser
import gzip
@ -21,11 +24,17 @@ Query the pypi json api to get timestamps for all release files of the given pna
return all file names which are newer than the given timestamp
"""
def get_files_to_hide(pname, max_ts):
ca_file = Path(os.getenv('HOME')) / ".ca-cert.pem"
context = ssl.create_default_context(cafile=ca_file)
if not ca_file.exists():
print("mitmproxy ca not found")
sys.exit(1)
# query the api
url = f"https://pypi.org/pypi/{pname}/json"
req = Request(url)
req.add_header('Accept-Encoding', 'gzip')
with urlopen(req) as response:
with urlopen(req, context=context) as response:
content = gzip.decompress(response.read())
resp = json.loads(content)