From 9ed740937eddd13197c21926a1ca04218f251118 Mon Sep 17 00:00:00 2001 From: phaer Date: Mon, 27 Feb 2023 18:46:22 +0100 Subject: [PATCH] feat: re-implement fetch-python-requirements... in Python. This canonicalizes package names with the official `packaging` package and handles TLS CA certificates, so that pip does not need --trusted-host args anymore. --- .../fetch-python-requirements.nix | 6 +- .../fetch-python-requirements.py | 138 ++++++++++++++++++ .../fetch-python-requirements.sh | 74 ---------- .../filter-pypi-responses.py | 11 +- 4 files changed, 151 insertions(+), 78 deletions(-) create mode 100644 v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.py delete mode 100755 v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.sh diff --git a/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.nix b/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.nix index c408403e..bebec297 100644 --- a/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.nix +++ b/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.nix @@ -96,7 +96,7 @@ let # we use mitmproxy to filter the pypi responses pythonWithMitmproxy = - python3.withPackages (ps: [ ps.mitmproxy ps.python-dateutil ]); + python3.withPackages (ps: [ ps.mitmproxy ps.python-dateutil ps.packaging]); # fixed output derivation containing downloaded packages, # each being symlinked from it's normalized name @@ -165,7 +165,7 @@ let pythonBin = python.interpreter; filterPypiResponsesScript = ./filter-pypi-responses.py; - buildScript = ./fetch-python-requirements.sh; + buildScript = ./fetch-python-requirements.py; inherit pythonWithMitmproxy pipVersion @@ -183,7 +183,7 @@ let '' -r ${lib.concatStringsSep " -r " (map toString finalAttrs.requirementsFiles)}''; buildPhase = '' - bash $buildScript + $pythonWithMitmproxy/bin/python $buildScript ''; }); in self; diff --git a/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.py 
b/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.py new file mode 100644 index 00000000..1a5c62f9 --- /dev/null +++ b/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.py @@ -0,0 +1,138 @@ +import os +import socket +import ssl +import subprocess +import time +import dateutil.parser +import urllib.request +from pathlib import Path + +import certifi +from packaging.utils import canonicalize_name, parse_sdist_filename, parse_wheel_filename + + +HOME = Path(os.getcwd()) +OUT = Path(os.getenv("out")) +PYTHON_BIN = os.getenv("pythonBin") +PYTHON_WITH_MITM_PROXY = os.getenv("pythonWithMitmproxy") +FILTER_PYPI_RESPONSE_SCRIPTS = os.getenv("filterPypiResponsesScript") +PIP_VERSION = os.getenv("pipVersion") +PIP_FLAGS = os.getenv('pipFlags') +ONLY_BINARY_FLAGS = os.getenv('onlyBinaryFlags') +REQUIREMENTS_LIST = os.getenv('requirementsList') +REQUIREMENTS_FLAGS = os.getenv('requirementsFlags') + +def get_max_date(): + try: + return int(os.getenv("MAX_DATE")) + except ValueError: + return dateutil.parser.parse(os.getenv("MAX_DATE")) + + +def get_free_port(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + +def start_mitmproxy(port): + proc = subprocess.Popen( + [ + f"{PYTHON_WITH_MITM_PROXY}/bin/mitmdump", + "--listen-port", str(port), + "--ignore-hosts", ".*files.pythonhosted.org.*", + "--script", FILTER_PYPI_RESPONSE_SCRIPTS + ], + env = { + "MAX_DATE": os.getenv('MAX_DATE'), + "HOME": HOME + } + ) + return proc + + +def wait_for_proxy(proxy_port, cafile): + timeout = time.time() + 60 * 5 + req = urllib.request.Request('https://pypi.org') + req.set_proxy(f'127.0.0.1:{proxy_port}', 'http') + req.set_proxy(f'127.0.0.1:{proxy_port}', 'https') + + context = ssl.create_default_context(cafile=cafile) + while time.time() < timeout: + try: + res = urllib.request.urlopen(req, None, 5, context=context) + if res.status < 400: + break + except urllib.error.URLError 
as e: + pass + finally: + time.sleep(1) + + +# as we only proxy *some* calls, we need to combine upstream +# ca certificates and the one from mitm proxy +def generate_ca_bundle(path): + with open(HOME / ".mitmproxy/mitmproxy-ca-cert.pem", "r") as f: + mitmproxy_cacert = f.read() + with open(certifi.where(), "r") as f: + certifi_cacert = f.read() + with open(path, "w") as f: + f.write(mitmproxy_cacert) + f.write("\n") + f.write(certifi_cacert) + return path + +def create_venv(path): + subprocess.run([PYTHON_BIN, '-m', 'venv', path], check=True) + + +def pip(venv_path, *args): + subprocess.run([f"{venv_path}/bin/pip", *args], check=True) + + +if __name__ == '__main__': + OUT.mkdir() + dist_path = OUT / "dist" + names_path = OUT / "names" + dist_path.mkdir() + names_path.mkdir() + + print(f"selected maximum release date for python packages: {get_max_date()}") + proxy_port = get_free_port() + + proxy = start_mitmproxy(proxy_port) + + venv_path = Path('.venv').absolute() + create_venv(venv_path) + pip(venv_path, 'install', '--upgrade', f'pip=={PIP_VERSION}') + + cafile = generate_ca_bundle(HOME / ".ca-cert.pem") + wait_for_proxy(proxy_port, cafile) + + optional_flags = [PIP_FLAGS, ONLY_BINARY_FLAGS, REQUIREMENTS_LIST, REQUIREMENTS_FLAGS] + optional_flags = " ".join(filter(None, optional_flags)).split(" ") + pip( + venv_path, + 'download', + '--no-cache', + '--dest', dist_path, + '--progress-bar', 'off', + '--proxy', f'https://localhost:{proxy_port}', + '--cert', cafile, + *optional_flags + ) + + proxy.kill() + + for dist_file in dist_path.iterdir(): + if dist_file.suffix == '.whl': + name = parse_wheel_filename(dist_file.name)[0] + else: + name = parse_sdist_filename(dist_file.name)[0] + pname = canonicalize_name(name) + name_path = names_path / pname + print(f'creating link {name_path} -> {dist_file}') + name_path.mkdir() + (name_path / dist_file.name).symlink_to(f"../../dist/{dist_file.name}") diff --git 
a/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.sh b/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.sh deleted file mode 100755 index c8f5d758..00000000 --- a/v1/nix/pkgs/fetchPythonRequirements/fetch-python-requirements.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -Eeuo pipefail -# the script.py will read this date -pretty=$(python -c ' -import os; import dateutil.parser; -try: - print(int(os.getenv("MAX_DATE"))) -except ValueError: - print(dateutil.parser.parse(os.getenv("MAX_DATE"))) -') -echo "selected maximum release date for python packages: $pretty" - -# find free port for proxy -proxyPort=$(python -c '\ -import socket -s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) -s.bind(("", 0)) -print(s.getsockname()[1]) -s.close()') - -# start proxy to filter pypi responses -# mitmproxy wants HOME set -# mitmdump == mitmproxy without GUI -HOME=$(pwd) $pythonWithMitmproxy/bin/mitmdump \ - --listen-port "$proxyPort" \ - --ignore-hosts '.*files.pythonhosted.org.*' \ - --script $filterPypiResponsesScript & -proxyPID=$! - -# install specified version of pip first to ensure reproducible resolver logic -$pythonBin -m venv .venv -.venv/bin/pip install --upgrade pip==$pipVersion -fetcherPip=.venv/bin/pip - -# wait for proxy to come up -while sleep 0.5; do - timeout 5 curl -fs --proxy http://localhost:$proxyPort http://pypi.org && break -done - -# make pip query pypi through the filtering proxy -# FIXME: pip does not return ifit crashes. The build will freeze indefinitely. 
-mkdir "$out" -mkdir "$out/dist" -$fetcherPip download \ - --no-cache \ - --dest "$out/dist" \ - --progress-bar off \ - --proxy http://localhost:$proxyPort \ - --trusted-host pypi.org \ - --trusted-host files.pythonhosted.org \ - $pipFlags \ - $onlyBinaryFlags \ - $(printf " %s" "${requirementsList[@]}") \ - $requirementsFlags - -# terminate proxy -echo "killing proxy with PID: $proxyPID" -kill $proxyPID - -# create symlinks to allow files being referenced via their normalized package names -# Example: -# "$out/names/werkzeug" will point to "$out/dist/Werkzeug-0.14.1-py2.py3-none-any.whl" -cd "$out/dist" -mkdir "$out/names" -for f in $(ls "$out/dist"); do - if [[ "$f" == *.whl ]]; then - pname=$(echo "$f" | cut -d "-" -f 1 | sed -e 's/_/-/' -e 's/\./-/' -e 's/\(.*\)/\L\1/') - else - pname=$(echo "${f%-*}" | sed -e 's/_/-/' -e 's/\./-/' -e 's/\(.*\)/\L\1/') - fi - echo "creating link $out/names/$pname" - mkdir "$out/names/$pname" - ln -s "../../dist/$f" "$out/names/$pname/$f" -done diff --git a/v1/nix/pkgs/fetchPythonRequirements/filter-pypi-responses.py b/v1/nix/pkgs/fetchPythonRequirements/filter-pypi-responses.py index ba3f86a9..2e97b53c 100644 --- a/v1/nix/pkgs/fetchPythonRequirements/filter-pypi-responses.py +++ b/v1/nix/pkgs/fetchPythonRequirements/filter-pypi-responses.py @@ -9,7 +9,10 @@ It has to do one extra api request for each queried package name """ import json import os +import sys +import ssl from urllib.request import Request, urlopen +from pathlib import Path import dateutil.parser import gzip @@ -21,11 +24,17 @@ Query the pypi json api to get timestamps for all release files of the given pna return all file names which are newer than the given timestamp """ def get_files_to_hide(pname, max_ts): + ca_file = Path(os.getenv('HOME')) / ".ca-cert.pem" + context = ssl.create_default_context(cafile=ca_file) + if not ca_file.exists(): + print("mitmproxy ca not found") + sys.exit(1) + # query the api url = f"https://pypi.org/pypi/{pname}/json" req = 
Request(url) req.add_header('Accept-Encoding', 'gzip') - with urlopen(req) as response: + with urlopen(req, context=context) as response: content = gzip.decompress(response.read()) resp = json.loads(content)