2023-02-27 20:19:20 +03:00
|
|
|
"""
|
2023-06-16 13:19:12 +03:00
|
|
|
This script is part of fetchPipMetadata
|
2023-02-27 20:19:20 +03:00
|
|
|
It is meant to be used with mitmproxy via `--script`
|
|
|
|
It will filter api responses from the pypi.org api (used by pip),
|
2023-03-24 07:13:29 +03:00
|
|
|
to only contain files with release date < pypiSnapshotDate
|
2023-02-27 20:19:20 +03:00
|
|
|
|
|
|
|
For retrieving the release dates for files, it uses the pypi.org json api
|
|
|
|
It has to do one extra api request for each queried package name
|
|
|
|
"""
|
|
|
|
import json
|
|
|
|
import os
|
2023-02-27 20:46:22 +03:00
|
|
|
import sys
|
|
|
|
import ssl
|
2023-02-27 20:19:20 +03:00
|
|
|
from urllib.request import Request, urlopen
|
2023-02-27 20:46:22 +03:00
|
|
|
from pathlib import Path
|
2023-02-27 20:19:20 +03:00
|
|
|
import dateutil.parser
|
|
|
|
import gzip
|
|
|
|
|
|
|
|
from mitmproxy import http
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
Query the pypi json api to get timestamps for all release files of the given pname.
|
|
|
|
return all file names which are newer than the given timestamp
|
|
|
|
"""
|
2023-03-12 18:29:53 +03:00
|
|
|
|
|
|
|
|
2023-02-27 20:19:20 +03:00
|
|
|
def get_files_to_hide(pname, max_ts):
    """Return the names of all release files of `pname` newer than `max_ts`.

    Queries the pypi.org JSON API (``https://pypi.org/pypi/<pname>/json``)
    and compares the ``upload_time`` of every release file against the
    cutoff.

    Args:
        pname: Package name to look up.
        max_ts: Cutoff as a unix timestamp. Files uploaded strictly after
            this moment are returned.

    Returns:
        A set of file names that should be hidden from the API response.

    Exits the process with status 1 if ``~/.ca-cert.pem`` does not exist.
    """
    ca_file = Path.home() / ".ca-cert.pem"
    # Check for the certificate *before* building the SSL context:
    # create_default_context() raises FileNotFoundError for a missing cafile,
    # which would pre-empt the friendlier message below.
    if not ca_file.exists():
        print("mitmproxy ca not found")
        sys.exit(1)
    context = ssl.create_default_context(cafile=ca_file)

    # query the api
    url = f"https://pypi.org/pypi/{pname}/json"
    req = Request(url)
    req.add_header("Accept-Encoding", "gzip")
    with urlopen(req, context=context) as response:
        content = response.read()
        # Only decompress when the server actually honoured the gzip request;
        # an identity-encoded body would make gzip.decompress() raise.
        encoding = response.headers.get("Content-Encoding", "")
        if "gzip" in encoding.lower():
            content = gzip.decompress(content)
    resp = json.loads(content)

    # collect files to hide
    # NOTE(review): if upload_time carries no UTC offset, .timestamp()
    # interprets it in the local timezone -- confirm this matches how max_ts
    # was derived.
    files = set()
    for releases in resp["releases"].values():
        for release in releases:
            ts = dateutil.parser.parse(release["upload_time"]).timestamp()
            if ts > max_ts:
                files.add(release["filename"])
    return files
|
|
|
|
|
|
|
|
|
|
|
|
# accept unix timestamp or human readable format
|
|
|
|
# Read the cutoff once from the environment. It may be given either as an
# integer unix timestamp or as any date string dateutil can parse.
_snapshot_date = os.getenv("pypiSnapshotDate")
if _snapshot_date is None:
    # int(None) would raise a TypeError that the ValueError handler below
    # does not catch; fail with an explicit message instead.
    print("pypiSnapshotDate is not set")
    sys.exit(1)
try:
    max_ts = int(_snapshot_date)
except ValueError:
    # Not a plain integer: treat it as a human readable date.
    max_date = dateutil.parser.parse(_snapshot_date)
    max_ts = max_date.timestamp()
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
Response format:
|
|
|
|
{
|
|
|
|
"files": [
|
|
|
|
{
|
|
|
|
"filename": "pip-0.2.tar.gz",
|
|
|
|
"hashes": {
|
|
|
|
"sha256": "88bb8d029e1bf4acd0e04d300104b7440086f94cc1ce1c5c3c31e3293aee1f81"
|
|
|
|
},
|
|
|
|
"requires-python": null,
|
|
|
|
"url": "https://files.pythonhosted.org/packages/3d/9d/1e313763bdfb6a48977b65829c6ce2a43eaae29ea2f907c8bbef024a7219/pip-0.2.tar.gz",
|
|
|
|
"yanked": false
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"filename": "pip-0.2.1.tar.gz",
|
|
|
|
"hashes": {
|
|
|
|
"sha256": "83522005c1266cc2de97e65072ff7554ac0f30ad369c3b02ff3a764b962048da"
|
|
|
|
},
|
|
|
|
"requires-python": null,
|
|
|
|
"url": "https://files.pythonhosted.org/packages/18/ad/c0fe6cdfe1643a19ef027c7168572dac6283b80a384ddf21b75b921877da/pip-0.2.1.tar.gz",
|
|
|
|
"yanked": false
|
|
|
|
}
|
|
|
|
]
}
|
|
|
|
"""
|
2023-03-12 18:29:53 +03:00
|
|
|
|
|
|
|
|
2023-02-27 20:19:20 +03:00
|
|
|
def response(flow: http.HTTPFlow) -> None:
    """Remove files newer than the snapshot date from a "/simple/" response.

    Responses whose request URL does not contain "/simple/" are left alone.
    Otherwise the package name is taken from the last URL path segment, the
    set of too-new files is looked up via get_files_to_hide(), and those
    entries are removed from the "files" list of the JSON response body.

    Args:
        flow: The HTTP flow whose response body may be rewritten in place.
    """
    if "/simple/" not in flow.request.url:
        return

    # The package name is the last path segment, ignoring a trailing slash.
    pname = flow.request.url.strip("/").split("/")[-1]
    bad_files = get_files_to_hide(pname, max_ts)
    if not bad_files:
        # Nothing to hide: leave the response body untouched instead of
        # parsing and re-serializing it for no effect.
        return

    print(f"removing the following files from the API response:\n {bad_files}")
    data = json.loads(flow.response.text)
    data["files"] = [
        entry for entry in data["files"] if entry["filename"] not in bad_files
    ]
    flow.response.text = json.dumps(data)
|