gitbutler/scripts/gen_branch_testdata.py

124 lines
3.3 KiB
Python
Raw Normal View History

2023-06-13 19:48:59 +03:00
#!/usr/bin/env /usr/bin/python3
2023-07-12 12:40:23 +03:00
# This file was used to generate test data before GitButler was able to work
# with real user data. It is unused for now, but we'll keep it around for a
# while in case it turns out to be useful for e.g. e2e testing.
2023-06-13 19:48:59 +03:00
import subprocess
import json
2023-06-15 02:25:06 +03:00
import openai
import os
openai.api_key = os.getenv("OPENAI_API_KEY")
2023-06-13 19:48:59 +03:00
# Fail fast with an actionable message when a prerequisite is missing.
# `raise SystemExit` is used rather than the `exit()` helper: `exit` is added
# to builtins by the `site` module and is not guaranteed to be available
# (for example when the interpreter is started with `-S`).
try:
    from unidiff import PatchSet
except ImportError:
    print(
        "unidiff is not installed, please install it first with: python3 -m pip install unidiff"
    )
    raise SystemExit(1)

# With shell=True a missing `gh` binary shows up as a non-zero exit status
# from the shell, which check_output reports as CalledProcessError.
try:
    subprocess.check_output("gh --version", shell=True, text=True)
except subprocess.CalledProcessError:
    print("gh is not installed, please install it first from https://cli.github.com/")
    raise SystemExit(1)
def get_last_n_pr_nums(n_prs):
    """Return the numbers of the most recently merged pull requests.

    Args:
        n_prs: Maximum number of pull request numbers to return.

    Returns:
        A list of PR numbers as strings, in the order `gh pr list` reports
        them. Empty when `n_prs` is zero or negative.

    Raises:
        subprocess.CalledProcessError: if the `gh` command fails.
    """
    count = int(n_prs)
    if count < 1:
        # gh rejects a non-positive --limit; nothing was asked for anyway.
        return []

    # Pass the limit to gh itself: `gh pr list` fetches only 30 items by
    # default, so truncating its output afterwards could never yield more
    # than that. --json/--jq give one bare number per line without having to
    # parse the human-oriented table, and the argument list avoids the shell.
    listing = subprocess.check_output(
        [
            "gh",
            "pr",
            "list",
            "--state",
            "merged",
            "--limit",
            str(count),
            "--json",
            "number",
            "--jq",
            ".[].number",
        ],
        text=True,
    )
    return listing.splitlines()
2023-06-15 02:25:06 +03:00
def summarize_hunk(hunk):
    """Ask the OpenAI chat API for a one-line summary of a diff hunk.

    Only the first 1000 characters of `hunk` are placed in the prompt and the
    reply is limited to 32 tokens. Returns the text of the first choice with
    surrounding whitespace stripped.
    """
    excerpt = hunk[:1000]
    template = (
        "\n"
        "Summarize the following git diff hunk in less than 80 characters:\n"
        "```\n"
        "{hunk}\n"
        "```\n"
    )
    conversation = [{"role": "user", "content": template.format(hunk=excerpt)}]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=32,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
2023-06-13 19:48:59 +03:00
def _first_line(text):
    """Return the first line of `text`, or "" when there is none."""
    lines = (text or "").splitlines()
    return lines[0] if lines else ""


def process_pr(pr_number):
    """Build the test-data "branch" record for one GitHub pull request.

    The PR's head branch name, update time, title and body are read with
    `gh pr view`, and its diff with `gh pr diff`. Every hunk of every file in
    the diff becomes a "hunk" entry (named via `summarize_hunk`), grouped
    under a "file" entry, all collected into a single "branch" dict.

    Args:
        pr_number: The pull request number (string or int).

    Returns:
        A JSON-serialisable dict with the keys "id", "name", "active",
        "kind", "files" and "description".

    Raises:
        subprocess.CalledProcessError: if a `gh` command fails.
    """
    pr_ref = str(pr_number)

    # Fetch all metadata in one round trip instead of one `gh pr view` call
    # per field, and pass an argument list so nothing is interpreted by a
    # shell.
    meta = json.loads(
        subprocess.check_output(
            ["gh", "pr", "view", pr_ref, "--json", "headRefName,updatedAt,title,body"],
            text=True,
        )
    )
    branch_name = meta["headRefName"]
    updated_at = meta["updatedAt"]
    title = _first_line(meta["title"])
    # NOTE(review): only the first line of the PR body is kept, which is what
    # this function has always produced - confirm whether the full body was
    # intended for the description.
    body = _first_line(meta["body"])

    diff = subprocess.check_output(["gh", "pr", "diff", pr_ref], text=True)

    files = []
    for file in PatchSet(diff):
        hunks = []
        for hunk in file:
            hunk_text = str(hunk)
            hunks.append(
                {
                    "id": f"{branch_name}:{file.path}:{hunk.target_start}",
                    "name": summarize_hunk(hunk_text),
                    "diff": hunk_text,
                    "kind": "hunk",
                    "modifiedAt": updated_at,
                    "filePath": file.path,
                }
            )
        files.append(
            {
                "id": f"{branch_name}:{file.path}",
                "path": file.path,
                "kind": "file",
                "hunks": hunks,
            }
        )

    return {
        "id": f"{branch_name}:{pr_ref}",
        "name": branch_name,
        "active": True,
        "kind": "branch",
        "files": files,
        "description": title + "\n" + body,
    }
# Pull requests to export. Feel free to paste some specific PR numbers here,
# or use `prs = get_last_n_pr_nums(4)` to take the most recently merged ones.
prs = ["425", "429", "420", "414", "409", "407"]

branches = list(map(process_pr, prs))

# The output path is relative to the current working directory.
with open("scripts/branch_testdata.json", "w") as json_file:
    json_file.write(json.dumps(branches, indent=4))