backup commits to sapling-pr-archive branch

Summary:
Because `sl pr submit` relies on force-pushing to GitHub,
we run the risk of commits from previous versions of a PR
getting reaped, as GitHub periodically garbage collects
commits (and trees and blobs) that are no longer reachable
via a public branch.

To ensure we can always see previous versions of a PR in their
entirety, we merge the head of the development branch into a
special branch named `sapling-pr-archive-{username}`.
The idea is that this branch is not intended for development,
but to defend against GitHub's garbage collection process.

To support this, we add some new methods to `gh_submit.py`:

- `get_username()`
- `create_branch()`
- `merge_into_branch()`

These do what their names suggest via GitHub's GraphQL API.

Because `merge_into_branch()` has some "expected" error cases
(such as "branch does not exist"), we had to update the logic in
`make_request()` to ensure we can read the error back out of the
response to check whether it is one of these *expected* errors.

Over in `submit.py`, we introduce new functions to test the response
contents to see if it corresponds to an expected error:

- `is_already_merged_error()`
- `is_branch_does_not_exist_error()`

Finally, `add_pr_head_to_archives()` is where we put it all together
to merge the head of the development stack into the archive
branch, creating the branch first if necessary.

Reviewed By: quark-zju

Differential Revision: D39919020

fbshipit-source-id: 3822e7cb22d0b6f308a912fe98eec8106a87b5d9
This commit is contained in:
Michael Bolin 2022-10-03 23:23:09 -07:00 committed by Facebook GitHub Bot
parent 415868f503
commit 57e8b788df
2 changed files with 246 additions and 18 deletions

View File

@ -267,6 +267,80 @@ mutation ($pullRequestId: ID!, $title: String!, $body: String!) {
return Result(ok=result.ok["data"]["updatePullRequest"]["pullRequest"]["id"])
async def create_branch(*, repo_id: str, branch_name: str, oid: str) -> Result[str]:
    """Attempt to create a branch via GitHub's GraphQL API.

    repo_id is the GraphQL node ID of the repository, branch_name is the
    short branch name (no "refs/heads/" prefix), and oid is the commit the
    new branch should point at. On success, returns the ID of the newly
    created Ref.
    """
    mutation = """
mutation ($repositoryId: ID!, $name: String!, $oid: GitObjectID!) {
  createRef(input: {repositoryId: $repositoryId, name: $name, oid: $oid}) {
    ref {
      id
    }
  }
}
"""
    params: Dict[str, Union[str, int]] = {
        "query": mutation,
        "repositoryId": repo_id,
        # createRef requires the fully qualified ref name.
        "name": f"refs/heads/{branch_name}",
        "oid": oid,
    }
    result = await make_request(params)
    if result.is_error():
        return result
    return Result(ok=result.ok["data"]["createRef"]["ref"]["id"])
async def merge_into_branch(
    *, repo_id: str, oid_to_merge: str, branch_name: str
) -> Result[str]:
    """Merge the commit oid_to_merge into branch_name within repo_id.

    On success, returns the oid of the resulting merge commit.
    """
    mutation = """
mutation ($repositoryId: ID!, $base: String!, $head: String!) {
  mergeBranch(input: {repositoryId: $repositoryId, base: $base, head: $head}) {
    mergeCommit {
      oid
    }
  }
}
"""
    params: Dict[str, Union[str, int]] = {
        "query": mutation,
        "repositoryId": repo_id,
        # base is the branch receiving the merge; head is the commit merged in.
        "base": branch_name,
        "head": oid_to_merge,
    }
    result = await make_request(params)
    if result.is_error():
        return result
    return Result(ok=result.ok["data"]["mergeBranch"]["mergeCommit"]["oid"])
async def get_username() -> Result[str]:
    """Query GitHub for the username associated with the auth token.

    NOTE: it is slightly faster to parse the OAuth username out of the local
    hosts.yml (via graphql's try_parse_oauth_token helper) than to make this
    network round trip.
    """
    viewer_query = """
query {
  viewer {
    login
  }
}
"""
    params: Dict[str, Union[str, int]] = {
        "query": viewer_query,
    }
    result = await make_request(params)
    if result.is_error():
        return result
    return Result(ok=result.ok["data"]["viewer"]["login"])
async def make_request(
params: Dict[str, Union[str, int]], endpoint="graphql"
) -> Result:
@ -286,17 +360,20 @@ async def make_request(
)
stdout, stderr = await proc.communicate()
if proc.returncode != 0:
return Result(
error=f"Failure running {' '.join(args)}\nstdout: {stdout}\nstderr: {stderr}\n"
)
# If proc exits with a non-zero exit code, the stdout may still
# be valid JSON, but we expect it to have an "errors" property defined.
try:
response = json.loads(stdout)
except json.JSONDecodeError:
response = None
return check_stdout(stdout)
def check_stdout(stdout: bytes) -> Result:
response = json.loads(stdout)
if "errors" in response:
return Result(error=json.dumps(response, indent=1))
else:
if proc.returncode == 0:
assert response is not None
assert "errors" not in response
return Result(ok=response)
elif response is None:
return Result(
error=f"exit({proc.returncode}) Failure running {' '.join(args)}\nstdout: {stdout}\nstderr: {stderr}\n"
)
else:
return Result(error=json.dumps(response, indent=1))

View File

@ -100,7 +100,7 @@ async def update_commits_in_stack(ui, repo) -> int:
# These are set lazily because they require GraphQL calls.
next_pull_request_number = None
repository = None
repository: Optional[Repository] = None
for partition in partitions:
top = partition[0]
@ -157,13 +157,14 @@ async def update_commits_in_stack(ui, repo) -> int:
# create/update the pull request title and body, as appropriate.
pr_numbers = [none_throws(p[0].pr).number for p in partitions]
rewrite_requests = [
# Add the head of the stack to the sapling-pr-archive branch.
tip = hex(partitions[0][0].node)
rewrite_and_archive_requests = [
rewrite_pull_request_body(partition, index, pr_numbers, ui)
for index, partition in enumerate(partitions)
]
await asyncio.gather(*rewrite_requests)
# TODO: Add the head of the stack to the reviewstack-archive branch.
] + [add_pr_head_to_archives(origin=origin, repository=repository, tip=tip)]
await asyncio.gather(*rewrite_and_archive_requests)
return 0
@ -318,6 +319,156 @@ async def get_pull_request_details_or_throw(pr_id: PullRequestId) -> PullRequest
return none_throws(result.ok)
async def add_pr_head_to_archives(
    *, origin: str, repository: Optional[Repository], tip: str
):
    """Merge the commit `tip` into the user's archive branch.

    GitHub periodically garbage collects commits (and trees and blobs) that
    are no longer reachable from a public branch. Because force-pushing a PR
    can orphan its previous versions, we keep every stack tip reachable by
    merging it into a per-user `sapling-pr-archive-{username}` branch.

    tip is the hex version of the commit hash to be merged into the archive
    branch.
    """
    username = await get_username()
    if not username:
        raise error.Abort(_("could not determine GitHub username"))
    if not repository:
        repository = await get_repository_for_origin(origin)

    branch_name = f"sapling-pr-archive-{username}"
    # Attempt the merge directly instead of checking for the branch first:
    # doing it in one request avoids a TOCTOU race. The merge may fail in
    # two "expected" ways handled below.
    merge_result = await gh_submit.merge_into_branch(
        repo_id=repository.id, oid_to_merge=tip, branch_name=branch_name
    )
    if not merge_result.is_error():
        return

    import json

    # TODO: Store Result.error as Dict so we don't have to parse it again.
    err = none_throws(merge_result.error)
    try:
        response = json.loads(err)
    except json.JSONDecodeError:
        # The error string is not guaranteed to be valid JSON.
        response = None

    if response:
        if is_already_merged_error(response):
            # tip is already reachable from the archive branch: nothing to do!
            return
        if is_branch_does_not_exist_error(response):
            # Archive branch does not exist yet, so initialize it with the
            # current tip.
            create_result = await gh_submit.create_branch(
                repo_id=repository.id, branch_name=branch_name, oid=tip
            )
            if create_result.is_error():
                raise error.Abort(
                    _("unexpected error when trying to create branch %s with commit %s: %s")
                    % (branch_name, tip, create_result.error)
                )
            return
    raise error.Abort(
        _("unexpected error when trying to merge %s into %s: %s")
        % (tip, branch_name, err)
    )
def is_already_merged_error(response) -> bool:
    r"""Return True if the GraphQL error payload says the commit was already
    merged into the target branch (an expected, benign failure).

    >>> response = {
    ...     "data": {"mergeBranch": None},
    ...     "errors": [
    ...         {
    ...             "type": "UNPROCESSABLE",
    ...             "path": ["mergeBranch"],
    ...             "locations": [{"line": 2, "column": 3}],
    ...             "message": "Failed to merge: \"Already merged\"",
    ...         }
    ...     ],
    ... }
    >>> is_already_merged_error(response)
    True
    """
    errors = response.get("errors")
    if not isinstance(errors, list):
        return False
    # GitHub reports this case as an UNPROCESSABLE error whose message
    # contains "Already merged".
    return any(
        entry.get("type") == "UNPROCESSABLE"
        and isinstance(entry.get("message"), str)
        and "Already merged" in entry["message"]
        for entry in errors
    )
def is_branch_does_not_exist_error(response) -> bool:
    r"""Return True if the GraphQL error payload says the merge failed because
    the base branch does not exist (an expected failure: the archive branch
    must then be created).

    >>> response = {
    ...     "data": {
    ...         "mergeBranch": None
    ...     },
    ...     "errors": [
    ...         {
    ...             "type": "NOT_FOUND",
    ...             "path": [
    ...                 "mergeBranch"
    ...             ],
    ...             "locations": [
    ...                 {
    ...                     "line": 2,
    ...                     "column": 3
    ...                 }
    ...             ],
    ...             "message": "No such base."
    ...         }
    ...     ]
    ... }
    >>> is_branch_does_not_exist_error(response)
    True
    """
    errors = response.get("errors")
    if not errors or not isinstance(errors, list):
        return False
    for err in errors:
        # GitHub reports a missing base branch as a NOT_FOUND error whose
        # message contains "No such base."
        if err.get("type") != "NOT_FOUND":
            continue
        message = err.get("message")
        if isinstance(message, str) and "No such base." in message:
            return True
    return False
async def get_username() -> Optional[str]:
    """Return the username associated with the GitHub personal access token.

    Parsing the gh config locally is the fast path; if that fails for any
    reason, fall back to GraphQL, which is slower but more reliable. Returns
    None if the username cannot be determined at all.
    """
    from .graphql import try_parse_oauth_token_from_gh_config

    parsed = try_parse_oauth_token_from_gh_config()
    if parsed:
        # parsed is a (username, token) pair; we only need the username.
        return parsed[0]
    result = await gh_submit.get_username()
    if result.is_error():
        return None
    return none_throws(result.ok)
def run_git_command(args: List[str], gitdir: str):
full_args = ["git", "--git-dir", gitdir] + args
proc = subprocess.run(full_args, capture_output=True)