mirror of
https://github.com/facebook/sapling.git
synced 2024-10-16 19:57:18 +03:00
64d45ccdb6
Summary: Reformat all opted-in python code with version `18.9b0` of Black. Reviewed By: ambv Differential Revision: D10126605 fbshipit-source-id: 82af0d645dd411ce8ae6b8d239e151b3730cd789
544 lines
18 KiB
Python
544 lines
18 KiB
Python
# synthrepo.py - repo synthesis
|
|
#
|
|
# Copyright 2012 Facebook
|
|
#
|
|
# This software may be used and distributed according to the terms of the
|
|
# GNU General Public License version 2 or any later version.
|
|
|
|
"""synthesize structurally interesting change history
|
|
|
|
This extension is useful for creating a repository with properties
|
|
that are statistically similar to an existing repository. During
|
|
analysis, a simple probability table is constructed from the history
|
|
of an existing repository. During synthesis, these properties are
|
|
reconstructed.
|
|
|
|
Properties that are analyzed and synthesized include the following:
|
|
|
|
- Lines added or removed when an existing file is modified
|
|
- Number and sizes of files added
|
|
- Number of files removed
|
|
- Line lengths
|
|
- Topological distance to parent changeset(s)
|
|
- Probability of a commit being a merge
|
|
- Probability of a newly added file being added to a new directory
|
|
- Interarrival time, and time zone, of commits
|
|
- Number of files in each directory
|
|
|
|
A few obvious properties that are not currently handled realistically:
|
|
|
|
- Merges are treated as regular commits with two parents, which is not
|
|
realistic
|
|
- Modifications are not treated as operations on hunks of lines, but
|
|
as insertions and deletions of randomly chosen single lines
|
|
- Committer ID (always random)
|
|
- Executability of files
|
|
- Symlinks and binary files are ignored
|
|
"""
|
|
|
|
from __future__ import absolute_import

import bisect
import collections
import itertools
import json
import os
import random
import sys
import time

from mercurial import context, error, hg, patch, registrar, scmutil, util
from mercurial.i18n import _
from mercurial.node import hex, nullid, nullrev, short
|
|
|
|
|
|
# Note for extension authors: ONLY specify testedwith = 'ships-with-hg-core' for
# extensions which SHIP WITH MERCURIAL. Non-mainline extensions should
# be specifying the version(s) of Mercurial they are tested with, or
# leave the attribute unspecified.
testedwith = "ships-with-hg-core"

# Command table populated by the @command decorator applications below.
cmdtable = {}
command = registrar.command(cmdtable)

# First six characters of git-diff header lines that introduce a new file
# (as opposed to a modification or deletion); consulted by parsegitdiff().
newfile = {"new fi", "rename", "copy f", "copy t"}

# Python 3 removed xrange; alias it so loops below run on both major versions.
if sys.version_info.major >= 3:
    xrange = range
|
|
|
|
|
|
def zerodict():
    """Return a dict-like counter whose missing keys default to 0."""
    # defaultdict(int) is the idiomatic (and picklable) spelling of
    # defaultdict(lambda: 0): int() == 0.
    return collections.defaultdict(int)
|
|
|
|
|
|
def roundto(x, k):
    """Round ``x`` to the nearest multiple of ``k`` once it exceeds ``2*k``.

    Values at or below ``2*k`` are only rounded to the nearest integer,
    preserving full resolution near zero where small differences matter.
    """
    if x <= k * 2:
        return int(round(x))
    return int(round(x / float(k)) * k)
|
|
|
|
|
|
def parsegitdiff(lines):
    """Parse a git-style diff into per-file change statistics.

    Yields one ``(filename, mar, lineadd, lineremove, binary)`` tuple per
    file section, where ``mar`` is "m"/"a"/"r" (modified/added/removed),
    ``lineadd`` maps rounded added-line lengths to counts, ``lineremove``
    is the count of removed lines, and ``binary`` flags GIT binary patches.
    """
    fname = None
    mar = None
    added, removed = zerodict(), 0
    isbinary = False
    for ln in lines:
        prefix = ln[:6]
        if prefix == "diff -":
            # A new file section begins: flush the stats accumulated for
            # the previous file, then reset for this one.
            if fname:
                yield fname, mar, added, removed, isbinary
            mar, added, removed, isbinary = "m", zerodict(), 0, False
            fname = patch.gitre.match(ln).group(1)
        elif prefix in newfile:
            mar = "a"
        elif prefix == "GIT bi":
            isbinary = True
        elif prefix == "delete":
            mar = "r"
        elif prefix:
            marker = prefix[0]
            if marker == "-" and not ln.startswith("--- "):
                removed += 1
            elif marker == "+" and not ln.startswith("+++ "):
                # Bucket added lines by length, rounded to multiples of 5.
                added[roundto(len(ln) - 1, 5)] += 1
    # Flush the final file section, if any diff header was ever seen.
    if fname:
        yield fname, mar, added, removed, isbinary
|
|
|
|
|
|
@command(
    "analyze",
    [
        ("o", "output", "", _("write output to given file"), _("FILE")),
        ("r", "rev", [], _("analyze specified revisions"), _("REV")),
    ],
    _("hg analyze"),
    optionalrepo=True,
)
def analyze(ui, repo, *revs, **opts):
    """create a simple model of a repository to use for later synthesis

    This command examines every changeset in the given range (or all
    of history if none are specified) and creates a simple statistical
    model of the history of the repository. It also measures the directory
    structure of the repository as checked out.

    The model is written out to a JSON file, and can be used by
    :hg:`synthesize` to create or augment a repository with synthetic
    commits that have a structure that is statistically similar to the
    analyzed repository.
    """
    root = repo.root
    if not root.endswith(os.path.sep):
        root += os.path.sep

    revs = list(revs)
    revs.extend(opts["rev"])
    if not revs:
        revs = [":"]

    output = opts["output"]
    if not output:
        output = os.path.basename(root) + ".json"

    if output == "-":
        fp = sys.stdout
    else:
        fp = open(output, "w")

    # Always obtain file counts of each directory in the given root directory.
    def onerror(e):
        ui.warn(_("error walking directory structure: %s\n") % e)

    dirs = {}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        dirpathfromroot = dirpath[rootprefixlen:]
        dirs[dirpathfromroot] = len(filenames)
        # Don't descend into the .hg metadata directory.
        if ".hg" in dirnames:
            dirnames.remove(".hg")

    # Frequency tables for each modeled property: key -> occurrence count.
    lineschanged = zerodict()
    children = zerodict()
    p1distance = zerodict()
    p2distance = zerodict()
    linesinfilesadded = zerodict()
    fileschanged = zerodict()
    filesadded = zerodict()
    filesremoved = zerodict()
    linelengths = zerodict()
    interarrival = zerodict()
    parents = zerodict()
    dirsadded = zerodict()
    tzoffset = zerodict()

    # If a mercurial repo is available, also model the commit history.
    if repo:
        revs = scmutil.revrange(repo, revs)
        revs.sort()

        progress = ui.progress
        _analyzing = _("analyzing")
        _changesets = _("changesets")
        _total = len(revs)

        for i, rev in enumerate(revs):
            progress(_analyzing, i, unit=_changesets, total=_total)
            ctx = repo[rev]
            pl = ctx.parents()
            pctx = pl[0]
            prev = pctx.rev()
            children[prev] += 1
            p1distance[rev - prev] += 1
            parents[len(pl)] += 1
            tzoffset[ctx.date()[1]] += 1
            if len(pl) > 1:
                p2distance[rev - pl[1].rev()] += 1
            if prev == rev - 1:
                lastctx = pctx
            else:
                lastctx = repo[rev - 1]
            if lastctx.rev() != nullrev:
                # Model commit interarrival times in 5-minute buckets.
                timedelta = ctx.date()[0] - lastctx.date()[0]
                interarrival[roundto(timedelta, 300)] += 1
            diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
                if isbin:
                    continue
                # .values()/.items() work on both Python 2 and 3; the
                # original .itervalues()/.iteritems() are Python 2 only and
                # broke this command under the Python 3 shim above.
                added = sum(lineadd.values(), 0)
                if mar == "m":
                    if added and lineremove:
                        lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
                    filechanges += 1
                elif mar == "a":
                    fileadds += 1
                    if "/" in filename:
                        filedir = filename.rsplit("/", 1)[0]
                        if filedir not in pctx.dirs():
                            diradds += 1
                    linesinfilesadded[roundto(added, 5)] += 1
                elif mar == "r":
                    fileremoves += 1
                for length, count in lineadd.items():
                    linelengths[length] += count
            fileschanged[filechanges] += 1
            filesadded[fileadds] += 1
            dirsadded[diradds] += 1
            filesremoved[fileremoves] += 1

    # Invert the children table: how many revisions have N children?
    invchildren = zerodict()

    for rev, count in children.items():
        invchildren[count] += 1

    if output != "-":
        ui.status(_("writing output to %s\n") % output)

    def pronk(d):
        # Serialize a frequency table as (key, count) pairs, most frequent
        # first, for the JSON model file.
        return sorted(d.items(), key=lambda x: x[1], reverse=True)

    json.dump(
        {
            "revs": len(revs),
            "initdirs": pronk(dirs),
            "lineschanged": pronk(lineschanged),
            "children": pronk(invchildren),
            "fileschanged": pronk(fileschanged),
            "filesadded": pronk(filesadded),
            "linesinfilesadded": pronk(linesinfilesadded),
            "dirsadded": pronk(dirsadded),
            "filesremoved": pronk(filesremoved),
            "linelengths": pronk(linelengths),
            "parents": pronk(parents),
            "p1distance": pronk(p1distance),
            "p2distance": pronk(p2distance),
            "interarrival": pronk(interarrival),
            "tzoffset": pronk(tzoffset),
        },
        fp,
    )
    # Only close the file if we opened it; closing sys.stdout would break
    # any subsequent output from the calling process.
    if fp is not sys.stdout:
        fp.close()
|
|
|
|
|
|
@command(
    "synthesize",
    [
        ("c", "count", 0, _("create given number of commits"), _("COUNT")),
        ("", "dict", "", _("path to a dictionary of words"), _("FILE")),
        ("", "initfiles", 0, _("initial file count to create"), _("COUNT")),
    ],
    _("hg synthesize [OPTION].. DESCFILE"),
)
def synthesize(ui, repo, descpath, **opts):
    """synthesize commits based on a model of an existing repository

    The model must have been generated by :hg:`analyze`. Commits will
    be generated randomly according to the probabilities described in
    the model. If --initfiles is set, the repository will be seeded with
    the given number files following the modeled repository's directory
    structure.

    When synthesizing new content, commit descriptions, and user
    names, words will be chosen randomly from a dictionary that is
    presumed to contain one word per line. Use --dict to specify the
    path to an alternate dictionary to use.
    """
    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # Format the exception itself: the original ``err[0].strerror``
        # relied on Python 2 EnvironmentError indexing and raises
        # TypeError on Python 3.
        raise error.Abort("%s: %s" % (descpath, err))
    desc = json.load(fp)
    fp.close()

    def cdf(l):
        # Convert a list of (value, count) pairs into a (values,
        # cumulative-probabilities) pair suitable for pick() below.
        if not l:
            return [], []
        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
        t = float(sum(probs, 0))
        s, cdfs = 0, []
        for v in probs:
            s += v
            cdfs.append(s / t)
        return vals, cdfs

    lineschanged = cdf(desc["lineschanged"])
    fileschanged = cdf(desc["fileschanged"])
    filesadded = cdf(desc["filesadded"])
    dirsadded = cdf(desc["dirsadded"])
    filesremoved = cdf(desc["filesremoved"])
    linelengths = cdf(desc["linelengths"])
    parents = cdf(desc["parents"])
    p1distance = cdf(desc["p1distance"])
    p2distance = cdf(desc["p2distance"])
    interarrival = cdf(desc["interarrival"])
    linesinfilesadded = cdf(desc["linesinfilesadded"])
    tzoffset = cdf(desc["tzoffset"])

    dictfile = opts.get("dict") or "/usr/share/dict/words"
    try:
        # Plain "r" (universal newlines by default on Python 3); the "rU"
        # mode is deprecated and removed in Python 3.11. splitlines()
        # below handles any newline convention regardless.
        fp = open(dictfile, "r")
    except IOError as err:
        raise error.Abort("%s: %s" % (dictfile, err.strerror))
    words = fp.read().splitlines()
    fp.close()

    initdirs = {}
    if desc["initdirs"]:
        for k, v in desc["initdirs"]:
            # NOTE(review): bytes.replace() with str arguments fails on
            # Python 3; this line looks Python-2-only — confirm against
            # mercurial's bytes/str conventions before changing.
            initdirs[k.encode("utf-8").replace(".hg", "_hg")] = v
        initdirs = renamedirs(initdirs, words)
    initdirscdf = cdf(initdirs)

    def pick(cdf):
        # Sample a value from a (values, cumulative-probabilities) pair.
        return cdf[0][bisect.bisect_left(cdf[1], random.random())]

    def pickpath():
        # Random file path following the modeled directory distribution.
        return os.path.join(pick(initdirscdf), random.choice(words))

    def makeline(minimum=0):
        # Build a random line of at least ``minimum`` characters from
        # dictionary words.
        total = max(minimum, pick(linelengths))
        c, l = 0, []
        while c < total:
            w = random.choice(words)
            c += len(w) + 1
            l.append(w)
        return " ".join(l)

    wlock = repo.wlock()
    lock = repo.lock()
    # Release the locks even if synthesis fails part-way; the original
    # leaked both locks on any exception below.
    try:
        nevertouch = {".hgsub", ".hgignore", ".hgtags"}

        progress = ui.progress
        _synthesizing = _("synthesizing")
        _files = _("initial files")
        _changesets = _("changesets")

        # Synthesize a single initial revision adding files to the repo
        # according to the modeled directory structure.
        initcount = int(opts["initfiles"])
        if initcount and initdirs:
            pctx = repo[None].parents()[0]
            dirs = set(pctx.dirs())
            files = {}

            def validpath(path):
                # Don't pick filenames which are already directory names.
                if path in dirs:
                    return False
                # Don't pick directories which were used as file names.
                while path:
                    if path in files:
                        return False
                    path = os.path.dirname(path)
                return True

            for i in xrange(0, initcount):
                ui.progress(_synthesizing, i, unit=_files, total=initcount)

                path = pickpath()
                while not validpath(path):
                    path = pickpath()
                data = "%s contents\n" % path
                files[path] = data
                dir = os.path.dirname(path)
                while dir and dir not in dirs:
                    dirs.add(dir)
                    dir = os.path.dirname(dir)

            def filectxfn(repo, memctx, path):
                return context.memfilectx(repo, memctx, path, files[path])

            ui.progress(_synthesizing, None)
            message = "synthesized wide repo with %d files" % (len(files),)
            mc = context.memctx(
                repo,
                [pctx.node(), nullid],
                message,
                # .keys() works on both Python 2 and 3; .iterkeys() is
                # Python 2 only.
                files.keys(),
                filectxfn,
                ui.username(),
                "%d %d" % util.makedate(),
            )
            initnode = mc.commit()
            if ui.debugflag:
                hexfn = hex
            else:
                hexfn = short
            ui.status(
                _("added commit %s with %d files\n") % (hexfn(initnode), len(files))
            )

        # Synthesize incremental revisions to the repository, adding repo
        # depth.
        count = int(opts["count"])
        heads = set(map(repo.changelog.rev, repo.heads()))
        for i in xrange(count):
            progress(_synthesizing, i, unit=_changesets, total=count)

            node = repo.changelog.node
            revs = len(repo)

            def pickhead(heads, distance):
                # Choose an existing head roughly ``distance`` revisions
                # back, falling back to the newest head.
                if heads:
                    lheads = sorted(heads)
                    rev = revs - min(pick(distance), revs)
                    if rev < lheads[-1]:
                        rev = lheads[bisect.bisect_left(lheads, rev)]
                    else:
                        rev = lheads[-1]
                    return rev, node(rev)
                return nullrev, nullid

            r1 = revs - min(pick(p1distance), revs)
            p1 = node(r1)

            # the number of heads will grow without bound if we use a pure
            # model, so artificially constrain their proliferation
            toomanyheads = len(heads) > random.randint(1, 20)
            if p2distance[0] and (pick(parents) == 2 or toomanyheads):
                r2, p2 = pickhead(heads.difference([r1]), p2distance)
            else:
                r2, p2 = nullrev, nullid

            pl = [p1, p2]
            pctx = repo[r1]
            mf = pctx.manifest()
            mfk = mf.keys()
            changes = {}
            if mfk:
                # Modify a modeled number of existing files.
                for __ in xrange(pick(fileschanged)):
                    for __ in xrange(10):
                        fctx = pctx.filectx(random.choice(mfk))
                        path = fctx.path()
                        if not (
                            path in nevertouch
                            or fctx.isbinary()
                            or "l" in fctx.flags()
                        ):
                            break
                    lines = fctx.data().splitlines()
                    add, remove = pick(lineschanged)
                    for __ in xrange(remove):
                        if not lines:
                            break
                        del lines[random.randrange(0, len(lines))]
                    for __ in xrange(add):
                        lines.insert(random.randint(0, len(lines)), makeline())
                    path = fctx.path()
                    changes[path] = "\n".join(lines) + "\n"
                for __ in xrange(pick(filesremoved)):
                    path = random.choice(mfk)
                    for __ in xrange(10):
                        path = random.choice(mfk)
                        if path not in changes:
                            break
            if filesadded:
                dirs = list(pctx.dirs())
                dirs.insert(0, "")
            # Add a modeled number of new files, sometimes in new
            # directories.
            for __ in xrange(pick(filesadded)):
                pathstr = ""
                while pathstr in dirs:
                    path = [random.choice(dirs)]
                    if pick(dirsadded):
                        path.append(random.choice(words))
                    path.append(random.choice(words))
                    pathstr = "/".join(filter(None, path))
                data = (
                    "\n".join(makeline() for __ in xrange(pick(linesinfilesadded)))
                    + "\n"
                )
                changes[pathstr] = data

            def filectxfn(repo, memctx, path):
                if path not in changes:
                    return None
                return context.memfilectx(repo, memctx, path, changes[path])

            if not changes:
                continue
            if revs:
                date = repo["tip"].date()[0] + pick(interarrival)
            else:
                date = time.time() - (86400 * count)
            # dates in mercurial must be positive, fit in 32-bit signed
            # integers.
            date = min(0x7FFFFFFF, max(0, date))
            user = random.choice(words) + "@" + random.choice(words)
            mc = context.memctx(
                repo,
                pl,
                makeline(minimum=2),
                sorted(changes),
                filectxfn,
                user,
                "%d %d" % (date, pick(tzoffset)),
            )
            newnode = mc.commit()
            heads.add(repo.changelog.rev(newnode))
            heads.discard(r1)
            heads.discard(r2)
    finally:
        lock.release()
        wlock.release()
|
|
|
|
|
|
def renamedirs(dirs, words):
    """Randomly rename the directory names in the per-dir file count dict.

    Returns a list of ``[renamed_path, count]`` pairs, one per entry of
    ``dirs``, with every path component consistently replaced by a word
    drawn cyclically from ``words``.
    """
    wordgen = itertools.cycle(words)
    # Maps an original path (or path prefix) to its renamed form; the
    # empty root maps to itself so the recursion terminates.
    replacements = {"": ""}

    def rename(dirpath):
        """Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming regardless of iteration order through the model.
        """
        if dirpath in replacements:
            return replacements[dirpath]
        # Local renamed from ``_`` to avoid shadowing the gettext import.
        head, _tail = os.path.split(dirpath)
        if head:
            head = rename(head)
        else:
            head = ""
        renamed = os.path.join(head, next(wordgen))
        replacements[dirpath] = renamed
        return renamed

    result = []
    # .items() works on both Python 2 and 3; the original .iteritems() is
    # Python 2 only and broke this helper under Python 3.
    for dirpath, count in dirs.items():
        result.append([rename(dirpath.lstrip(os.sep)), count])
    return result
|