mirror of
https://github.com/CodedotAl/gpt-code-clippy.git
synced 2024-10-26 09:17:45 +03:00
added streaming duplicate remover (probably v. slow)
This commit is contained in:
parent
07aacc88da
commit
8c7fb0ed99
39
deduplication/deduplication_streaming.py
Normal file
39
deduplication/deduplication_streaming.py
Normal file
@ -0,0 +1,39 @@
|
||||
import argparse
|
||||
import datasets
|
||||
import lm_dataformat
|
||||
import re
|
||||
import tqdm
|
||||
|
||||
# Command-line interface: where the corpus lives, where the deduplicated
# archive is written, and how many kept examples go into each archive chunk.
parser = argparse.ArgumentParser(description="Deduplicate a list of files")
parser.add_argument(
    "--data_dir",
    type=str,
    required=True,
    help="directory forwarded as data_dir to the dataset loading script",
)
parser.add_argument(
    "--output_dir",
    type=str,
    required=True,
    help="directory the deduplicated archive is written to",
)
parser.add_argument(
    "--archive_commit_freq",
    type=int,
    default=10_000,
    help="commit the archive after this many kept examples (default: %(default)s)",
)
args = parser.parse_args()
|
||||
|
||||
# Load the "train" split through the loading script "script.py" (a relative
# path — presumably resolved against the working directory; confirm).
# streaming=True is requested so examples are iterated rather than the whole
# split being loaded up front.
dataset = datasets.load_dataset(
    "script.py",
    data_dir=args.data_dir,
    split="train",
    streaming=True,
)
|
||||
|
||||
|
||||
def get_variables(example) -> str:
    """Return the deduplication key for *example*.

    The key is the example's ``"text"`` with every run of non-word
    characters (anything matched by ``\\W+``: whitespace, punctuation,
    operators, ...) collapsed to a single space. Two examples that differ
    only in such characters therefore map to the same key.

    Args:
        example: Mapping with a ``"text"`` entry holding the source code.

    Returns:
        The word tokens of the text joined by single spaces. Text that
        starts or ends with a non-word character yields a leading or
        trailing space, because ``re.split`` emits an empty string there.
    """
    tokens = re.split(r"\W+", example["text"])
    return " ".join(tokens)
|
||||
|
||||
|
||||
# Keys (see get_variables) of every example kept so far. This holds one
# normalized copy of each distinct text, so memory grows with the number of
# unique examples in the corpus.
uniques = set()
ar = lm_dataformat.Archive(args.output_dir)
# Number of unique examples written to the archive.
i = 0

# NOTE(review): the nesting below assumes the archive write, the counter and
# the periodic commit all happen only for examples that were not seen before.
for example in tqdm.tqdm(dataset):
    variables = get_variables(example)
    if variables in uniques:
        # Same word-token sequence as an earlier example: drop it.
        continue
    uniques.add(variables)

    # Store the code as the document body and every remaining field as
    # metadata; "text" is removed so it is not duplicated inside meta.
    code = example.pop("text")
    ar.add_data(code, meta=example)

    i += 1
    if i % args.archive_commit_freq == 0:
        ar.commit()

# Commit whatever was added since the last periodic commit.
ar.commit()

print(i)
|
Loading…
Reference in New Issue
Block a user