From e5058af709e911e768962f698b6d616a74563577 Mon Sep 17 00:00:00 2001
From: ncoop57
Date: Sun, 25 Jul 2021 16:02:17 -0400
Subject: [PATCH] Add filter dataset version

---
 data_processing/code_clippy_filter.py | 202 ++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 data_processing/code_clippy_filter.py

diff --git a/data_processing/code_clippy_filter.py b/data_processing/code_clippy_filter.py
new file mode 100644
index 0000000..aeb9ae5
--- /dev/null
+++ b/data_processing/code_clippy_filter.py
@@ -0,0 +1,202 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the CodeClippy team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CodeClippy dataset - open-source code from GitHub. Scraped July 7, 2021.
+More to add here.
+"""
+
+import io
+import json
+import os
+from pathlib import Path
+from typing import List
+
+import jsonlines
+import zstandard as zstd
+
+import datasets
+
+
+# TODO: Add BibTeX citation
+# Find, for instance, the citation on arXiv or on the dataset repo/website
+_CITATION = """\
+@InProceedings{huggingface:dataset,
+title = {Code Clippy},
+author={CodeClippy team and all the open-source devs around the globe},
+year={2021}
+}
+"""
+
+_DESCRIPTION = """
+CodeClippy dataset - open-source code from GitHub. Scraped July 7, 2021.
+"""
+
+# TODO: Add a link to an official homepage for the dataset here
+_HOMEPAGE = ""
+
+# TODO: Add the licence for the dataset here if you can find it
+_LICENSE = ""
+
+# TODO: Add links to the official dataset URLs here (once we have those).
+# The HuggingFace datasets library doesn't host the datasets; it only points to the original files.
+# This can be an arbitrarily nested dict/list of URLs (see the `_split_generators` method below).
+_URLs = {
+    "https://huggingface.co/great-new-dataset-first_domain.zip",
+}
+
+
+class CodeClippy(datasets.GeneratorBasedBuilder):
+    """CodeClippy dataset - open-source code from GitHub. Scraped July 7, 2021."""
+
+    VERSION = datasets.Version("0.1.0")
+
+    # This is an example of a dataset with multiple configurations.
+    # If you don't want/need to define several sub-sets in your dataset,
+    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.
+
+    # If you need to make complex sub-parts in the dataset with configurable options,
+    # you can create your own builder configuration class to store attributes, inheriting from datasets.BuilderConfig.
+    # BUILDER_CONFIG_CLASS = MyBuilderConfig
+
+    # You will be able to load one or the other configuration in the following list with
+    # data = datasets.load_dataset('my_dataset', 'first_domain')
+    # data = datasets.load_dataset('my_dataset', 'second_domain')
+    # BUILDER_CONFIGS = [
+    #     datasets.BuilderConfig(name="first_domain", version=VERSION, description="This part of my dataset covers a first domain"),
+    #     datasets.BuilderConfig(name="second_domain", version=VERSION, description="This part of my dataset covers a second domain"),
+    # ]
+
+    # DEFAULT_CONFIG_NAME = "first_domain"
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "id": datasets.Value("int64"),
+                "text": datasets.Value("string"),
+                "repo_name": datasets.Value("string"),
+                "stars": datasets.Value("string"),
+                "repo_language": datasets.Value("string"),
+                "file_name": datasets.Value("string"),
+                "mime_type": datasets.Value("string"),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs.
+        # It can accept any type or nested list/dict and will give back the same structure with the URLs replaced by paths to local files.
+        # By default, archives are extracted and a path to the cached folder where they were extracted is returned instead of the archive.
+
+        # data_dir = dl_manager.download_and_extract(_URLs)
+        # filepath = dl_manager.download("https://gist.githubusercontent.com/ppisarczyk/43962d06686722d26d176fad46879d41/raw/211547723b4621a622fc56978d74aa416cbd1729/Programming_Languages_Extensions.json")
+        # with open(filepath, "r") as f:
+        #     data = json.load(f)
+
+        # lang_exts = []
+        # for i in data:
+        #     if "extensions" not in i:
+        #         continue
+        #     lang_exts.extend(i["extensions"])
+        # self.lang_exts = set(lang_exts)
+
+        # Allow-list of file extensions used to filter the dataset down to source code.
+        self.lang_exts = {
+            ".lisp",
+            ".lsp",
+            ".f",
+            ".fs",
+            ".sh",
+            ".groovy",
+            ".r",
+            ".pl",
+            ".html",
+            ".css",
+            ".sql",
+            ".py",
+            ".c",
+            ".cpp",
+            ".h",
+            ".hpp",
+            ".jl",
+            ".java",
+            ".js",
+            ".ts",
+            ".cs",
+            ".go",
+            ".rs",
+            ".swift",
+            ".php",
+            ".dart",
+            ".kt",
+            ".m",
+            ".hs",
+            ".scala",
+            ".sc",
+            ".lua",
+            ".rb",
+        }
+        data_dir = self.config.data_dir
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepaths": sorted(
+                        str(fp) for fp in Path(f"{data_dir}/train").glob("*.jsonl.zst")
+                    )
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepaths": sorted(
+                        str(fp) for fp in Path(f"{data_dir}/test").glob("*.jsonl.zst")
+                    )
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepaths": sorted(
+                        str(fp)
+                        for fp in Path(f"{data_dir}/validation").glob("*.jsonl.zst")
+                    )
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepaths: List):
+        """Yields examples as (key, example) tuples."""
+        id_ = 0
+        dctx = zstd.ZstdDecompressor()
+        for filepath in filepaths:
+            with open(filepath, "rb") as f:
+                # Decompress the zstd stream and iterate over the JSON lines inside.
+                stream = io.TextIOWrapper(dctx.stream_reader(f), encoding="utf-8")
+                reader = jsonlines.Reader(stream)
+                for line in reader:
+                    filename = line["meta"]["file_name"]
+                    # Keep only files whose extension is in the allow-list above.
+                    start = filename.rfind(".")
+                    if start != -1 and filename[start:] in self.lang_exts:
+                        yield id_, {"id": id_, "text": line["text"], **line["meta"]}
+                        id_ += 1
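
For reference, a minimal usage sketch, assuming the filtered shards live under a local data_dir with train/, test/, and validation/ subfolders of *.jsonl.zst files (the layout _split_generators expects); the script path and data_dir below are placeholders:

    import datasets

    # Point load_dataset at the local loading script; data_dir must contain
    # the train/, test/, and validation/ folders of *.jsonl.zst shards.
    dataset = datasets.load_dataset(
        "data_processing/code_clippy_filter.py",
        data_dir="/path/to/code_clippy_data",
    )
    print(dataset["train"][0]["repo_name"])

The data_dir keyword is forwarded to the builder config, which is why _split_generators can read it from self.config.data_dir instead of going through dl_manager.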