Update data scripts to download repos in shards without running out of memory (OOM)

This commit is contained in:
ncoop57 2021-07-07 18:57:07 +00:00
parent 9615befa43
commit c7d3719bf4
2 changed files with 108 additions and 257 deletions

View File

@ -2,12 +2,13 @@
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import gdown\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from pathlib import Path"
@ -15,25 +16,25 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"File exists: ../data/repo_infos.csv\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'../data/repo_infos.csv'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
"execution_count": 5
}
],
"source": [
@ -48,18 +49,9 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nathan/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3441: DtypeWarning: Columns (3) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
}
],
"outputs": [],
"source": [
"our_repos = pd.read_csv(data_path/\"repo_infos.csv\", parse_dates=True)\n",
"eleuther_repos = pd.read_csv(\n",
@ -69,301 +61,149 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"670996"
"array([ 28043.1 , 70708.05, 338060.77])"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
"execution_count": 21
}
],
"source": [
"np.percentile(our_repos[\"size\"].values, [90, 95, 99])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(544340, 517122)"
]
},
"metadata": {},
"execution_count": 22
}
],
"source": [
"len(our_repos), len(our_repos[our_repos[\"size\"] < 70708])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"our_filtered_repos = our_repos[our_repos[\"size\"] < 70708]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"648023"
]
},
"metadata": {},
"execution_count": 24
}
],
"source": [
"# Combine our repos and EleutherAI's repos\n",
"def combine_repos(our_repos, eleuther_repos):\n",
"def combine_repos(ours, eleuthers):\n",
" # Combine our repos\n",
" combined = pd.concat(\n",
" [our_repos, eleuther_repos],\n",
" [ours, eleuthers],\n",
" )\n",
"\n",
" # Remove duplicate repos\n",
" dedup_combined = combined[~combined[\"name\"].duplicated(keep=\"last\")]\n",
" return dedup_combined\n",
"\n",
"combined = combine_repos(our_repos, eleuther_repos)\n",
"combined = combine_repos(our_filtered_repos, eleuther_repos)\n",
"len(combined)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"combined.to_csv(data_path/\"combined_repos_unfiltered.csv\", index=False)"
"combined.to_csv(data_path/\"combined_repos_size_filtered.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"name": "stderr",
"text": [
"/home/nathan/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3441: DtypeWarning: Columns (1,3,4,6,7,12,13,14,15,16,17,18,19,20,21,22,23,24,26) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
"/home/nathan/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3441: DtypeWarning: Columns (1,3,4,6,7,12,13,14,15,16,17,18,19,20,21,22,23,24,26) have mixed types.Specify dtype option on import or set low_memory=False.\n exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
}
],
"source": [
"combined = pd.read_csv(data_path/\"combined_repos_unfiltered.csv\")"
"combined = pd.read_csv(data_path/\"combined_repos_size_filtered.csv\")"
]
},
{
"source": [
"Shard the dataset into manageable pieces since EleutherAI's downloader has a memory leak."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"N_SHARDS = 24\n",
"\n",
"df1, df2, df3 = np.array_split(combined, 3)\n",
"assert len(df1) + len(df2) + len(df3) == len(combined)"
"shards = np.array_split(combined, N_SHARDS)\n",
"lens = list(map(len, shards))\n",
"assert sum(lens) == len(combined)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(223666, 223665, 223665)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df1), len(df2), len(df3)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"df1.to_csv(data_path/\"combined_repos_shard_1.csv\", index=False)\n",
"df2.to_csv(data_path/\"combined_repos_shard_2.csv\", index=False)\n",
"df3.to_csv(data_path/\"combined_repos_shard_3.csv\", index=False)"
"for idx, shard in enumerate(shards):\n",
" shard.to_csv(data_path/f\"shards/combined_repos_shard_{idx}\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 45,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"670996"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(combined)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>fork project</th>\n",
" <th>commits</th>\n",
" <th>branches</th>\n",
" <th>default branch</th>\n",
" <th>releases</th>\n",
" <th>contributors</th>\n",
" <th>license</th>\n",
" <th>watchers</th>\n",
" <th>stargazers</th>\n",
" <th>...</th>\n",
" <th>total issues</th>\n",
" <th>open issues</th>\n",
" <th>total pull requests</th>\n",
" <th>open pull requests</th>\n",
" <th>last commit</th>\n",
" <th>last commit SHA</th>\n",
" <th>has wiki</th>\n",
" <th>is archived</th>\n",
" <th>languages</th>\n",
" <th>labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0-1-0/lightblue-0.4</td>\n",
" <td>False</td>\n",
" <td>8.0</td>\n",
" <td>1</td>\n",
" <td>master</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>GNU General Public License v3.0</td>\n",
" <td>14.0</td>\n",
" <td>86</td>\n",
" <td>...</td>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>2020-10-18 21:26:07.0</td>\n",
" <td>9a4f7b37e923b262d2a29894676ff8ed8cde6237</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0-14n/ndroid</td>\n",
" <td>False</td>\n",
" <td>131.0</td>\n",
" <td>1</td>\n",
" <td>master</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>Other</td>\n",
" <td>5.0</td>\n",
" <td>50</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2015-03-17 13:10:07.0</td>\n",
" <td>4e5dbe69855a7fda8b74e61d9db5aa61e6ba9ee8</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>C,C++,Objective-C,Shell,Assembly,Haxe,Groff,Py...</td>\n",
" <td>bug,duplicate,enhancement,help wanted,invalid,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0-sec/zero-crack</td>\n",
" <td>False</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>main</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>GNU General Public License v3.0</td>\n",
" <td>0.0</td>\n",
" <td>62</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2021-05-12 02:03:08.0</td>\n",
" <td>70ee16550a81b396333565515723d5abab87c719</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Python</td>\n",
" <td>bug,documentation,duplicate,enhancement,good f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0-tikaro/minimum-viable-startpage</td>\n",
" <td>False</td>\n",
" <td>15.0</td>\n",
" <td>1</td>\n",
" <td>master</td>\n",
" <td>0.0</td>\n",
" <td>?</td>\n",
" <td>MIT License</td>\n",
" <td>4.0</td>\n",
" <td>56</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2019-04-21 09:11:12.0</td>\n",
" <td>a4fb4aea4474d635c4e4738f7d8c1a485d5d74c8</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>JavaScript,CSS,HTML</td>\n",
" <td>bug,duplicate,enhancement,good first issue,hel...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0-u-0/dugon-media-server</td>\n",
" <td>False</td>\n",
" <td>52.0</td>\n",
" <td>1</td>\n",
" <td>master</td>\n",
" <td>5.0</td>\n",
" <td>?</td>\n",
" <td>MIT License</td>\n",
" <td>2.0</td>\n",
" <td>14</td>\n",
" <td>...</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2020-05-16 04:11:45.0</td>\n",
" <td>1d6bb1c589e51d2c34b11be20d34dae4bb0c7779</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>JavaScript,Dockerfile</td>\n",
" <td>bug,documentation,duplicate,enhancement,featur...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" name fork project commits branches \\\n",
"0 0-1-0/lightblue-0.4 False 8.0 1 \n",
@ -415,11 +255,11 @@
"5 bug,documentation,duplicate,enhancement,featur... \n",
"\n",
"[5 rows x 27 columns]"
]
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>fork project</th>\n <th>commits</th>\n <th>branches</th>\n <th>default branch</th>\n <th>releases</th>\n <th>contributors</th>\n <th>license</th>\n <th>watchers</th>\n <th>stargazers</th>\n <th>...</th>\n <th>total issues</th>\n <th>open issues</th>\n <th>total pull requests</th>\n <th>open pull requests</th>\n <th>last commit</th>\n <th>last commit SHA</th>\n <th>has wiki</th>\n <th>is archived</th>\n <th>languages</th>\n <th>labels</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0-1-0/lightblue-0.4</td>\n <td>False</td>\n <td>8.0</td>\n <td>1</td>\n <td>master</td>\n <td>0.0</td>\n <td>4</td>\n <td>GNU General Public License v3.0</td>\n <td>14.0</td>\n <td>86</td>\n <td>...</td>\n <td>9</td>\n <td>8</td>\n <td>5</td>\n <td>0</td>\n <td>2020-10-18 21:26:07.0</td>\n <td>9a4f7b37e923b262d2a29894676ff8ed8cde6237</td>\n <td>True</td>\n <td>False</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0-14n/ndroid</td>\n <td>False</td>\n <td>131.0</td>\n <td>1</td>\n <td>master</td>\n <td>0.0</td>\n <td>2</td>\n <td>Other</td>\n <td>5.0</td>\n <td>50</td>\n <td>...</td>\n <td>1</td>\n <td>1</td>\n <td>2</td>\n <td>1</td>\n <td>2015-03-17 13:10:07.0</td>\n <td>4e5dbe69855a7fda8b74e61d9db5aa61e6ba9ee8</td>\n <td>True</td>\n <td>False</td>\n <td>C,C++,Objective-C,Shell,Assembly,Haxe,Groff,Py...</td>\n <td>bug,duplicate,enhancement,help wanted,invalid,...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0-sec/zero-crack</td>\n <td>False</td>\n <td>4.0</td>\n <td>1</td>\n <td>main</td>\n <td>1.0</td>\n <td>1</td>\n <td>GNU General Public License v3.0</td>\n <td>0.0</td>\n <td>62</td>\n <td>...</td>\n <td>0</td>\n 
<td>0</td>\n <td>0</td>\n <td>0</td>\n <td>2021-05-12 02:03:08.0</td>\n <td>70ee16550a81b396333565515723d5abab87c719</td>\n <td>True</td>\n <td>False</td>\n <td>Python</td>\n <td>bug,documentation,duplicate,enhancement,good f...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0-tikaro/minimum-viable-startpage</td>\n <td>False</td>\n <td>15.0</td>\n <td>1</td>\n <td>master</td>\n <td>0.0</td>\n <td>?</td>\n <td>MIT License</td>\n <td>4.0</td>\n <td>56</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>2</td>\n <td>1</td>\n <td>2019-04-21 09:11:12.0</td>\n <td>a4fb4aea4474d635c4e4738f7d8c1a485d5d74c8</td>\n <td>True</td>\n <td>False</td>\n <td>JavaScript,CSS,HTML</td>\n <td>bug,duplicate,enhancement,good first issue,hel...</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0-u-0/dugon-media-server</td>\n <td>False</td>\n <td>52.0</td>\n <td>1</td>\n <td>master</td>\n <td>5.0</td>\n <td>?</td>\n <td>MIT License</td>\n <td>2.0</td>\n <td>14</td>\n <td>...</td>\n <td>5</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>2020-05-16 04:11:45.0</td>\n <td>1d6bb1c589e51d2c34b11be20d34dae4bb0c7779</td>\n <td>True</td>\n <td>False</td>\n <td>JavaScript,Dockerfile</td>\n <td>bug,documentation,duplicate,enhancement,featur...</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 27 columns</p>\n</div>"
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
"execution_count": 45
}
],
"source": [
@ -629,8 +469,8 @@
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"name": "python3"
"name": "python3",
"display_name": "Python 3.8.10 64-bit"
},
"language_info": {
"codemirror_mode": {

View File

@ -1,8 +1,19 @@
#! /bin/bash
# Convert and download repository data one shard at a time.
#
# Usage: <this script> REPO_SHARDS OUT_DIR
#   REPO_SHARDS  directory; every entry in it is passed, one per iteration,
#                to convert_to_gh_downloader_format.py
#   OUT_DIR      directory passed to the converter and used as the working
#                directory for download_repo_text.py
#
# After each shard's download, OUT_DIR/github_data is renamed to
# OUT_DIR/github_data_<ID>, where ID starts at 1 and increases by one per shard.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 REPO_SHARDS OUT_DIR" >&2
    exit 1
fi

REPO_SHARDS=$1
OUT_DIR=$2

ID=0
for shard in "${REPO_SHARDS}"/*; do
    # An unmatched glob is left as the literal pattern; skip it rather than
    # handing a non-existent path to the converter.
    [ -e "$shard" ] || continue

    echo "Processing data from ${shard}"
    python convert_to_gh_downloader_format.py "$shard" "$OUT_DIR"
    ID=$((ID + 1))

    # Abort if the directory change fails so the download and rename below
    # never run in the wrong place.
    cd "$OUT_DIR" || exit 1
    python download_repo_text.py
    # We are already inside OUT_DIR, so rename relative to it. This works
    # whether or not OUT_DIR ends in a slash and whether it is relative or
    # absolute. The rename runs even if the download exited non-zero, as before.
    mv "github_data" "github_data_${ID}"
    cd - || exit 1

    echo "Finished processsing data from ${shard}"
done

# Previous single-pass flow, kept for reference (COMBINED_REPOS was the old
# name for $1):
# python convert_to_gh_downloader_format.py $COMBINED_REPOS $OUT_DIR
# cd $OUT_DIR
# python download_repo_text.py
# cd -