Update data scripts for downloading stuff without OOMing

2024-09-11 15:05:44 +03:00 · 2021-07-07 18:57:07 +00:00 · 2021-07-07 18:57:07 +00:00 · c7d3719bf4
commit c7d3719bf4
parent 9615befa43
2 changed files with 108 additions and 257 deletions
--- a/nbs/data_processing.ipynb
+++ b/nbs/data_processing.ipynb
@ -2,12 +2,13 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gdown\n",
    "\n",
+    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from pathlib import Path"
@ -15,25 +16,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
     "output_type": "stream",
+     "name": "stdout",
     "text": [
      "File exists: ../data/repo_infos.csv\n"
     ]
    },
    {
+     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "'../data/repo_infos.csv'"
      ]
     },
-     "execution_count": 4,
     "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 5
    }
   ],
   "source": [
@ -48,18 +49,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 20,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/nathan/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3441: DtypeWarning: Columns (3) have mixed types.Specify dtype option on import or set low_memory=False.\n",
-      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "our_repos = pd.read_csv(data_path/\"repo_infos.csv\", parse_dates=True)\n",
    "eleuther_repos = pd.read_csv(\n",
@ -69,301 +61,149 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
+     "output_type": "execute_result",
     "data": {
      "text/plain": [
-       "670996"
+       "array([ 28043.1 ,  70708.05, 338060.77])"
      ]
     },
-     "execution_count": 38,
     "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 21
+    }
+   ],
+   "source": [
+    "np.percentile(our_repos[\"size\"].values, [90, 95, 99])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "(544340, 517122)"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 22
+    }
+   ],
+   "source": [
+    "len(our_repos), len(our_repos[our_repos[\"size\"] < 70708])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "our_filtered_repos = our_repos[our_repos[\"size\"] < 70708]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "648023"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 24
    }
   ],
   "source": [
    "# Combine our repos and EleutherAI's repos\n",
-    "def combine_repos(our_repos, eleuther_repos):\n",
+    "def combine_repos(ours, eleuthers):\n",
    "    # Combine our repos\n",
    "    combined = pd.concat(\n",
-    "        [our_repos, eleuther_repos],\n",
+    "        [ours, eleuthers],\n",
    "    )\n",
    "\n",
    "    # Remove duplicate repos\n",
    "    dedup_combined = combined[~combined[\"name\"].duplicated(keep=\"last\")]\n",
    "    return dedup_combined\n",
    "\n",
-    "combined = combine_repos(our_repos, eleuther_repos)\n",
+    "combined = combine_repos(our_filtered_repos, eleuther_repos)\n",
    "len(combined)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
-    "combined.to_csv(data_path/\"combined_repos_unfiltered.csv\", index=False)"
+    "combined.to_csv(data_path/\"combined_repos_size_filtered.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
-     "name": "stderr",
     "output_type": "stream",
+     "name": "stderr",
     "text": [
-      "/home/nathan/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3441: DtypeWarning: Columns (1,3,4,6,7,12,13,14,15,16,17,18,19,20,21,22,23,24,26) have mixed types.Specify dtype option on import or set low_memory=False.\n",
-      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
+      "/home/nathan/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3441: DtypeWarning: Columns (1,3,4,6,7,12,13,14,15,16,17,18,19,20,21,22,23,24,26) have mixed types.Specify dtype option on import or set low_memory=False.\n  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
-    "combined = pd.read_csv(data_path/\"combined_repos_unfiltered.csv\")"
+    "combined = pd.read_csv(data_path/\"combined_repos_size_filtered.csv\")"
   ]
  },
+  {
+   "source": [
+    "Shard the dataset into manageable pieces since EleutherAI's downloader has a memory leak."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
-    "import numpy as np\n",
+    "N_SHARDS = 24\n",
    "\n",
-    "df1, df2, df3 = np.array_split(combined, 3)\n",
-    "assert len(df1) + len(df2) + len(df3) == len(combined)"
+    "shards = np.array_split(combined, N_SHARDS)\n",
+    "lens = list(map(len, shards))\n",
+    "assert sum(lens) == len(combined)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(223666, 223665, 223665)"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(df1), len(df2), len(df3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
-    "df1.to_csv(data_path/\"combined_repos_shard_1.csv\", index=False)\n",
-    "df2.to_csv(data_path/\"combined_repos_shard_2.csv\", index=False)\n",
-    "df3.to_csv(data_path/\"combined_repos_shard_3.csv\", index=False)"
+    "# Write each shard to its own file under data_path/shards for the download script to iterate over.\n",
+    "shard_dir = data_path/\"shards\"\n",
+    "shard_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing directories\n",
+    "for idx, shard in enumerate(shards):\n",
+    "    shard.to_csv(shard_dir/f\"combined_repos_shard_{idx}\", index=False)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
+     "output_type": "execute_result",
     "data": {
-      "text/plain": [
-       "670996"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(combined)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>name</th>\n",
-       "      <th>fork project</th>\n",
-       "      <th>commits</th>\n",
-       "      <th>branches</th>\n",
-       "      <th>default branch</th>\n",
-       "      <th>releases</th>\n",
-       "      <th>contributors</th>\n",
-       "      <th>license</th>\n",
-       "      <th>watchers</th>\n",
-       "      <th>stargazers</th>\n",
-       "      <th>...</th>\n",
-       "      <th>total issues</th>\n",
-       "      <th>open issues</th>\n",
-       "      <th>total pull requests</th>\n",
-       "      <th>open pull requests</th>\n",
-       "      <th>last commit</th>\n",
-       "      <th>last commit SHA</th>\n",
-       "      <th>has wiki</th>\n",
-       "      <th>is archived</th>\n",
-       "      <th>languages</th>\n",
-       "      <th>labels</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0-1-0/lightblue-0.4</td>\n",
-       "      <td>False</td>\n",
-       "      <td>8.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>master</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>GNU General Public License v3.0</td>\n",
-       "      <td>14.0</td>\n",
-       "      <td>86</td>\n",
-       "      <td>...</td>\n",
-       "      <td>9</td>\n",
-       "      <td>8</td>\n",
-       "      <td>5</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2020-10-18 21:26:07.0</td>\n",
-       "      <td>9a4f7b37e923b262d2a29894676ff8ed8cde6237</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0-14n/ndroid</td>\n",
-       "      <td>False</td>\n",
-       "      <td>131.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>master</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Other</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>50</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2015-03-17 13:10:07.0</td>\n",
-       "      <td>4e5dbe69855a7fda8b74e61d9db5aa61e6ba9ee8</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>C,C++,Objective-C,Shell,Assembly,Haxe,Groff,Py...</td>\n",
-       "      <td>bug,duplicate,enhancement,help wanted,invalid,...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0-sec/zero-crack</td>\n",
-       "      <td>False</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>main</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>GNU General Public License v3.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>62</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2021-05-12 02:03:08.0</td>\n",
-       "      <td>70ee16550a81b396333565515723d5abab87c719</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Python</td>\n",
-       "      <td>bug,documentation,duplicate,enhancement,good f...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0-tikaro/minimum-viable-startpage</td>\n",
-       "      <td>False</td>\n",
-       "      <td>15.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>master</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>?</td>\n",
-       "      <td>MIT License</td>\n",
-       "      <td>4.0</td>\n",
-       "      <td>56</td>\n",
-       "      <td>...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2019-04-21 09:11:12.0</td>\n",
-       "      <td>a4fb4aea4474d635c4e4738f7d8c1a485d5d74c8</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>JavaScript,CSS,HTML</td>\n",
-       "      <td>bug,duplicate,enhancement,good first issue,hel...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>0-u-0/dugon-media-server</td>\n",
-       "      <td>False</td>\n",
-       "      <td>52.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>master</td>\n",
-       "      <td>5.0</td>\n",
-       "      <td>?</td>\n",
-       "      <td>MIT License</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>14</td>\n",
-       "      <td>...</td>\n",
-       "      <td>5</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2020-05-16 04:11:45.0</td>\n",
-       "      <td>1d6bb1c589e51d2c34b11be20d34dae4bb0c7779</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>JavaScript,Dockerfile</td>\n",
-       "      <td>bug,documentation,duplicate,enhancement,featur...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5 rows × 27 columns</p>\n",
-       "</div>"
-      ],
      "text/plain": [
       "                                name fork project  commits branches  \\\n",
       "0                0-1-0/lightblue-0.4        False      8.0        1   \n",
@ -415,11 +255,11 @@
       "5  bug,documentation,duplicate,enhancement,featur...  \n",
       "\n",
       "[5 rows x 27 columns]"
-      ]
+      ],
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>name</th>\n      <th>fork project</th>\n      <th>commits</th>\n      <th>branches</th>\n      <th>default branch</th>\n      <th>releases</th>\n      <th>contributors</th>\n      <th>license</th>\n      <th>watchers</th>\n      <th>stargazers</th>\n      <th>...</th>\n      <th>total issues</th>\n      <th>open issues</th>\n      <th>total pull requests</th>\n      <th>open pull requests</th>\n      <th>last commit</th>\n      <th>last commit SHA</th>\n      <th>has wiki</th>\n      <th>is archived</th>\n      <th>languages</th>\n      <th>labels</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0-1-0/lightblue-0.4</td>\n      <td>False</td>\n      <td>8.0</td>\n      <td>1</td>\n      <td>master</td>\n      <td>0.0</td>\n      <td>4</td>\n      <td>GNU General Public License v3.0</td>\n      <td>14.0</td>\n      <td>86</td>\n      <td>...</td>\n      <td>9</td>\n      <td>8</td>\n      <td>5</td>\n      <td>0</td>\n      <td>2020-10-18 21:26:07.0</td>\n      <td>9a4f7b37e923b262d2a29894676ff8ed8cde6237</td>\n      <td>True</td>\n      <td>False</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0-14n/ndroid</td>\n      <td>False</td>\n      <td>131.0</td>\n      <td>1</td>\n      <td>master</td>\n      <td>0.0</td>\n      <td>2</td>\n      <td>Other</td>\n      <td>5.0</td>\n      <td>50</td>\n      <td>...</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>2015-03-17 13:10:07.0</td>\n      <td>4e5dbe69855a7fda8b74e61d9db5aa61e6ba9ee8</td>\n      <td>True</td>\n      
<td>False</td>\n      <td>C,C++,Objective-C,Shell,Assembly,Haxe,Groff,Py...</td>\n      <td>bug,duplicate,enhancement,help wanted,invalid,...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0-sec/zero-crack</td>\n      <td>False</td>\n      <td>4.0</td>\n      <td>1</td>\n      <td>main</td>\n      <td>1.0</td>\n      <td>1</td>\n      <td>GNU General Public License v3.0</td>\n      <td>0.0</td>\n      <td>62</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>2021-05-12 02:03:08.0</td>\n      <td>70ee16550a81b396333565515723d5abab87c719</td>\n      <td>True</td>\n      <td>False</td>\n      <td>Python</td>\n      <td>bug,documentation,duplicate,enhancement,good f...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0-tikaro/minimum-viable-startpage</td>\n      <td>False</td>\n      <td>15.0</td>\n      <td>1</td>\n      <td>master</td>\n      <td>0.0</td>\n      <td>?</td>\n      <td>MIT License</td>\n      <td>4.0</td>\n      <td>56</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>2</td>\n      <td>1</td>\n      <td>2019-04-21 09:11:12.0</td>\n      <td>a4fb4aea4474d635c4e4738f7d8c1a485d5d74c8</td>\n      <td>True</td>\n      <td>False</td>\n      <td>JavaScript,CSS,HTML</td>\n      <td>bug,duplicate,enhancement,good first issue,hel...</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>0-u-0/dugon-media-server</td>\n      <td>False</td>\n      <td>52.0</td>\n      <td>1</td>\n      <td>master</td>\n      <td>5.0</td>\n      <td>?</td>\n      <td>MIT License</td>\n      <td>2.0</td>\n      <td>14</td>\n      <td>...</td>\n      <td>5</td>\n      <td>1</td>\n      <td>0</td>\n      <td>0</td>\n      <td>2020-05-16 04:11:45.0</td>\n      <td>1d6bb1c589e51d2c34b11be20d34dae4bb0c7779</td>\n      <td>True</td>\n      <td>False</td>\n      <td>JavaScript,Dockerfile</td>\n      <td>bug,documentation,duplicate,enhancement,featur...</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 
rows × 27 columns</p>\n</div>"
     },
-     "execution_count": 33,
     "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 45
    }
   ],
   "source": [
@ -629,8 +469,8 @@
   "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
  },
  "kernelspec": {
-   "display_name": "Python 3.8.10 64-bit",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.8.10 64-bit"
  },
  "language_info": {
   "codemirror_mode": {
--- a/scripts/download_data.sh
+++ b/scripts/download_data.sh
@ -1,8 +1,19 @@
 #! /bin/bash
-COMBINED_REPOS=$1
+REPO_SHARDS=$1
 OUT_DIR=$2
-python convert_to_gh_downloader_format.py $COMBINED_REPOS $OUT_DIR
+ID=0  # incremented before use, so output directories are numbered from 1
+for shard in "${REPO_SHARDS}"/*; do  # one convert + download pass per shard file
+    echo "Processing data from ${shard}"
+    python convert_to_gh_downloader_format.py "$shard" "$OUT_DIR"
+    ID=$((ID + 1))
+    # Run the downloader from OUT_DIR in a subshell so the directory change stays local.
+    (cd "$OUT_DIR" && python download_repo_text.py)
+    # Rename from the original cwd; works for relative OUT_DIR, with or without a trailing slash.
+    mv "${OUT_DIR%/}/github_data" "${OUT_DIR%/}/github_data_${ID}"
+    echo "Finished processing data from ${shard}"
+done
+# python convert_to_gh_downloader_format.py $COMBINED_REPOS $OUT_DIR

-cd $OUT_DIR
-python download_repo_text.py
-cd -
+# cd $OUT_DIR
+# python download_repo_text.py
+# cd -