Skip to content

Commit

Permalink
render jupyter notebook on documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
AnishereMariam committed Oct 22, 2024
1 parent 8613d59 commit 5b33e41
Show file tree
Hide file tree
Showing 2 changed files with 321 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/source/scribe_data/wikipedia/gen_autosuggestions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ gen_autosuggestions.ipynb
Scribe Autosuggest Generation
-----------------------------

.. toctree::

notebook.ipynb
../../../../src/scribe_data/wikipedia/gen_autosuggestions.ipynb

This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps.

Use the :code:`View code on GitHub` link above to view the notebook and explore the process!
316 changes: 316 additions & 0 deletions docs/source/scribe_data/wikipedia/notebook.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "743abe55",
"metadata": {
"toc": true
},
"source": [
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Download-and-Parse-Wiki\" data-toc-modified-id=\"Download-and-Parse-Wiki-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Download and Parse Wiki</a></span></li><li><span><a href=\"#Process-and-Clean\" data-toc-modified-id=\"Process-and-Clean-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Process and Clean</a></span></li><li><span><a href=\"#Generate-and-Upload\" data-toc-modified-id=\"Generate-and-Upload-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Generate and Upload</a></span></li></ul></div>"
]
},
{
"cell_type": "markdown",
"id": "592e4b36",
"metadata": {},
"source": [
"**Scribe Autosuggest Generation**\n",
"\n",
"This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bec5ff38",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c8c7a44",
"metadata": {
"ExecuteTime": {
"end_time": "2023-04-10T19:52:39.142528Z",
"start_time": "2023-04-10T19:52:39.087499Z"
}
},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import json\n",
"\n",
"from tqdm.auto import tqdm\n",
"from IPython.core.display import display, HTML\n",
"display(HTML(\"<style>.container { width:99% !important; }</style>\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14a5bf58",
"metadata": {
"ExecuteTime": {
"end_time": "2023-04-10T19:52:39.147871Z",
"start_time": "2023-04-10T19:52:39.144127Z"
}
},
"outputs": [],
"source": [
"pwd = os.path.dirname(os.path.realpath(\"gen_autosuggestions.ipynb\"))\n",
"pwd = pwd.split(\"scribe_data\")[0]\n",
"sys.path.append(pwd)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c7939bd",
"metadata": {
"ExecuteTime": {
"end_time": "2023-04-10T19:52:52.508933Z",
"start_time": "2023-04-10T19:52:52.506137Z"
}
},
"outputs": [],
"source": [
"from scribe_data.wikipedia.extract_wiki import (\n",
" download_wiki,\n",
" parse_to_ndjson,\n",
")\n",
"from scribe_data.wikipedia.process_wiki import (\n",
" clean,\n",
" gen_autosuggestions,\n",
")\n",
"from scribe_data.utils import get_language_iso"
]
},
{
"cell_type": "markdown",
"id": "2add942e",
"metadata": {},
"source": [
"# Download and Parse Wiki"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a722df43",
"metadata": {
"ExecuteTime": {
"end_time": "2023-04-10T19:53:16.467643Z",
"start_time": "2023-04-10T19:53:16.464619Z"
}
},
"outputs": [],
"source": [
"# Languages: French, German, Italian, Portuguese, Russian, Spanish, Swedish\n",
"language = \"French\"\n",
"language_abbr = get_language_iso(language)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11546a55",
"metadata": {
"ExecuteTime": {
"end_time": "2023-04-10T19:53:28.138818Z",
"start_time": "2023-04-10T19:53:17.184354Z"
}
},
"outputs": [],
"source": [
"files = download_wiki(\n",
" language=language,\n",
" target_dir=f\"./{language_abbr}wiki_dump\",\n",
" file_limit=None, # None is all files\n",
" dump_id=\"20220920\"\n",
")\n",
"print(f\"Number of files: {len(files)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b40fd9d9",
"metadata": {
"ExecuteTime": {
"end_time": "2022-10-03T12:25:23.192390Z",
"start_time": "2022-10-03T12:25:23.189124Z"
}
},
"outputs": [],
"source": [
"parse_to_ndjson(\n",
" output_path=f\"./{language_abbr}wiki.ndjson\",\n",
" input_dir=f\"./{language_abbr}wiki_dump\",\n",
" partitions_dir=f\"./{language_abbr}wiki_partitions\",\n",
" article_limit=None, # None is all articles\n",
" delete_parsed_files=True,\n",
" multicore=True,\n",
" verbose=True,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "3c3f2f51",
"metadata": {},
"source": [
"# Process and Clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "205a01b4",
"metadata": {
"ExecuteTime": {
"start_time": "2022-10-03T12:25:27.126Z"
}
},
"outputs": [],
"source": [
"with open(f\"./{language_abbr}wiki.ndjson\", \"r\") as fin:\n",
" article_texts = [\n",
" json.loads(lang)[1] for lang in tqdm(fin, desc=\"Articles added\", unit=\"articles\")\n",
" ]\n",
"\n",
"print(f\"Number of articles: {len(article_texts)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1b869f4",
"metadata": {
"ExecuteTime": {
"start_time": "2022-10-03T12:25:34.201Z"
}
},
"outputs": [],
"source": [
"# Define sample size for up to 1 million articles.\n",
"sample_size = 1000000 / len(article_texts)\n",
"sample_size = min(sample_size, 1)\n",
"sample_size"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea9ea16c",
"metadata": {
"ExecuteTime": {
"start_time": "2022-10-03T12:25:40.574Z"
}
},
"outputs": [],
"source": [
"text_corpus = clean(\n",
" texts=article_texts,\n",
" language=language,\n",
" remove_words=None,\n",
" sample_size=sample_size,\n",
" verbose=True,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "593e855d",
"metadata": {},
"source": [
"# Generate and Upload"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cda9e874",
"metadata": {
"ExecuteTime": {
"start_time": "2022-10-03T12:25:54.735Z"
}
},
"outputs": [],
"source": [
"autosuggest_dict = gen_autosuggestions(\n",
" text_corpus,\n",
" language=language,\n",
" num_words=1000,\n",
" ignore_words=None,\n",
" update_local_data=True,\n",
" verbose=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8c385b7",
"metadata": {
"ExecuteTime": {
"start_time": "2022-10-03T12:25:55.451Z"
}
},
"outputs": [],
"source": [
"# autosuggest_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af62c758",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:scribe-data-dev] *",
"language": "python",
"name": "conda-env-scribe-data-dev-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 5b33e41

Please sign in to comment.