From 5b33e4174f188a312f06af5d77a4a6f3916a7328 Mon Sep 17 00:00:00 2001 From: Anishere Mariam Date: Tue, 22 Oct 2024 11:55:53 +0100 Subject: [PATCH] render jupyter notebook on documentation --- .../wikipedia/gen_autosuggestions.rst | 5 + .../scribe_data/wikipedia/notebook.ipynb | 316 ++++++++++++++++++ 2 files changed, 321 insertions(+) create mode 100644 docs/source/scribe_data/wikipedia/notebook.ipynb diff --git a/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst b/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst index 5f4c90b00..3307c3d62 100644 --- a/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst +++ b/docs/source/scribe_data/wikipedia/gen_autosuggestions.rst @@ -6,6 +6,11 @@ gen_autosuggestions.ipynb Scribe Autosuggest Generation ----------------------------- +.. toctree:: + + notebook.ipynb + ../../../../src/scribe_data/wikipedia/gen_autosuggestions.ipynb + This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps. Use the :code:`View code on GitHub` link above to view the notebook and explore the process! diff --git a/docs/source/scribe_data/wikipedia/notebook.ipynb b/docs/source/scribe_data/wikipedia/notebook.ipynb new file mode 100644 index 000000000..0e77ed7d8 --- /dev/null +++ b/docs/source/scribe_data/wikipedia/notebook.ipynb @@ -0,0 +1,316 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "743abe55", + "metadata": { + "toc": true + }, + "source": [ + "

Table of Contents

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "592e4b36", + "metadata": {}, + "source": [ + "**Scribe Autosuggest Generation**\n", + "\n", + "This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec5ff38", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c8c7a44", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:52:39.142528Z", + "start_time": "2023-04-10T19:52:39.087499Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "from tqdm.auto import tqdm\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14a5bf58", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:52:39.147871Z", + "start_time": "2023-04-10T19:52:39.144127Z" + } + }, + "outputs": [], + "source": [ + "pwd = os.path.dirname(os.path.realpath(\"gen_autosuggestions.ipynb\"))\n", + "pwd = pwd.split(\"scribe_data\")[0]\n", + "sys.path.append(pwd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c7939bd", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:52:52.508933Z", + "start_time": "2023-04-10T19:52:52.506137Z" + } + }, + "outputs": [], + "source": [ + "from scribe_data.wikipedia.extract_wiki import (\n", + " download_wiki,\n", + " parse_to_ndjson,\n", + ")\n", + "from scribe_data.wikipedia.process_wiki import (\n", + " clean,\n", + " gen_autosuggestions,\n", + ")\n", + "from scribe_data.utils import get_language_iso" + ] + }, + { + "cell_type": "markdown", + "id": "2add942e", + "metadata": {}, + "source": [ + "# Download and Parse Wiki" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a722df43", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:53:16.467643Z", + "start_time": "2023-04-10T19:53:16.464619Z" + } + }, + "outputs": [], + "source": [ + "# Languages: French, German, Italian, Portuguese, Russian, Spanish, Swedish\n", + "language = \"French\"\n", + "language_abbr = get_language_iso(language)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11546a55", + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-10T19:53:28.138818Z", + "start_time": "2023-04-10T19:53:17.184354Z" + } + }, + "outputs": [], + "source": [ + "files = download_wiki(\n", + " language=language,\n", + " target_dir=f\"./{language_abbr}wiki_dump\",\n", + " file_limit=None, # None is all files\n", + " dump_id=\"20220920\"\n", + ")\n", + "print(f\"Number of files: {len(files)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b40fd9d9", + "metadata": { + "ExecuteTime": { + "end_time": "2022-10-03T12:25:23.192390Z", + "start_time": "2022-10-03T12:25:23.189124Z" + } + }, + "outputs": [], + "source": [ + "parse_to_ndjson(\n", + " output_path=f\"./{language_abbr}wiki.ndjson\",\n", + " input_dir=f\"./{language_abbr}wiki_dump\",\n", + " partitions_dir=f\"./{language_abbr}wiki_partitions\",\n", + " article_limit=None, # None is all articles\n", + " delete_parsed_files=True,\n", + " multicore=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3c3f2f51", + "metadata": {}, + "source": [ + "# Process and Clean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "205a01b4", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:27.126Z" + } + }, + "outputs": [], + "source": [ + "with open(f\"./{language_abbr}wiki.ndjson\", \"r\") as fin:\n", + " article_texts = [\n", + " json.loads(lang)[1] for lang in tqdm(fin, desc=\"Articles added\", unit=\"articles\")\n", + " ]\n", + "\n", + "print(f\"Number of articles: {len(article_texts)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1b869f4", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:34.201Z" + } + }, + "outputs": [], + "source": [ + "# Define sample size for up to 1 million articles.\n", + "sample_size = 1000000 / len(article_texts)\n", + "sample_size = min(sample_size, 1)\n", + "sample_size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea9ea16c", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:40.574Z" + } + }, + "outputs": [], + "source": [ + "text_corpus = clean(\n", + " texts=article_texts,\n", + " language=language,\n", + " remove_words=None,\n", + " sample_size=sample_size,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "593e855d", + "metadata": {}, + "source": [ + "# Generate and Upload" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cda9e874", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:54.735Z" + } + }, + "outputs": [], + "source": [ + "autosuggest_dict = gen_autosuggestions(\n", + " text_corpus,\n", + " language=language,\n", + " num_words=1000,\n", + " ignore_words=None,\n", + " update_local_data=True,\n", + " verbose=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8c385b7", + "metadata": { + "ExecuteTime": { + "start_time": "2022-10-03T12:25:55.451Z" + } + }, + "outputs": [], + "source": [ + "# autosuggest_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af62c758", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:scribe-data-dev] *", + "language": "python", + "name": "conda-env-scribe-data-dev-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": true, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}