From c2a471c809ac5447f09eb97c95e0ba6337d915d9 Mon Sep 17 00:00:00 2001 From: Nestor Qin Date: Sun, 8 Dec 2024 21:37:43 -0500 Subject: [PATCH] [Docs] Add documentation page --- .github/workflows/build-doc.yaml | 39 ++++ docs/Makefile | 20 ++ docs/README.md | 30 +++ .../img/mlc-logo-with-text-landscape.svg | 87 ++++++++ docs/conf.py | 102 +++++++++ docs/developer/add_models.rst | 6 + docs/developer/building_from_source.rst | 35 +++ docs/index.rst | 35 +++ docs/make.bat | 35 +++ docs/requirements.txt | 8 + docs/user/advanced_usage.rst | 133 ++++++++++++ docs/user/api_reference.rst | 202 ++++++++++++++++++ docs/user/basic_usage.rst | 120 +++++++++++ docs/user/get_started.rst | 75 +++++++ scripts/gh_deploy_site.sh | 4 + site/_includes/hero.html | 2 +- 16 files changed, 932 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build-doc.yaml create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/_static/img/mlc-logo-with-text-landscape.svg create mode 100644 docs/conf.py create mode 100644 docs/developer/add_models.rst create mode 100644 docs/developer/building_from_source.rst create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 docs/user/advanced_usage.rst create mode 100644 docs/user/api_reference.rst create mode 100644 docs/user/basic_usage.rst create mode 100644 docs/user/get_started.rst diff --git a/.github/workflows/build-doc.yaml b/.github/workflows/build-doc.yaml new file mode 100644 index 00000000..644df9cd --- /dev/null +++ b/.github/workflows/build-doc.yaml @@ -0,0 +1,39 @@ +name: Build Docs + +on: + push: + branches: + - main + +jobs: + test_linux: + name: Deploy Docs + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Configuring build Environment + run: | + sudo apt-get update + python -m pip install -U pip wheel + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.0' + + - name: Installing dependencies + run: | + python -m pip install -r docs/requirements.txt + gem install jekyll jekyll-remote-theme + + - name: Deploying on GitHub Pages + if: github.ref == 'refs/heads/main' + run: | + git remote set-url origin https://x-access-token:${{ secrets.MLC_GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY + git config --global user.email "mlc-gh-actions-bot@nomail" + git config --global user.name "mlc-gh-actions-bot" + ./scripts/gh_deploy_site.sh diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..3449de1e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= python -m sphinx +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..ea43a183 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,30 @@ +# MLC-LLM Documentation + +The documentation was built upon [Sphinx](https://www.sphinx-doc.org/en/master/). 
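+
+If you just want to build and preview the docs in one go, the individual steps described in the sections below can be chained together (a convenience sketch assuming a POSIX shell and Python 3; see below for details):
+
+```bash
+# install the Sphinx toolchain, build the HTML, and serve it locally
+pip3 install -r requirements.txt
+make html
+cd _build/html && python3 -m http.server
+```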
+ +## Dependencies + +Run the following command in this directory to install dependencies first: + +```bash +pip3 install -r requirements.txt +``` + +## Build the Documentation + +Then you can build the documentation by running: + +```bash +make html +``` + +## View the Documentation + +Run the following command to start a simple HTTP server: + +```bash +cd _build/html +python3 -m http.server +``` + +Then you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending ` -p PORT_NUMBER` in the python command above). diff --git a/docs/_static/img/mlc-logo-with-text-landscape.svg b/docs/_static/img/mlc-logo-with-text-landscape.svg new file mode 100644 index 00000000..e122d32f --- /dev/null +++ b/docs/_static/img/mlc-logo-with-text-landscape.svg @@ -0,0 +1,87 @@ + +image/svg+xml + + + + + + + + + + + + + + + + diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..85718e6b --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +import os +import sys + +import tlcpack_sphinx_addon + +# -- General configuration ------------------------------------------------ + +sys.path.insert(0, os.path.abspath("../python")) +sys.path.insert(0, os.path.abspath("../")) +autodoc_mock_imports = ["torch"] + +# General information about the project. +project = "web-llm" +author = "WebLLM Contributors" +copyright = "2023, %s" % author + +# Version information. + +version = "0.2.77" +release = "0.2.77" + +extensions = [ + "sphinx_tabs.tabs", + "sphinx_toolbox.collapse", + "sphinxcontrib.httpdomain", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx_reredirects", +] + +redirects = {"get_started/try_out": "../index.html#getting-started"} + +source_suffix = [".rst"] + +language = "en" + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# A list of ignored prefixes for module index sorting. +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme is set by the make target +import sphinx_rtd_theme + +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +templates_path = [] + +html_static_path = [] + +footer_copyright = "© 2023 MLC LLM" +footer_note = " " + +html_logo = "_static/img/mlc-logo-with-text-landscape.svg" + +html_theme_options = { + "logo_only": True, +} + +header_links = [ + ("Home", "https://webllm.mlc.ai/"), + ("Github", "https://github.com/mlc-ai/web-llm"), + ("Discord Server", "https://discord.gg/9Xpy2HGBuD"), +] + +header_dropdown = { + "name": "Other Resources", + "items": [ + ("WebLLM Chat", "https://chat.webllm.ai/"), + ("MLC Course", "https://mlc.ai/"), + ("MLC Blog", "https://blog.mlc.ai/"), + ("MLC LLM", "https://llm.mlc.ai/"), + ], +} + +html_context = { + "footer_copyright": footer_copyright, + "footer_note": footer_note, + "header_links": header_links, + "header_dropdown": header_dropdown, + "display_github": True, + "github_user": "mlc-ai", + "github_repo": "mlc-llm", + "github_version": "main/docs/", + "theme_vcs_pageview_mode": "edit", + # "header_logo": "/path/to/logo", + # "header_logo_link": "", + # "version_selecter": "", +} + + +# add additional overrides +templates_path += [tlcpack_sphinx_addon.get_templates_path()] +html_static_path += [tlcpack_sphinx_addon.get_static_path()] diff --git a/docs/developer/add_models.rst b/docs/developer/add_models.rst new file mode 100644 index 00000000..0a4803d8 --- /dev/null +++ b/docs/developer/add_models.rst @@ -0,0 +1,6 @@ +Adding Models +============= + +WebLLM allows you to compile custom language models using `MLC LLM `_ and then serve compiled model through WebLLM. + +For instructions of how to compile and add custom models to WebLLM, check the `MLC LLM documentation here `_. \ No newline at end of file diff --git a/docs/developer/building_from_source.rst b/docs/developer/building_from_source.rst new file mode 100644 index 00000000..e4508f90 --- /dev/null +++ b/docs/developer/building_from_source.rst @@ -0,0 +1,35 @@ +Building From Source +==================== + +Clone the Repository +--------------------- +.. code-block:: bash + + git clone https://github.com/mlc-ai/web-llm.git + cd web-llm + +Install Dependencies +--------------------- +.. code-block:: bash + + npm install + +Build the Project +----------------- +.. code-block:: bash + + npm run build + +Test Changes +------------ + +To test you changes, you can reuse any existing example or create a new example for your new functionality to test. + +Then, to test the effects of your code change in an example, inside ``examples//package.json``, change from ``"@mlc-ai/web-llm": "^0.2.xx"`` to ``"@mlc-ai/web-llm": ../...`` to let it reference you local code. + +.. code-block:: bash + + cd examples/ + # Modify the package.json + npm install + npm start diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..28b0ed70 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,35 @@ +👋 Welcome to WebLLM +==================== + +`GitHub `_ | `WebLLM Chat `_ | `NPM `_ | `Discord `_ + +WebLLM is a high-performance in-browser language model inference engine that brings large language models (LLMs) to web browsers with hardware acceleration. With WebGPU support, it allows developers to build AI-powered applications directly within the browser environment, removing the need for server-side processing and ensuring privacy. 
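+
+As a quick taste of the API, the sketch below loads a model and requests an OpenAI-style chat completion (a minimal, illustrative example; the model ID must match an entry in ``webllm.prebuiltAppConfig.model_list``, as covered in the guides below):
+
+.. code-block:: typescript
+
+    import { CreateMLCEngine } from "@mlc-ai/web-llm";
+
+    // Download (or load from cache) the model weights and initialize the engine.
+    const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct");
+
+    // Request a chat completion through the OpenAI-compatible API.
+    const reply = await engine.chat.completions.create({
+      messages: [{ role: "user", content: "What is WebLLM?" }],
+    });
+    console.log(reply.choices[0].message.content);
+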
+ +It provides a specialized runtime for the web backend of MLCEngine, leverages +`WebGPU `_ for local acceleration, offers OpenAI-compatible API, +and provides built-in support for web workers to separate heavy computation from the UI flow. + +Key Features +------------ +- 🌐 In-Browser Inference: Run LLMs directly in the browser +- 🚀 WebGPU Acceleration: Leverage hardware acceleration for optimal performance +- 🔄 OpenAI API Compatibility: Seamless integration with standard AI workflows +- 📦 Multiple Model Support: Works with Llama, Phi, Gemma, Mistral, and more + +Start exploring WebLLM by `chatting with WebLLM Chat `_, and start building webapps with high-performance local LLM inference with the following guides and tutorials. + +.. toctree:: + :maxdepth: 2 + :caption: User Guide + + user/get_started.rst + user/basic_usage.rst + user/advanced_usage.rst + user/api_reference.rst + +.. toctree:: + :maxdepth: 2 + :caption: Developer Guide + + developer/building_from_source.rst + developer/add_models.rst diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..954237b9 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..2658857d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +sphinx-tabs == 3.4.1 +sphinx-rtd-theme +sphinx == 5.2.3 +sphinx-toolbox == 3.4.0 +tlcpack-sphinx-addon==0.2.2 +sphinxcontrib_httpdomain==1.8.1 +sphinxcontrib-napoleon==0.7 +sphinx-reredirects==0.1.2 diff --git a/docs/user/advanced_usage.rst b/docs/user/advanced_usage.rst new file mode 100644 index 00000000..f5cb034c --- /dev/null +++ b/docs/user/advanced_usage.rst @@ -0,0 +1,133 @@ +Advanced Use Cases +================== + +Using Workers +------------- + +You can put the heavy computation in a worker script to optimize your application performance. To do so, you need to: + +Create a handler in the worker thread that communicates with the frontend while handling the requests. +Create a Worker Engine in your main application, which under the hood sends messages to the handler in the worker thread. +For detailed implementations of different kinds of Workers, check the following sections. + +Using Web Workers +^^^^^^^^^^^^^^^^^ +WebLLM comes with API support for `Web Workers `_ so you can offload the computation-heavy generation work into a separate worker thread. WebLLM has implemented the cross-thread communication through messages under the hood so you don't need to manually implement it any more. + +In the worker script, import and instantiate ``WebWorkerMLCEngineHandler``, which handles the communications with other scripts and processes incoming requests. + +.. 
code-block:: typescript + + // worker.ts + import { WebWorkerMLCEngineHandler } from "@mlc-ai/web-llm"; + + const handler = new WebWorkerMLCEngineHandler(); + self.onmessage = (msg: MessageEvent) => { + handler.onmessage(msg); + }; + +In the main script, import and instantiate a ``WebWorkerMLCEngine`` that implements the same ``MLCEngineInterface`` and exposes the same APIs, then simply use it as how you would use a normal ``MLCEngine`` in your application. + +.. code-block:: typescript + + import { CreateWebWorkerMLCEngine } from "@mlc-ai/web-llm"; + + async function runWorker() { + const engine = await CreateWebWorkerMLCEngine( + new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }), + "Llama-3.1-8B-Instruct" + ); + + const messages = [{ role: "user", content: "How does WebLLM use workers?" }]; + const reply = await engine.chat.completions.create({ messages }); + console.log(reply.choices[0].message.content); + } + + runWorker(); + + +Under the hood, ``WebWorkerMLCEngine`` does **not** actual doing any computation, but instead serves as a proxy to translate all calls into messages and send to the ``WebWorkerMLCEngineHandler`` to process. The worker thread will receive these messages and process the actual computation using a hidden engine, and return the result back to the main thread using messages. + +Service Workers +^^^^^^^^^^^^^^^ +WebLLM also support offloading the computation in `Service Workers `_ to avoid reloading the model between page refreshes and optimize your application's offline experience. + +(Note, Service Worker's life cycle is managed by the browser and can be killed any time without notifying the webapp. WebLLM's ``ServiceWorkerMLCEngine`` will try to keep the service worker thread alive by periodically sending heartbeat events, but the script could still be killed any time by Chrome and your application should include proper error handling. Check `keepAliveMs` and `missedHeatbeat` in `ServiceWorkerMLCEngine `_ for more details.) + +In the worker script, import and instantiate ``ServiceWorkerMLCEngineHandler``, which handles the communications with page scripts and processes incoming requests. + +.. code-block:: typescript + + // sw.ts + import { ServiceWorkerMLCEngineHandler } from "@mlc-ai/web-llm"; + + self.addEventListener("activate", () => { + const handler = new ServiceWorkerMLCEngineHandler(); + console.log("Service Worker activated!"); + }); + + +Then in the main page script, register the service worker and instantiate the engine using ``CreateServiceWorkerMLCEngine`` factory function. The Engine implements the same ``MLCEngineInterface`` and exposes the same APIs, then simply use it as how you would use a normal ``MLCEngine`` in your application. + +.. code-block:: typescript + + // main.ts + import { MLCEngineInterface, CreateServiceWorkerMLCEngine } from "@mlc-ai/web-llm"; + + if ("serviceWorker" in navigator) { + navigator.serviceWorker.register( + new URL("sw.ts", import.meta.url), // worker script + { type: "module" }, + ); + } + + const engine: MLCEngineInterface = + await CreateServiceWorkerMLCEngine( + selectedModel, + { initProgressCallback }, // engineConfig + ); + +Similar to the ``WebWorkerMLCEngine`` above, the ``ServiceWorkerMLCEngine`` is also a proxy and does not do any actual computation. Instead it sends all calls to the service worker thread to handle and receives the result back through messages. + +Chrome Extension +---------------- + +WebLLM can be used in Chrome extensions to empower local LLM inference. 
You can find examples of building Chrome extensions using WebLLM in `examples/chrome-extension `_ and `examples/chrome-extension-webgpu-service-worker `_. The latter leverages a service worker, so the extension is persistent in the background.
+
+Additionally, we have a full Chrome extension project, `WebLLM Assistant `_, which leverages WebLLM to provide a personal web-browsing copilot experience. Feel free to check it out and contribute if you are interested.
+
+
+Other Customization
+-------------------
+
+Using IndexedDB Cache
+^^^^^^^^^^^^^^^^^^^^^
+
+Set ``appConfig`` in ``MLCEngineConfig`` to enable caching for faster subsequent model loads.
+
+.. code-block:: typescript
+
+    const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct", {
+      appConfig: {
+        useIndexedDB: true,
+        models: [
+          { model_id: "Llama-3.1-8B", model_path: "/models/llama3" },
+        ],
+      },
+    });
+
+Customizing Token Behavior
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Modify ``logit_bias`` in ``GenerationConfig`` to influence token likelihood:
+
+.. code-block:: typescript
+
+    const messages = [
+      { role: "user", content: "Describe WebLLM in detail." },
+    ];
+
+    const response = await engine.chatCompletion({
+      messages,
+      logit_bias: { "50256": -100 }, // Example: Prevent specific token generation
+    });
diff --git a/docs/user/api_reference.rst b/docs/user/api_reference.rst
new file mode 100644
index 00000000..0a28cf79
--- /dev/null
+++ b/docs/user/api_reference.rst
@@ -0,0 +1,202 @@
+.. _api-reference:
+
+WebLLM API Reference
+====================
+
+The ``MLCEngine`` class is the core interface of WebLLM. It enables model loading, chat completions, embeddings, and other operations. Below, we document its methods, along with the associated configuration interfaces.
+
+Interfaces
+----------
+
+The following interfaces are used as parameters or configurations within ``MLCEngine`` methods. They are linked to their respective methods for reference.
+
+MLCEngineConfig
+^^^^^^^^^^^^^^^
+
+Optional configurations for ``CreateMLCEngine()`` and ``CreateWebWorkerMLCEngine()``.
+
+
+- **Fields**:
+  - ``appConfig``: Configure the app, including the list of models and whether to use the IndexedDB cache.
+  - ``initProgressCallback``: A callback for showing the progress of loading the model.
+  - ``logitProcessorRegistry``: A registry for stateful logit processors; see ``webllm.LogitProcessor``.
+
+
+- **Usage**:
+  - ``appConfig``: Contains application-specific settings, including:
+    - Model configurations.
+    - IndexedDB caching preferences.
+  - ``initProgressCallback``: Allows developers to visualize model loading progress by implementing a callback.
+  - ``logitProcessorRegistry``: A ``Map`` object for registering custom logit processors. Only applies to ``MLCEngine``.
+
+
+.. note:: All fields are optional, and ``logitProcessorRegistry`` is only used for ``MLCEngine``.
+
+
+Example:
+
+.. code-block:: typescript
+
+    const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct", {
+      appConfig: { /* app-specific config */ },
+      initProgressCallback: (progress) => console.log(progress),
+    });
+
+
+GenerationConfig
+^^^^^^^^^^^^^^^^
+
+Configurations for a single generation task, primarily used in chat completions.
+
+- **Fields**:
+  - ``repetition_penalty``, ``ignore_eos``: Specific to MLC models.
+  - ``top_p``, ``temperature``, ``max_tokens``, ``stop``: Common with OpenAI APIs.
+  - ``logit_bias``, ``n``: Additional parameters for sampling control.
+ +- **Usage**: + - Fields like ``repetition_penalty`` and ``ignore_eos`` allow fine control over the output generation behavior. + - Common parameters shared with OpenAI APIs (e.g., ``temperature``, ``top_p``) ensure compatibility. + + +Example: + +.. code-block:: typescript + + const messages = [ + { role: "system", content: "You are a helpful assistant." }, + { role: "user", content: "Explain WebLLM." }, + ]; + + const response = await engine.chatCompletion({ + messages, + top_p: 0.9, + temperature: 0.8, + max_tokens: 150, + }); + +ChatCompletionRequest +^^^^^^^^^^^^^^^^^^^^^ + +Defines the structure for chat completion requests. + +- **Base Interface**: ``ChatCompletionRequestBase`` + - Contains parameters like ``messages``, ``stream``, ``frequency_penalty``, and ``presence_penalty``. +- **Variants**: + - ``ChatCompletionRequestNonStreaming``: For non-streaming completions. + - ``ChatCompletionRequestStreaming``: For streaming completions. + +- **Usage**: + - Combines settings from ``GenerationConfig`` and ``ChatCompletionRequestBase`` to provide complete control over chat behavior. + - The ``stream`` parameter enables dynamic streaming responses, improving interactivity in conversational agents. + - The ``logit_bias`` feature allows fine-tuning of token generation probabilities, providing a mechanism to restrict or encourage specific outputs. + + +Example: + +.. code-block:: typescript + + const response = await engine.chatCompletion({ + messages: [ + { role: "user", content: "Tell me about WebLLM." }, + ], + stream: true, + }); + +Model Loading +------------- + +``MLCEngine.reload(modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]): Promise`` + +Loads the specified model(s) into the engine. Uses ``MLCEngineConfig`` during initialization. + +- Parameters: + - ``modelId``: Identifier(s) for the model(s) to load. + - ``chatOpts``: Configuration for generation (see ``GenerationConfig``). + +Example: + +.. code-block:: typescript + + await engine.reload(["Llama-3.1-8B", "Gemma-2B"], [ + { temperature: 0.7 }, + { top_p: 0.9 }, + ]); + +``MLCEngine.unload(): Promise`` + +Unloads all loaded models and clears their associated configurations. + +Example: + +.. code-block:: typescript + + await engine.unload(); + +--- + +Chat Completions +---------------- + +``MLCEngine.chat.completions.create(request: ChatCompletionRequest): Promise>`` + +Generates chat-based completions using a specified request configuration. + +- Parameters: + - ``request``: A ``ChatCompletionRequest`` instance. + +Example: + +.. code-block:: typescript + + const response = await engine.chat.completions.create({ + messages: [ + { role: "system", content: "You are a helpful AI assistant." }, + { role: "user", content: "What is WebLLM?" }, + ], + temperature: 0.8, + stream: false, + }); + +--- + +Utility Methods +^^^^^^^^^^^^^^^ + +``MLCEngine.getMessage(modelId?: string): Promise`` + +Retrieves the current output message from the specified model. + +``MLCEngine.resetChat(keepStats?: boolean, modelId?: string): Promise`` + +Resets the chat history and optionally retains usage statistics. + +GPU Information +---------------- + +The following methods provide detailed information about the GPU used for WebLLM computations. + +``MLCEngine.getGPUVendor(): Promise`` + +Retrieves the vendor name of the GPU used for computations. Useful for understanding the hardware capabilities during inference. + +- **Returns**: A string indicating the GPU vendor (e.g., "Intel", "NVIDIA"). + +Example: + +.. 
code-block:: typescript
+
+    const gpuVendor = await engine.getGPUVendor();
+    console.log(`GPU Vendor: ${gpuVendor}`);
+
+``MLCEngine.getMaxStorageBufferBindingSize(): Promise<number>``
+
+Returns the maximum storage buffer size supported by the GPU. This is important when working with larger models that require significant memory for processing.
+
+- **Returns**: A number representing the maximum size in bytes.
+
+Example:
+
+.. code-block:: typescript
+
+    const maxBufferSize = await engine.getMaxStorageBufferBindingSize();
+    console.log(`Max Storage Buffer Binding Size: ${maxBufferSize}`);
diff --git a/docs/user/basic_usage.rst b/docs/user/basic_usage.rst
new file mode 100644
index 00000000..7f77409f
--- /dev/null
+++ b/docs/user/basic_usage.rst
@@ -0,0 +1,120 @@
+Basic Usage
+================
+
+Model Records in WebLLM
+-----------------------
+
+Each of the models available in WebLLM is registered as an instance of
+``ModelRecord`` and can be accessed at
+`webllm.prebuiltAppConfig.model_list `__.
+
+Creating an MLCEngine
+---------------------
+
+WebLLM APIs are exposed through the ``MLCEngine`` interface. You can create an ``MLCEngine`` instance and load the model by calling the ``CreateMLCEngine()`` factory function.
+
+(Note that loading a model requires downloading its weights, which can take a significant amount of time on the very first run, before the weights are cached. You should handle this asynchronous call properly.)
+
+``MLCEngine`` can be instantiated in two ways:
+
+1. Using the factory function ``CreateMLCEngine``.
+2. Instantiating the ``MLCEngine`` class directly and using ``reload()`` to load models.
+
+.. code-block:: typescript
+
+    import { CreateMLCEngine, MLCEngine } from "@mlc-ai/web-llm";
+
+    // Initialize with a progress callback
+    const initProgressCallback = (progress) => {
+      console.log("Model loading progress:", progress);
+    };
+
+    // Using CreateMLCEngine
+    const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct", { initProgressCallback });
+
+    // Direct instantiation
+    const engineInstance = new MLCEngine({ initProgressCallback });
+    await engineInstance.reload("Llama-3.1-8B-Instruct");
+
+Under the hood, the ``CreateMLCEngine`` factory function performs two steps: it first creates an engine instance (synchronous) and then loads the model (asynchronous). You can also do them separately in your application.
+
+.. code-block:: typescript
+
+    import { MLCEngine } from "@mlc-ai/web-llm";
+
+    // This is a synchronous call that returns immediately
+    const engine = new MLCEngine({
+      initProgressCallback: initProgressCallback
+    });
+
+    // This is an asynchronous call and can take a long time to finish
+    await engine.reload(selectedModel);
+
+
+Chat Completion
+---------------
+
+Chat completions can be invoked using OpenAI-style chat APIs through the ``engine.chat.completions`` interface of an initialized ``MLCEngine``. Check :ref:`api-reference` for the full list of parameters and their descriptions.
+
+(Note: Since the model is determined at ``MLCEngine`` initialization time, the ``model`` parameter is not supported and will be **ignored**. Instead, call ``CreateMLCEngine(model)`` or ``engine.reload(model)`` to reinitialize the engine to use a specific model.)
+
+.. code-block:: typescript
+
+    const messages = [
+      { role: "system", content: "You are a helpful AI assistant." },
+      { role: "user", content: "Hello!" }
+    ];
+
+    const reply = await engine.chat.completions.create({
+      messages,
+    });
+
+    console.log(reply.choices[0].message);
+    console.log(reply.usage);
+
+
+Streaming Chat Completion
+-------------------------
+
+Streaming chat completion can be enabled by passing the ``stream: true`` parameter to the ``engine.chat.completions.create`` call configuration. Check :ref:`api-reference` for the full list of parameters.
+
+.. code-block:: typescript
+
+    const messages = [
+      { role: "system", content: "You are a helpful AI assistant." },
+      { role: "user", content: "Hello!" },
+    ];
+
+    // Chunks is an AsyncGenerator object
+    const chunks = await engine.chat.completions.create({
+      messages,
+      temperature: 1,
+      stream: true, // <-- Enable streaming
+      stream_options: { include_usage: true },
+    });
+
+    let reply = "";
+    for await (const chunk of chunks) {
+      reply += chunk.choices[0]?.delta.content || "";
+      console.log(reply);
+      if (chunk.usage) {
+        console.log(chunk.usage); // only last chunk has usage
+      }
+    }
+
+    const fullReply = await engine.getMessage();
+    console.log(fullReply);
+
+
+Chatbot Examples
+----------------
+
+Learn how to use WebLLM to integrate large language models into your applications and generate chat completions through these simple chatbot examples:
+
+- `Example in JSFiddle `_
+- `Example in CodePen `_
+
+For an advanced example of a larger, more complex project, check `WebLLM Chat `_.
+
+More examples for different use cases are available in the examples folder.
+
+
diff --git a/docs/user/get_started.rst b/docs/user/get_started.rst
new file mode 100644
index 00000000..f88613d4
--- /dev/null
+++ b/docs/user/get_started.rst
@@ -0,0 +1,75 @@
+Getting Started with WebLLM
+===========================
+
+This guide will help you set up WebLLM in your project, install the necessary dependencies, and verify your setup.
+
+
+WebLLM Chat
+-----------
+
+If you want to experience AI chat powered by local LLM inference and understand how WebLLM works, try out `WebLLM Chat `__, which provides a great example
+of integrating WebLLM into a full web application.
+
+A WebGPU-compatible browser is needed to run WebLLM-powered web applications.
+You can download the latest Google Chrome and use `WebGPU Report `__
+to verify the functionality of WebGPU in your browser.
+
+Installation
+------------
+
+WebLLM offers a minimalist and modular interface to access the chatbot in the browser. The package is designed in a modular way to hook into any UI component.
+
+WebLLM is available as an `npm package `_ and is also delivered via CDN. Therefore, you can install WebLLM using Node.js package managers like npm, yarn, or pnpm, or import the package directly via CDN.
+
+Using Package Managers
+^^^^^^^^^^^^^^^^^^^^^^
+Install WebLLM via your preferred package manager:
+
+.. code-block:: bash
+
+    # npm
+    npm install @mlc-ai/web-llm
+    # yarn
+    yarn add @mlc-ai/web-llm
+    # pnpm
+    pnpm install @mlc-ai/web-llm
+
+Import WebLLM into your project:
+
+.. code-block:: javascript
+
+    // Import everything
+    import * as webllm from "@mlc-ai/web-llm";
+
+    // Or only import what you need
+    import { CreateMLCEngine } from "@mlc-ai/web-llm";
+
+Using CDN
+^^^^^^^^^
+Thanks to `jsdelivr.com `_, WebLLM can be imported directly through a URL and works out of the box on cloud development platforms like `jsfiddle.net `_, `Codepen.io `_, and `Scribbler `_:
+
+.. code-block:: javascript
+
+    import * as webllm from "https://esm.run/@mlc-ai/web-llm";
+
+This method is especially useful for online environments like CodePen and JSFiddle, or for local experiments.
+
+Verifying Installation
+^^^^^^^^^^^^^^^^^^^^^^
+Run the following script to verify the installation:
+
+.. code-block:: javascript
+
+    import { CreateMLCEngine } from "@mlc-ai/web-llm";
+    console.log("WebLLM loaded successfully!");
+
+
+Online IDE Sandbox
+------------------
+
+Instead of setting up WebLLM locally, you can also try it out in online JavaScript IDE sandboxes like:
+
+- `Example in JSFiddle `_
+- `Example in CodePen `_
+
+
diff --git a/scripts/gh_deploy_site.sh b/scripts/gh_deploy_site.sh
index ab6faf55..0baf098d 100755
--- a/scripts/gh_deploy_site.sh
+++ b/scripts/gh_deploy_site.sh
@@ -1,7 +1,11 @@
 #!/bin/bash
 set -euxo pipefail
+export PYTHONPATH=$PWD/python
+cd docs && make html && cd ..
 cd site && jekyll b && cd ..
+rm -rf site/_site/docs
+cp -r docs/_build/html site/_site/docs
 git fetch
 git checkout -B gh-pages origin/gh-pages
diff --git a/site/_includes/hero.html b/site/_includes/hero.html
index a2c3c314..404bc1d7 100644
--- a/site/_includes/hero.html
+++ b/site/_includes/hero.html
@@ -2,7 +2,7 @@

WebLLM: High-Performance In-Browser LLM Inference Engine