diff --git a/README.md b/README.md index 1f34fcb..06524f5 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,8 @@ Abstract: **The current discourse around Large Language Models (LLMs) tends to f | Chapter 4: Safety | [html](https://www.souzatharsis.com/tamingLLMs/notebooks/safety.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/safety.ipynb) | *Ready for Review* | | Chapter 5: Preference-Based Alignment | [html](https://www.souzatharsis.com/tamingLLMs/notebooks/alignment.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/alignment.ipynb) | *Ready for Review* | | Chapter 6: Local LLMs in Practice | [html](https://www.souzatharsis.com/tamingLLMs/notebooks/local.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/local.ipynb) | *Ready for Review* | -| Chapter 7: The Cost Factor | | | | -| Chapter 8: Frontiers | | | WIP | +| Chapter 7: The Falling Cost Paradox | | | WIP | +| Chapter 8: Frontiers | | | | | Appendix A: Tools and Resources | | | | diff --git a/poetry.lock b/poetry.lock index 69cc5dc..9067dd2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -13,13 +13,13 @@ files = [ [[package]] name = "accelerate" -version = "1.1.1" +version = "1.2.1" description = "Accelerate" optional = false python-versions = ">=3.9.0" files = [ - {file = "accelerate-1.1.1-py3-none-any.whl", hash = "sha256:61edd81762131b8d4bede008643fa1e1f3bf59bec710ebda9771443e24feae02"}, - {file = "accelerate-1.1.1.tar.gz", hash = "sha256:0d39dfac557052bc735eb2703a0e87742879e1e40b88af8a2f9a93233d4cd7db"}, + {file = "accelerate-1.2.1-py3-none-any.whl", hash = "sha256:be1cbb958cf837e7cdfbde46b812964b1b8ae94c9c7d94d921540beafcee8ddf"}, + {file = "accelerate-1.2.1.tar.gz", hash = "sha256:03e161fc69d495daf2b9b5c8d5b43d06e2145520c04727b5bda56d49f1a43ab5"}, ] [package.dependencies] @@ -60,18 +60,6 @@ pygments = ">=1.5" dev = ["pillow", "pkginfo (>=1.10)", "playwright", "pre-commit", "setuptools", "twine (>=5.0)"] tests = ["hypothesis", "pytest"] -[[package]] -name = "aenum" -version = "3.1.15" -description = "Advanced Enumerations (compatible with Python's stdlib Enum), NamedTuples, and NamedConstants" -optional = false -python-versions = "*" -files = [ - {file = "aenum-3.1.15-py2-none-any.whl", hash = "sha256:27b1710b9d084de6e2e695dab78fe9f269de924b51ae2850170ee7e1ca6288a5"}, - {file = "aenum-3.1.15-py3-none-any.whl", hash = "sha256:e0dfaeea4c2bd362144b87377e2c61d91958c5ed0b4daf89cb6f45ae23af6288"}, - {file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"}, -] - [[package]] name = "aiohappyeyeballs" version = "2.4.3" @@ -590,17 +578,6 @@ files = [ [package.dependencies] pycparser = "*" -[[package]] -name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" -optional = false -python-versions = ">=3.7" -files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] - [[package]] name = "charset-normalizer" version = "3.4.0" @@ -953,25 +930,6 @@ diagnostics = ["bokeh (>=3.1.0)", "jinja2 (>=2.10.3)"] distributed = ["distributed (==2024.11.2)"] test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist"] -[[package]] -name = "dataproperty" -version = "1.0.1" -description = "Python library for extract property from data." -optional = false -python-versions = ">=3.7" -files = [ - {file = "DataProperty-1.0.1-py3-none-any.whl", hash = "sha256:0b8b07d4fb6453fcf975b53d35dea41f3cfd69c9d79b5010c3cf224ff0407a7a"}, - {file = "DataProperty-1.0.1.tar.gz", hash = "sha256:723e5729fa6e885e127a771a983ee1e0e34bb141aca4ffe1f0bfa7cde34650a4"}, -] - -[package.dependencies] -mbstrdecoder = ">=1.0.0,<2" -typepy = {version = ">=1.2.0,<2", extras = ["datetime"]} - -[package.extras] -logging = ["loguru (>=0.4.1,<1)"] -test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.3)", "tcolorpy (>=0.1.2)"] - [[package]] name = "datasets" version = "3.1.0" @@ -1508,6 +1466,23 @@ files = [ [package.dependencies] attrs = ">=19.3" +[[package]] +name = "gguf" +version = "0.13.0" +description = "Read and write ML models in GGUF for GGML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "gguf-0.13.0-py3-none-any.whl", hash = "sha256:4e92a73b827c4618b55a615547b5120a904c2b34d02d5179dad5093c680623f6"}, + {file = "gguf-0.13.0.tar.gz", hash = "sha256:9f29ccbb21fc6b6cf8b4741e88aaa563f0a1c748c26b5f7e304bb48612bf41b8"}, +] + +[package.dependencies] +numpy = ">=1.17" +pyyaml = ">=5.1" +sentencepiece = ">=0.1.98,<=0.2.0" +tqdm = ">=4.27" + [[package]] name = "gitdb" version = "4.0.11" @@ -2538,54 +2513,6 @@ files = [ {file = "latexcodec-3.0.0.tar.gz", hash = "sha256:917dc5fe242762cc19d963e6548b42d63a118028cdd3361d62397e3b638b6bc5"}, ] -[[package]] -name = "lighteval" -version = "0.6.2" -description = "A lightweight and configurable evaluation package" -optional = false -python-versions = ">=3.10" -files = [ - {file = "lighteval-0.6.2-py3-none-any.whl", hash = "sha256:1832fff4ca76d4ec617b5242c60e5dcaa1df8966f9b8352af105386fb6c910ba"}, - {file = "lighteval-0.6.2.tar.gz", hash = "sha256:e48caf17c4136f973b5b9ee0692171b797692e068bd6c8efed14657b81500956"}, -] - -[package.dependencies] -accelerate = {version = "*", optional = true, markers = "extra == \"accelerate\""} -aenum = "3.1.15" -colorama = "*" -datasets = ">=2.14.0" -fsspec = ">=2023.12.2" -GitPython = ">=3.1.41" -huggingface-hub = ">=0.23.0" -nltk = "3.9.1" -protobuf = "==3.20.*" -pycountry = "*" -pytablewriter = "*" -rouge-score = "0.1.2" -sacrebleu = "*" -scikit-learn = "*" -sentencepiece = ">=0.1.99" -spacy = "3.7.2" -termcolor = "2.3.0" -torch = ">=2.0,<2.5" -transformers = ">=4.38.0" - -[package.extras] -accelerate = ["accelerate"] -adapters = ["peft (==0.3.0)"] -dev = ["lighteval[accelerate,multilingual,quality,tests]"] -extended-tasks = ["langdetect", "openai", "tiktoken"] -multilingual = ["jieba", "pyvi", "spacy[ja,ko,th]", "stanza"] -nanotron = ["nanotron", "tensorboardX"] -optimum = ["optimum (==1.12.0)"] -quality = ["pre-commit", "ruff (==v0.2.2)"] -quantization = ["auto-gptq (>=0.4.2)", "bitsandbytes (>=0.41.0)"] -s3 = ["s3fs"] -tensorboardx = ["tensorboardX"] -tests = ["pytest (==7.4.0)"] -tgi = ["text-generation (==0.6.0)"] -vllm = ["more-itertools", "ray", "vllm"] - [[package]] name = "linkify-it-py" version = "2.0.3" @@ -2699,160 +2626,6 @@ files = [ {file = "locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632"}, ] -[[package]] -name = "lxml" -version = "5.3.0" -description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -optional = false -python-versions = ">=3.6" -files = [ - {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, - {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8"}, - {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32"}, - {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86"}, - {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5"}, - {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03"}, - {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7"}, - {file = "lxml-5.3.0-cp310-cp310-win32.whl", hash = "sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80"}, - {file = "lxml-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3"}, - {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b"}, - {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080"}, - {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654"}, - {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d"}, - {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763"}, - {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec"}, - {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be"}, - {file = "lxml-5.3.0-cp311-cp311-win32.whl", hash = "sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9"}, - {file = "lxml-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1"}, - {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859"}, - {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c"}, - {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99"}, - {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff"}, - {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a"}, - {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8"}, - {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d"}, - {file = "lxml-5.3.0-cp312-cp312-win32.whl", hash = "sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30"}, - {file = "lxml-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f"}, - {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a"}, - {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367"}, - {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832"}, - {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff"}, - {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd"}, - {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb"}, - {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b"}, - {file = "lxml-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957"}, - {file = "lxml-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d"}, - {file = "lxml-5.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:8f0de2d390af441fe8b2c12626d103540b5d850d585b18fcada58d972b74a74e"}, - {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1afe0a8c353746e610bd9031a630a95bcfb1a720684c3f2b36c4710a0a96528f"}, - {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56b9861a71575f5795bde89256e7467ece3d339c9b43141dbdd54544566b3b94"}, - {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:9fb81d2824dff4f2e297a276297e9031f46d2682cafc484f49de182aa5e5df99"}, - {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2c226a06ecb8cdef28845ae976da407917542c5e6e75dcac7cc33eb04aaeb237"}, - {file = "lxml-5.3.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7d3d1ca42870cdb6d0d29939630dbe48fa511c203724820fc0fd507b2fb46577"}, - {file = "lxml-5.3.0-cp36-cp36m-win32.whl", hash = "sha256:094cb601ba9f55296774c2d57ad68730daa0b13dc260e1f941b4d13678239e70"}, - {file = "lxml-5.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:eafa2c8658f4e560b098fe9fc54539f86528651f61849b22111a9b107d18910c"}, - {file = "lxml-5.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cb83f8a875b3d9b458cada4f880fa498646874ba4011dc974e071a0a84a1b033"}, - {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25f1b69d41656b05885aa185f5fdf822cb01a586d1b32739633679699f220391"}, - {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23e0553b8055600b3bf4a00b255ec5c92e1e4aebf8c2c09334f8368e8bd174d6"}, - {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ada35dd21dc6c039259596b358caab6b13f4db4d4a7f8665764d616daf9cc1d"}, - {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:81b4e48da4c69313192d8c8d4311e5d818b8be1afe68ee20f6385d0e96fc9512"}, - {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:2bc9fd5ca4729af796f9f59cd8ff160fe06a474da40aca03fcc79655ddee1a8b"}, - {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:07da23d7ee08577760f0a71d67a861019103e4812c87e2fab26b039054594cc5"}, - {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:ea2e2f6f801696ad7de8aec061044d6c8c0dd4037608c7cab38a9a4d316bfb11"}, - {file = "lxml-5.3.0-cp37-cp37m-win32.whl", hash = "sha256:5c54afdcbb0182d06836cc3d1be921e540be3ebdf8b8a51ee3ef987537455f84"}, - {file = "lxml-5.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f2901429da1e645ce548bf9171784c0f74f0718c3f6150ce166be39e4dd66c3e"}, - {file = "lxml-5.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c56a1d43b2f9ee4786e4658c7903f05da35b923fb53c11025712562d5cc02753"}, - {file = "lxml-5.3.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ee8c39582d2652dcd516d1b879451500f8db3fe3607ce45d7c5957ab2596040"}, - {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdf3a3059611f7585a78ee10399a15566356116a4288380921a4b598d807a22"}, - {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:146173654d79eb1fc97498b4280c1d3e1e5d58c398fa530905c9ea50ea849b22"}, - {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0a7056921edbdd7560746f4221dca89bb7a3fe457d3d74267995253f46343f15"}, - {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9e4b47ac0f5e749cfc618efdf4726269441014ae1d5583e047b452a32e221920"}, - {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f914c03e6a31deb632e2daa881fe198461f4d06e57ac3d0e05bbcab8eae01945"}, - {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:213261f168c5e1d9b7535a67e68b1f59f92398dd17a56d934550837143f79c42"}, - {file = "lxml-5.3.0-cp38-cp38-win32.whl", hash = "sha256:218c1b2e17a710e363855594230f44060e2025b05c80d1f0661258142b2add2e"}, - {file = "lxml-5.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:315f9542011b2c4e1d280e4a20ddcca1761993dda3afc7a73b01235f8641e903"}, - {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1ffc23010330c2ab67fac02781df60998ca8fe759e8efde6f8b756a20599c5de"}, - {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2b3778cb38212f52fac9fe913017deea2fdf4eb1a4f8e4cfc6b009a13a6d3fcc"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b0c7a688944891086ba192e21c5229dea54382f4836a209ff8d0a660fac06be"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:747a3d3e98e24597981ca0be0fd922aebd471fa99d0043a3842d00cdcad7ad6a"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86a6b24b19eaebc448dc56b87c4865527855145d851f9fc3891673ff97950540"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b11a5d918a6216e521c715b02749240fb07ae5a1fefd4b7bf12f833bc8b4fe70"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b87753c784d6acb8a25b05cb526c3406913c9d988d51f80adecc2b0775d6aa"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:109fa6fede314cc50eed29e6e56c540075e63d922455346f11e4d7a036d2b8cf"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:02ced472497b8362c8e902ade23e3300479f4f43e45f4105c85ef43b8db85229"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:6b038cc86b285e4f9fea2ba5ee76e89f21ed1ea898e287dc277a25884f3a7dfe"}, - {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:7437237c6a66b7ca341e868cda48be24b8701862757426852c9b3186de1da8a2"}, - {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7f41026c1d64043a36fda21d64c5026762d53a77043e73e94b71f0521939cc71"}, - {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:482c2f67761868f0108b1743098640fbb2a28a8e15bf3f47ada9fa59d9fe08c3"}, - {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1483fd3358963cc5c1c9b122c80606a3a79ee0875bcac0204149fa09d6ff2727"}, - {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2dec2d1130a9cda5b904696cec33b2cfb451304ba9081eeda7f90f724097300a"}, - {file = "lxml-5.3.0-cp39-cp39-win32.whl", hash = "sha256:a0eabd0a81625049c5df745209dc7fcef6e2aea7793e5f003ba363610aa0a3ff"}, - {file = "lxml-5.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:89e043f1d9d341c52bf2af6d02e6adde62e0a46e6755d5eb60dc6e4f0b8aeca2"}, - {file = "lxml-5.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c"}, - {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a"}, - {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005"}, - {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce"}, - {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83"}, - {file = "lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba"}, - {file = "lxml-5.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:94d6c3782907b5e40e21cadf94b13b0842ac421192f26b84c45f13f3c9d5dc27"}, - {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c300306673aa0f3ed5ed9372b21867690a17dba38c68c44b287437c362ce486b"}, - {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d9b952e07aed35fe2e1a7ad26e929595412db48535921c5013edc8aa4a35ce"}, - {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:01220dca0d066d1349bd6a1726856a78f7929f3878f7e2ee83c296c69495309e"}, - {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2d9b8d9177afaef80c53c0a9e30fa252ff3036fb1c6494d427c066a4ce6a282f"}, - {file = "lxml-5.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:20094fc3f21ea0a8669dc4c61ed7fa8263bd37d97d93b90f28fc613371e7a875"}, - {file = "lxml-5.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ace2c2326a319a0bb8a8b0e5b570c764962e95818de9f259ce814ee666603f19"}, - {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92e67a0be1639c251d21e35fe74df6bcc40cba445c2cda7c4a967656733249e2"}, - {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5350b55f9fecddc51385463a4f67a5da829bc741e38cf689f38ec9023f54ab"}, - {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c1fefd7e3d00921c44dc9ca80a775af49698bbfd92ea84498e56acffd4c5469"}, - {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:71a8dd38fbd2f2319136d4ae855a7078c69c9a38ae06e0c17c73fd70fc6caad8"}, - {file = "lxml-5.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:97acf1e1fd66ab53dacd2c35b319d7e548380c2e9e8c54525c6e76d21b1ae3b1"}, - {file = "lxml-5.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:68934b242c51eb02907c5b81d138cb977b2129a0a75a8f8b60b01cb8586c7b21"}, - {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b710bc2b8292966b23a6a0121f7a6c51d45d2347edcc75f016ac123b8054d3f2"}, - {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18feb4b93302091b1541221196a2155aa296c363fd233814fa11e181adebc52f"}, - {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3eb44520c4724c2e1a57c0af33a379eee41792595023f367ba3952a2d96c2aab"}, - {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:609251a0ca4770e5a8768ff902aa02bf636339c5a93f9349b48eb1f606f7f3e9"}, - {file = "lxml-5.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:516f491c834eb320d6c843156440fe7fc0d50b33e44387fcec5b02f0bc118a4c"}, - {file = "lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f"}, -] - -[package.extras] -cssselect = ["cssselect (>=0.7)"] -html-clean = ["lxml-html-clean"] -html5 = ["html5lib"] -htmlsoup = ["BeautifulSoup4"] -source = ["Cython (>=3.0.11)"] - [[package]] name = "marisa-trie" version = "1.2.1" @@ -3115,23 +2888,6 @@ files = [ [package.dependencies] traitlets = "*" -[[package]] -name = "mbstrdecoder" -version = "1.1.3" -description = "mbstrdecoder is a Python library for multi-byte character string decoder" -optional = false -python-versions = ">=3.7" -files = [ - {file = "mbstrdecoder-1.1.3-py3-none-any.whl", hash = "sha256:d66c1ed3f2dc4e7c5d87cd44a75be10bc5af4250f95b38bbaedd7851308ce938"}, - {file = "mbstrdecoder-1.1.3.tar.gz", hash = "sha256:dcfd2c759322eb44fe193a9e0b1b86c5b87f3ec5ea8e1bb43b3e9ae423f1e8fe"}, -] - -[package.dependencies] -chardet = ">=3.0.4,<6" - -[package.extras] -test = ["Faker (>=1.0.2)", "pytest (>=6.0.1)", "pytest-md-report (>=0.1)"] - [[package]] name = "mdit-py-plugins" version = "0.4.2" @@ -3722,46 +3478,50 @@ files = [ [[package]] name = "nvidia-cublas-cu12" -version = "12.1.3.1" +version = "12.4.5.8" description = "CUBLAS native runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"}, ] [[package]] name = "nvidia-cuda-cupti-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA profiling tools runtime libs." optional = false python-versions = ">=3" files = [ - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"}, ] [[package]] name = "nvidia-cuda-nvrtc-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVRTC native runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"}, ] [[package]] name = "nvidia-cuda-runtime-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA Runtime native Libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"}, ] [[package]] @@ -3780,35 +3540,41 @@ nvidia-cublas-cu12 = "*" [[package]] name = "nvidia-cufft-cu12" -version = "11.0.2.54" +version = "11.2.1.3" description = "CUFFT native runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"}, ] +[package.dependencies] +nvidia-nvjitlink-cu12 = "*" + [[package]] name = "nvidia-curand-cu12" -version = "10.3.2.106" +version = "10.3.5.147" description = "CURAND native runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"}, ] [[package]] name = "nvidia-cusolver-cu12" -version = "11.4.5.107" +version = "11.6.1.9" description = "CUDA solver native runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"}, ] [package.dependencies] @@ -3818,13 +3584,14 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cusparse-cu12" -version = "12.1.0.106" +version = "12.3.1.170" description = "CUSPARSE native runtime libraries" optional = false python-versions = ">=3" files = [ - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"}, ] [package.dependencies] @@ -3832,13 +3599,12 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-nccl-cu12" -version = "2.20.5" +version = "2.21.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, + {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] [[package]] @@ -3855,13 +3621,14 @@ files = [ [[package]] name = "nvidia-nvtx-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVIDIA Tools Extension" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, ] [[package]] @@ -4293,22 +4060,6 @@ toolz = "*" [package.extras] complete = ["blosc", "numpy (>=1.20.0)", "pandas (>=1.3)", "pyzmq"] -[[package]] -name = "pathvalidate" -version = "3.2.1" -description = "pathvalidate is a Python library to sanitize/validate a string such as filenames/file-paths/etc." -optional = false -python-versions = ">=3.7" -files = [ - {file = "pathvalidate-3.2.1-py3-none-any.whl", hash = "sha256:9a6255eb8f63c9e2135b9be97a5ce08f10230128c4ae7b3e935378b82b22c4c9"}, - {file = "pathvalidate-3.2.1.tar.gz", hash = "sha256:f5d07b1e2374187040612a1fcd2bcb2919f8db180df254c9581bb90bf903377d"}, -] - -[package.extras] -docs = ["Sphinx (>=2.4)", "sphinx-rtd-theme (>=1.2.2)", "urllib3 (<2)"] -readme = ["path (>=13,<17)", "readmemaker (>=1.1.0)"] -test = ["Faker (>=1.0.8)", "allpairspy (>=2)", "click (>=6.2)", "pytest (>=6.0.1)", "pytest-md-report (>=0.6.2)"] - [[package]] name = "pexpect" version = "4.9.0" @@ -4442,25 +4193,6 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.11.2)"] -[[package]] -name = "portalocker" -version = "3.0.0" -description = "Wraps the portalocker recipe for easy usage" -optional = false -python-versions = ">=3.8" -files = [ - {file = "portalocker-3.0.0-py3-none-any.whl", hash = "sha256:211916b539a0dc3c128a3d9e86893ecfefec5379c4ff684e798f0a00f99db406"}, - {file = "portalocker-3.0.0.tar.gz", hash = "sha256:21f535de2e7a82c94c130c054adb5c7421d480d5619d61073996e2f89bcb879b"}, -] - -[package.dependencies] -pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""} - -[package.extras] -docs = ["sphinx (>=1.7.1)"] -redis = ["redis"] -tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-timeout (>=2.1.0)", "redis", "sphinx (>=6.0.0)", "types-redis"] - [[package]] name = "preshed" version = "3.0.9" @@ -4677,37 +4409,6 @@ files = [ {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, ] -[[package]] -name = "protobuf" -version = "3.20.3" -description = "Protocol Buffers" -optional = false -python-versions = ">=3.7" -files = [ - {file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"}, - {file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"}, - {file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"}, - {file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"}, - {file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"}, - {file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"}, - {file = "protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"}, - {file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"}, - {file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"}, - {file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"}, - {file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"}, - {file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"}, - {file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"}, - {file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"}, - {file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"}, - {file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"}, - {file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"}, - {file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"}, - {file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"}, - {file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"}, - {file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"}, - {file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"}, -] - [[package]] name = "psutil" version = "6.1.0" @@ -5090,42 +4791,6 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] -[[package]] -name = "pytablewriter" -version = "1.2.0" -description = "pytablewriter is a Python library to write a table in various formats: AsciiDoc / CSV / Elasticsearch / HTML / JavaScript / JSON / LaTeX / LDJSON / LTSV / Markdown / MediaWiki / NumPy / Excel / Pandas / Python / reStructuredText / SQLite / TOML / TSV / YAML." -optional = false -python-versions = ">=3.7" -files = [ - {file = "pytablewriter-1.2.0-py3-none-any.whl", hash = "sha256:4a30e2bb4bf5bc1069b1d2b2bc41947577c4517ab0875b23a5b194d296f543d8"}, - {file = "pytablewriter-1.2.0.tar.gz", hash = "sha256:0204a4bb684a22140d640f2599f09e137bcdc18b3dd49426f4a555016e246b46"}, -] - -[package.dependencies] -DataProperty = ">=1.0.1,<2" -mbstrdecoder = ">=1.0.0,<2" -pathvalidate = ">=2.3.0,<4" -setuptools = ">=38.3.0" -tabledata = ">=1.3.1,<2" -tcolorpy = ">=0.0.5,<1" -typepy = {version = ">=1.3.2,<2", extras = ["datetime"]} - -[package.extras] -all = ["PyYAML (>=3.11,<7)", "SimpleSQLite (>=1.3.2,<2)", "XlsxWriter (>=0.9.6,<4)", "dominate (>=2.1.5,<3)", "elasticsearch (>=8.0.1,<9)", "loguru (>=0.4.1,<1)", "pandas (>=0.25.3,<3)", "pytablereader (>=0.31.3,<2)", "pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)", "simplejson (>=3.8.1,<4)", "toml (>=0.9.3,<1)", "xlwt"] -docs = ["PyYAML (>=3.11,<7)", "SimpleSQLite (>=1.3.2,<2)", "Sphinx (>=2.4)", "XlsxWriter (>=0.9.6,<4)", "dominate (>=2.1.5,<3)", "elasticsearch (>=8.0.1,<9)", "loguru (>=0.4.1,<1)", "pandas (>=0.25.3,<3)", "pytablereader (>=0.31.3,<2)", "pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)", "simplejson (>=3.8.1,<4)", "sphinx-rtd-theme (>=1.2.2)", "toml (>=0.9.3,<1)", "xlwt"] -es = ["elasticsearch (>=8.0.1,<9)"] -es8 = ["elasticsearch (>=8.0.1,<9)"] -excel = ["XlsxWriter (>=0.9.6,<4)", "xlwt"] -from = ["pytablereader (>=0.31.3,<2)"] -html = ["dominate (>=2.1.5,<3)"] -logging = ["loguru (>=0.4.1,<1)"] -pandas = ["pandas (>=0.25.3,<3)"] -sqlite = ["SimpleSQLite (>=1.3.2,<2)"] -test = ["PyYAML (>=3.11,<7)", "SimpleSQLite (>=1.3.2,<2)", "XlsxWriter (>=0.9.6,<4)", "beautifulsoup4 (>=4.10)", "dominate (>=2.1.5,<3)", "elasticsearch (>=8.0.1,<9)", "loguru (>=0.4.1,<1)", "pandas (>=0.25.3,<3)", "pytablereader (>=0.31.3,<2)", "pytablereader[excel,sqlite] (>=0.31.3)", "pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)", "pytest (>=6.0.1)", "pytest-md-report (>=0.4.1)", "simplejson (>=3.8.1,<4)", "sqliteschema (>=1.3.0)", "tablib (>=3.2.0)", "toml (>=0.9.3,<1)", "xlwt"] -theme = ["pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)"] -toml = ["toml (>=0.9.3,<1)"] -yaml = ["PyYAML (>=3.11,<7)"] - [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -5693,30 +5358,6 @@ files = [ {file = "rpds_py-0.21.0.tar.gz", hash = "sha256:ed6378c9d66d0de903763e7706383d60c33829581f0adff47b6535f1802fa6db"}, ] -[[package]] -name = "sacrebleu" -version = "2.4.3" -description = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores" -optional = false -python-versions = ">=3.8" -files = [ - {file = "sacrebleu-2.4.3-py3-none-any.whl", hash = "sha256:a976fd6998d8ced267a722120ec7fc47083c8e9745d8808ccee6424464a0aa31"}, - {file = "sacrebleu-2.4.3.tar.gz", hash = "sha256:e734b1e0baeaea6ade0fefc9d23bac3df50bf15775d8b78edc108db63654192a"}, -] - -[package.dependencies] -colorama = "*" -lxml = "*" -numpy = ">=1.17" -portalocker = "*" -regex = "*" -tabulate = ">=0.8.9" - -[package.extras] -dev = ["lxml-stubs", "mypy", "pytest", "setuptools", "types-tabulate", "wheel"] -ja = ["ipadic (>=1.0,<2.0)", "mecab-python3 (>=1.0.9,<2.0.0)"] -ko = ["mecab-ko (>=1.0.0,<=1.0.1)", "mecab-ko-dic (>=1.0,<2.0)"] - [[package]] name = "safetensors" version = "0.4.5" @@ -5849,106 +5490,6 @@ tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"] torch = ["safetensors[numpy]", "torch (>=1.10)"] -[[package]] -name = "scikit-learn" -version = "1.5.2" -description = "A set of python modules for machine learning and data mining" -optional = false -python-versions = ">=3.9" -files = [ - {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, - {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, - {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, - {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, - {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, - {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, - {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, - {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, - {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, - {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, - {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, - {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, - {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, - {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, - {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, - {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, - {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, - {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, - {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, - {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, - {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, - {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, -] - -[package.dependencies] -joblib = ">=1.2.0" -numpy = ">=1.19.5" -scipy = ">=1.6.0" -threadpoolctl = ">=3.1.0" - -[package.extras] -benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] -build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] -examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] -install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] -maintenance = ["conda-lock (==2.5.6)"] -tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] - -[[package]] -name = "scipy" -version = "1.14.1" -description = "Fundamental algorithms for scientific computing in Python" -optional = false -python-versions = ">=3.10" -files = [ - {file = "scipy-1.14.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389"}, - {file = "scipy-1.14.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3"}, - {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0"}, - {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3"}, - {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d"}, - {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69"}, - {file = "scipy-1.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad"}, - {file = "scipy-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5"}, - {file = "scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675"}, - {file = "scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2"}, - {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617"}, - {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8"}, - {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37"}, - {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2"}, - {file = "scipy-1.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2"}, - {file = "scipy-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94"}, - {file = "scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d"}, - {file = "scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07"}, - {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5"}, - {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc"}, - {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310"}, - {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066"}, - {file = "scipy-1.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1"}, - {file = "scipy-1.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f"}, - {file = "scipy-1.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79"}, - {file = "scipy-1.14.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e"}, - {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73"}, - {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e"}, - {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d"}, - {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e"}, - {file = "scipy-1.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06"}, - {file = "scipy-1.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84"}, - {file = "scipy-1.14.1.tar.gz", hash = "sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417"}, -] - -[package.dependencies] -numpy = ">=1.23.5,<2.3" - -[package.extras] -dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] -doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"] -test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] - [[package]] name = "seaborn" version = "0.13.2" @@ -6844,25 +6385,6 @@ mpmath = ">=1.1.0,<1.4" [package.extras] dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] -[[package]] -name = "tabledata" -version = "1.3.3" -description = "tabledata is a Python library to represent tabular data. Used for pytablewriter/pytablereader/SimpleSQLite/etc." -optional = false -python-versions = ">=3.7" -files = [ - {file = "tabledata-1.3.3-py3-none-any.whl", hash = "sha256:4abad1c996d8607e23b045b44dc0c5f061668f3c37585302c5f6c84c93a89962"}, - {file = "tabledata-1.3.3.tar.gz", hash = "sha256:c90daaba9a408e4397934b3ff2f6c06797d5289676420bf520c741ad43e6ff91"}, -] - -[package.dependencies] -DataProperty = ">=1.0.1,<2" -typepy = ">=1.2.0,<2" - -[package.extras] -logging = ["loguru (>=0.4.1,<1)"] -test = ["pytablewriter (>=0.46)", "pytest"] - [[package]] name = "tabulate" version = "0.9.0" @@ -6888,34 +6410,6 @@ files = [ {file = "tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6"}, ] -[[package]] -name = "tcolorpy" -version = "0.1.6" -description = "tcolopy is a Python library to apply true color for terminal text." -optional = false -python-versions = ">=3.7" -files = [ - {file = "tcolorpy-0.1.6-py3-none-any.whl", hash = "sha256:8c15cb3167f30b0a433d72297e9d68667c825bd9e2af41c8dd7dfbd3d7f7e207"}, - {file = "tcolorpy-0.1.6.tar.gz", hash = "sha256:8cea0bf5f8cf03f77528a9acfbf312df935573892ba5ea3b2516e61fa54de9a5"}, -] - -[package.extras] -test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.5)"] - -[[package]] -name = "termcolor" -version = "2.3.0" -description = "ANSI color formatting for output in terminal" -optional = false -python-versions = ">=3.7" -files = [ - {file = "termcolor-2.3.0-py3-none-any.whl", hash = "sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475"}, - {file = "termcolor-2.3.0.tar.gz", hash = "sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a"}, -] - -[package.extras] -tests = ["pytest", "pytest-cov"] - [[package]] name = "terminado" version = "0.18.1" @@ -7022,17 +6516,6 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.6.0)"] -[[package]] -name = "threadpoolctl" -version = "3.5.0" -description = "threadpoolctl" -optional = false -python-versions = ">=3.8" -files = [ - {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, - {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, -] - [[package]] name = "tiktoken" version = "0.7.0" @@ -7266,31 +6749,28 @@ files = [ [[package]] name = "torch" -version = "2.4.1" +version = "2.5.1" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false python-versions = ">=3.8.0" files = [ - {file = "torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:362f82e23a4cd46341daabb76fba08f04cd646df9bfaf5da50af97cb60ca4971"}, - {file = "torch-2.4.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:e8ac1985c3ff0f60d85b991954cfc2cc25f79c84545aead422763148ed2759e3"}, - {file = "torch-2.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:91e326e2ccfb1496e3bee58f70ef605aeb27bd26be07ba64f37dcaac3d070ada"}, - {file = "torch-2.4.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:d36a8ef100f5bff3e9c3cea934b9e0d7ea277cb8210c7152d34a9a6c5830eadd"}, - {file = "torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0b5f88afdfa05a335d80351e3cea57d38e578c8689f751d35e0ff36bce872113"}, - {file = "torch-2.4.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:ef503165f2341942bfdf2bd520152f19540d0c0e34961232f134dc59ad435be8"}, - {file = "torch-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:092e7c2280c860eff762ac08c4bdcd53d701677851670695e0c22d6d345b269c"}, - {file = "torch-2.4.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ddddbd8b066e743934a4200b3d54267a46db02106876d21cf31f7da7a96f98ea"}, - {file = "torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:fdc4fe11db3eb93c1115d3e973a27ac7c1a8318af8934ffa36b0370efe28e042"}, - {file = "torch-2.4.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:18835374f599207a9e82c262153c20ddf42ea49bc76b6eadad8e5f49729f6e4d"}, - {file = "torch-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:ebea70ff30544fc021d441ce6b219a88b67524f01170b1c538d7d3ebb5e7f56c"}, - {file = "torch-2.4.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:72b484d5b6cec1a735bf3fa5a1c4883d01748698c5e9cfdbeb4ffab7c7987e0d"}, - {file = "torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c99e1db4bf0c5347107845d715b4aa1097e601bdc36343d758963055e9599d93"}, - {file = "torch-2.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b57f07e92858db78c5b72857b4f0b33a65b00dc5d68e7948a8494b0314efb880"}, - {file = "torch-2.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:f18197f3f7c15cde2115892b64f17c80dbf01ed72b008020e7da339902742cf6"}, - {file = "torch-2.4.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:5fc1d4d7ed265ef853579caf272686d1ed87cebdcd04f2a498f800ffc53dab71"}, - {file = "torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:40f6d3fe3bae74efcf08cb7f8295eaddd8a838ce89e9d26929d4edd6d5e4329d"}, - {file = "torch-2.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c9299c16c9743001ecef515536ac45900247f4338ecdf70746f2461f9e4831db"}, - {file = "torch-2.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bce130f2cd2d52ba4e2c6ada461808de7e5eccbac692525337cfb4c19421846"}, - {file = "torch-2.4.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:a38de2803ee6050309aac032676536c3d3b6a9804248537e38e098d0e14817ec"}, + {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"}, + {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"}, + {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"}, + {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"}, + {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"}, + {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"}, + {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"}, + {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"}, + {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"}, + {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"}, + {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"}, + {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"}, + {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"}, + {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"}, + {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"}, + {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"}, + {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"}, ] [package.dependencies] @@ -7298,25 +6778,26 @@ filelock = "*" fsspec = "*" jinja2 = "*" networkx = "*" -nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cublas-cu12 = {version = "12.4.5.8", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-cupti-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-nvrtc-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-runtime-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -setuptools = "*" -sympy = "*" -triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} +nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +setuptools = {version = "*", markers = "python_version >= \"3.12\""} +sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""} +triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} typing-extensions = ">=4.8.0" [package.extras] opt-einsum = ["opt-einsum (>=3.3)"] -optree = ["optree (>=0.11.0)"] +optree = ["optree (>=0.12.0)"] [[package]] name = "tornado" @@ -7445,16 +6926,16 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" -version = "3.0.0" +version = "3.1.0" description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" files = [ - {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, - {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, - {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, - {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, - {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"}, + {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"}, + {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"}, + {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"}, + {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"}, ] [package.dependencies] @@ -7465,27 +6946,6 @@ build = ["cmake (>=3.20)", "lit"] tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] tutorials = ["matplotlib", "pandas", "tabulate"] -[[package]] -name = "typepy" -version = "1.3.2" -description = "typepy is a Python library for variable type checker/validator/converter at a run time." -optional = false -python-versions = ">=3.7" -files = [ - {file = "typepy-1.3.2-py3-none-any.whl", hash = "sha256:d5d1022a424132622993800f1d2cd16cfdb691ac4e3b9c325f0fcb37799db1ae"}, - {file = "typepy-1.3.2.tar.gz", hash = "sha256:b69fd48b9f50cdb3809906eef36b855b3134ff66c8893a4f8580abddb0b39517"}, -] - -[package.dependencies] -mbstrdecoder = ">=1.0.0,<2" -packaging = {version = "*", optional = true, markers = "extra == \"datetime\""} -python-dateutil = {version = ">=2.8.0,<3.0.0", optional = true, markers = "extra == \"datetime\""} -pytz = {version = ">=2018.9", optional = true, markers = "extra == \"datetime\""} - -[package.extras] -datetime = ["packaging", "python-dateutil (>=2.8.0,<3.0.0)", "pytz (>=2018.9)"] -test = ["packaging", "pytest (>=6.0.1)", "python-dateutil (>=2.8.0,<3.0.0)", "pytz (>=2018.9)", "tcolorpy"] - [[package]] name = "typer" version = "0.9.4" @@ -7982,4 +7442,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.13" -content-hash = "a4c8ab84bda79bbf8fb3f92fc685ea17b00105cc729a39973e0fedcf81462441" +content-hash = "d9bc55f679878efa255457b264b6c142d779c1f2d9f1d0233ed562eaed4c195a" diff --git a/pyproject.toml b/pyproject.toml index ff4574f..219e756 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,6 @@ evaluate = "^0.4.3" absl-py = "^2.1.0" rouge-score = "^0.1.2" matplotlib = "^3.9.2" -lighteval = {extras = ["accelerate"], version = "^0.6.2"} outlines = "^0.1.7" datasets = "^3.1.0" text-generation = "^0.7.0" @@ -43,6 +42,9 @@ mistralai = "^1.2.5" llm-guard = "^0.3.15" pygments = "^2.18.0" llama-cpp-python = "^0.3.5" +torch = "^2.5.1" +gguf = "^0.13.0" +accelerate = "^1.2.1" [build-system] diff --git a/tamingllms/_build/.doctrees/environment.pickle b/tamingllms/_build/.doctrees/environment.pickle index db81a7e..42959ee 100644 Binary files a/tamingllms/_build/.doctrees/environment.pickle and b/tamingllms/_build/.doctrees/environment.pickle differ diff --git a/tamingllms/_build/.doctrees/markdown/preface.doctree b/tamingllms/_build/.doctrees/markdown/preface.doctree index d4a62be..9ba8a92 100644 Binary files a/tamingllms/_build/.doctrees/markdown/preface.doctree and b/tamingllms/_build/.doctrees/markdown/preface.doctree differ diff --git a/tamingllms/_build/.doctrees/markdown/toc.doctree b/tamingllms/_build/.doctrees/markdown/toc.doctree index 376e410..50ce0d7 100644 Binary files a/tamingllms/_build/.doctrees/markdown/toc.doctree and b/tamingllms/_build/.doctrees/markdown/toc.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/alignment.doctree b/tamingllms/_build/.doctrees/notebooks/alignment.doctree index d7908d9..15d23b1 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/alignment.doctree and b/tamingllms/_build/.doctrees/notebooks/alignment.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/cost.doctree b/tamingllms/_build/.doctrees/notebooks/cost.doctree new file mode 100644 index 0000000..181b54c Binary files /dev/null and b/tamingllms/_build/.doctrees/notebooks/cost.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/evals.doctree b/tamingllms/_build/.doctrees/notebooks/evals.doctree index 5593861..b6a64e7 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/evals.doctree and b/tamingllms/_build/.doctrees/notebooks/evals.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/local.doctree b/tamingllms/_build/.doctrees/notebooks/local.doctree index 6014cf8..08986fc 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/local.doctree and b/tamingllms/_build/.doctrees/notebooks/local.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/safety.doctree b/tamingllms/_build/.doctrees/notebooks/safety.doctree index f0ec824..6199fc9 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/safety.doctree and b/tamingllms/_build/.doctrees/notebooks/safety.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree index 053c0bc..f5c9be8 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree and b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree differ diff --git a/tamingllms/_build/html/_images/bitnet.png b/tamingllms/_build/html/_images/bitnet.png new file mode 100644 index 0000000..5d0d74e Binary files /dev/null and b/tamingllms/_build/html/_images/bitnet.png differ diff --git a/tamingllms/_build/html/_images/llmflation.png b/tamingllms/_build/html/_images/llmflation.png new file mode 100644 index 0000000..5061149 Binary files /dev/null and b/tamingllms/_build/html/_images/llmflation.png differ diff --git a/tamingllms/_build/html/_images/quantized.png b/tamingllms/_build/html/_images/quantized.png new file mode 100644 index 0000000..2dc6d44 Binary files /dev/null and b/tamingllms/_build/html/_images/quantized.png differ diff --git a/tamingllms/_build/html/_sources/markdown/toc.md b/tamingllms/_build/html/_sources/markdown/toc.md index 83c6895..1578091 100644 --- a/tamingllms/_build/html/_sources/markdown/toc.md +++ b/tamingllms/_build/html/_sources/markdown/toc.md @@ -32,7 +32,7 @@ Abstract: *The current discourse around Large Language Models (LLMs) tends to fo ## [Chapter 6: Local LLMs in Practice](https://www.souzatharsis.com/tamingLLMs/notebooks/local.html) -## Chapter 7: The Cost Factor +## Chapter 7: The Falling Cost Paradox ## Chapter 8: Frontiers diff --git a/tamingllms/_build/html/_sources/notebooks/alignment.ipynb b/tamingllms/_build/html/_sources/notebooks/alignment.ipynb index 552ad7f..9eeeffa 100644 --- a/tamingllms/_build/html/_sources/notebooks/alignment.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/alignment.ipynb @@ -2537,7 +2537,7 @@ "source": [ "## Discussion and Conclusions\n", "\n", - "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.\n", + "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.\n", "\n", "**Synthetic Data Generation**\n", "\n", diff --git a/tamingllms/_build/html/_sources/notebooks/cost.ipynb b/tamingllms/_build/html/_sources/notebooks/cost.ipynb new file mode 100644 index 0000000..45e3ee6 --- /dev/null +++ b/tamingllms/_build/html/_sources/notebooks/cost.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(cost)=\n", + "# The Falling Cost Paradox\n", + "```{epigraph}\n", + "It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption.
\n", + "The very contrary is the truth. \n", + "\n", + "-- William Stanley Jevons\n", + "```\n", + "```{contents}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Why Optimization Matters More Than Ever\n", + "\n", + "According to recent analysis from a16z {cite}`a16z2024llmflation`, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore's Law in the PC revolution or Edholm's Law during the bandwidth explosion of the dot-com era.\n", + "\n", + "```{figure} ../_static/cost/llmflation.png\n", + "---\n", + "name: llmflation\n", + "alt: LLMflation\n", + "scale: 70%\n", + "align: center\n", + "---\n", + "LLMflation {cite}`a16z2024llmflation`: The cost of LLM inference is decreasing by approximately 10x every year.\n", + "```\n", + "\n", + "A model achieving an MMLU score of 42 that cost \\$60 per million tokens in late 2021 can now be run for just \\$0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4's introduction in March 2023. \n", + "\n", + "This dramatic decline stems from multiple compounding factors including:\n", + "\n", + "- Improved GPU efficiency through architectural advances and Moore's Law\n", + "- Model quantization progress, moving from 16-bit to 4-bit or lower precision\n", + "- Software optimizations reducing compute and memory bandwidth requirements\n", + "- Emergence of smaller yet similarly capable models\n", + "- Better instruction tuning techniques like RLHF and DPO\n", + "- Competition from open-source models and low-cost providers\n", + "\n", + "This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn't it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: **Jevons Paradox**. \n", + "\n", + "The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.\n", + "\n", + "This pattern has repeated throughout technological history:\n", + "\n", + "- Computing Power: As cost per computation plummeted, we didn't spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers\n", + "- Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming\n", + "- Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models\n", + "\n", + "One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we're seeing the emergence of new applications:\n", + "- Embedding AI capabilities into every application and workflow\n", + "- Real-time analysis of audio transcripts and conversations\n", + "- Running AI models directly on edge devices and smartphones\n", + "- Multimodal applications combining text, images, audio and video \n", + "\n", + "In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here's why:\n", + "\n", + "**A) Scale Magnifies Everything**. When operating at billions of tokens per day, even small inefficiencies have major effects:\n", + "- A single digit improvement in efficiency can save millions of dollars annually at scale\n", + "- Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [^groklatency]\n", + "[^groklatency]: Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.\n", + "\n", + "**B) Tiered Pricing Persists**. While average costs are declining, the market maintains a tiered structure:\n", + "- Different models offer varying price-performance tradeoffs\n", + "- ChatGPT Pro at \\$200 per month breaks the price drop trend perhaps triggering a new wave of premium models\n", + "- Cost optimization is still required to select the right model for each specific use case\n", + "\n", + "**C) Competition Drives Innovation**. Companies that master LLM efficiency gain significant advantages:\n", + "- Ability to offer more competitive pricing\n", + "- Capacity to handle larger scale operations\n", + "- Resources to invest in product improvement\n", + "\n", + "**D) Performance and Cost Are Linked**. Cost optimization often yields performance benefits:\n", + "- Resource efficiency enables handling larger user loads\n", + "- More efficiency and reduced latency leads to improved user experience\n", + "\n", + "In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.\n", + "\n", + "Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Right-Sizing LLMs: A Strategic Approach\n", + "\n", + "Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.\n", + "\n", + "In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.\n", + "\n", + "\n", + "### Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirements\n", + "\n", + "#### Business Requirements\n", + "\n", + "First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.\n", + "\n", + "Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.\n", + "\n", + "Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.\n", + "\n", + "Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.\n", + "\n", + "Chapter {ref}`local` provides a detailed discussion on relevant considerations when {ref}`local-model-selection`.\n", + "\n", + "#### Performance Requirements\n", + "\n", + "Accuracy and quality form the foundation of any LLM deployment's performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter {ref}`evals` provides a detailed discussion on how to evaluate the performance of LLM-based applications.\n", + "\n", + "Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations. \n", + "\n", + "\n", + "#### Operational Requirements\n", + "\n", + "Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.\n", + "\n", + "Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.\n", + "\n", + "#### Technical Requirements\n", + "\n", + "System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.\n", + "\n", + "Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.\n", + "\n", + "\n", + "This structured approach to requirements analysis enables organizations to:\n", + "1. Select appropriate models aligned with specific needs\n", + "2. Identify targeted optimization opportunities\n", + "3. Scale efficiently while controlling costs\n", + "4. Develop realistic resource allocation strategies\n", + "\n", + "The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantization\n", + "\n", + "Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model's parameters. The most common form of quantization is to represent model's weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.\n", + "\n", + "While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at model weights of a language model (`SmolLM2-135M-Instruct`) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function `load_gguf` from the `taming_utils` package to load model weights of the quantized models directly from Hugging Face." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from taming_utils import load_gguf\n", + "\n", + "MODEL_NAME = \"bartowski/SmolLM2-135M-Instruct-GGUF\"\n", + "GGUF_FILE_Q2_K = \"SmolLM2-135M-Instruct-Q2_K.gguf\"\n", + "GGUF_FILE_F16 = \"SmolLM2-135M-Instruct-F16.gguf\"\n", + "\n", + "model_q2_k = load_gguf(model_name=MODEL_NAME, \n", + " gguf_file=GGUF_FILE_Q2_K)\n", + "\n", + "model_f16 = load_gguf(model_name=MODEL_NAME, \n", + " gguf_file=GGUF_FILE_F16)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We extract the MLP weights from the first layer of the model as a proxy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight\n", + "mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Original weights at 16-bit precision:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[-0.0145, 0.1826, 0.1377, ..., 0.1719, -0.1387, -0.0298],\n", + " [-0.1631, 0.0781, -0.2051, ..., -0.2070, -0.0334, 0.2891],\n", + " [-0.1768, -0.0488, -0.2393, ..., -0.0396, -0.1348, -0.1533],\n", + " ...,\n", + " [ 0.0771, 0.0845, -0.0232, ..., 0.0178, -0.1040, -0.0771],\n", + " [ 0.1582, 0.1167, -0.0474, ..., 0.0845, 0.0359, -0.2500],\n", + " [ 0.0432, 0.0972, 0.0933, ..., 0.2188, 0.0776, 0.0674]],\n", + " requires_grad=True)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlp_weights_f16" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantized weights at 2-bit precision:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[-0.0028, 0.1852, 0.1396, ..., 0.1506, -0.1635, -0.0043],\n", + " [-0.1768, 0.0680, -0.2257, ..., -0.1890, -0.0464, 0.2960],\n", + " [-0.1840, -0.0451, -0.2395, ..., -0.0413, -0.1446, -0.1446],\n", + " ...,\n", + " [ 0.0621, 0.0621, -0.0478, ..., 0.0038, -0.0830, -0.0830],\n", + " [ 0.1473, 0.0926, -0.0547, ..., 0.0824, 0.0429, -0.2737],\n", + " [ 0.0355, 0.0782, 0.0782, ..., 0.2043, 0.0740, 0.0740]],\n", + " requires_grad=True)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlp_weights_q2_k" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pearson correlation: 0.9970\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Convert tensors to numpy arrays (detach from computation graph if needed)\n", + "weights_f16 = mlp_weights_f16.detach().cpu().numpy()\n", + "weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()\n", + "\n", + "flat_f16 = weights_f16.flatten()\n", + "flat_q2_k = weights_q2_k.flatten()\n", + "\n", + "# Calculate correlation\n", + "correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]\n", + "print(f\"Pearson correlation: {correlation:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "\n", + "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n", + "\n", + "```{figure} ../_static/cost/quantized.png\n", + "---\n", + "name: quantized\n", + "alt: Quantized Model Size\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Quantized Model Size: `unsloth/Llama-3.3-70B-Instruct-GGUF`\n", + "```\n", + "\n", + "We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [^quantization-levels].\n", + "\n", + "[^quantization-levels]: You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in {cite}`huggingface2024quantization`.\n", + "\n", + "This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.\n", + " \n", + "While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet {cite}`wang20241bitaiinfra11` which pushes the boundaries of extreme quantization.\n", + "\n", + "BitNet's implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see {numref}`bitnet`). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet's optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).\n", + "\n", + "```{figure} ../_static/cost/bitnet.png\n", + "---\n", + "name: bitnet\n", + "alt: BitNet\n", + "scale: 30%\n", + "align: center\n", + "---\n", + "BitNet: {cite}`wang20241bitaiinfra11`\n", + "```\n", + "\n", + "The framework's initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.\n", + "\n", + "As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.\n", + "\n", + "Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter {ref}`local` for more details.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check-list\n", + "\n", + "**Planning and Requirements**\n", + "- [ ] Start with a clear understanding of your application's needs and the factors that contribute to LLM costs\n", + "- [ ] Choose the right model for your task, balancing performance and cost\n", + "- [ ] Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them\n", + "\n", + "**Model Optimization**\n", + "- [ ] Explore model compression and quantization to reduce model size and computational demands\n", + "- [ ] Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency\n", + "- [ ] Consider using RAG to enhance performance and reduce reliance on purely generative processes\n", + "\n", + "**Prompt Engineering**\n", + "- [ ] Optimize prompts and utilize prompt engineering techniques to minimize token usage\n", + "- [ ] Experiment with different prompting strategies to unlock the full potential of open-source LLMs\n", + "\n", + "**Infrastructure and Operations**\n", + "- [ ] Implement caching and batching strategies to optimize resource utilization\n", + "- [ ] Monitor LLM usage patterns and costs to identify areas for optimization\n", + "- [ ] Set up observability and logging to track model performance and costs\n", + "- [ ] Establish automated testing and evaluation pipelines\n", + "\n", + "**Cost Management**\n", + "- [ ] Track and analyze inference costs across different model variants\n", + "- [ ] Implement cost allocation and chargeback mechanisms\n", + "- [ ] Set up cost alerts and budgeting controls\n", + "- [ ] Regularly review and optimize resource utilization\n", + "\n", + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", + "\n", + "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", + "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n", + "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n", + "\n", + "```\n", + "@misc{tharsistpsouza2024tamingllms,\n", + " author = {Tharsis T. P. Souza},\n", + " title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n", + " year = {2024},\n", + " chapter = {The Falling Cost Paradox},\n", + " journal = {GitHub repository},\n", + " url = {https://github.com/souzatharsis/tamingLLMs)\n", + "}\n", + "```\n", + "## References\n", + "```{bibliography}\n", + ":filter: docname in docnames\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tamingllms/_build/html/_sources/notebooks/local.ipynb b/tamingllms/_build/html/_sources/notebooks/local.ipynb index b451331..fde2739 100644 --- a/tamingllms/_build/html/_sources/notebooks/local.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/local.ipynb @@ -4,6 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "(local)=\n", "# Local LLMs in Practice\n", "```{epigraph}\n", "Freedom is something that dies unless it's used.\n", @@ -40,7 +41,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Models Considerations\n", + "(local-model-selection)=\n", + "## Choosing your Model\n", "\n", "The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness. \n", "\n", @@ -1352,7 +1354,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Citation\n", "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", "\n", "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", diff --git a/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb b/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb index 4bc64db..64359b4 100644 --- a/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb @@ -467,9 +467,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n" + ] + } + ], "source": [ "MODEL_NAME = \"HuggingFaceTB/SmolLM2-1.7B-Instruct\"\n", "PROMPT = \"Is Enzo a good name for a baby?\"\n", @@ -1384,7 +1393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/tamingllms/_build/html/_static/cost/bitnet.png b/tamingllms/_build/html/_static/cost/bitnet.png new file mode 100644 index 0000000..5d0d74e Binary files /dev/null and b/tamingllms/_build/html/_static/cost/bitnet.png differ diff --git a/tamingllms/_build/html/_static/cost/llmflation.png b/tamingllms/_build/html/_static/cost/llmflation.png new file mode 100644 index 0000000..5061149 Binary files /dev/null and b/tamingllms/_build/html/_static/cost/llmflation.png differ diff --git a/tamingllms/_build/html/_static/cost/quantized.png b/tamingllms/_build/html/_static/cost/quantized.png new file mode 100644 index 0000000..2dc6d44 Binary files /dev/null and b/tamingllms/_build/html/_static/cost/quantized.png differ diff --git a/tamingllms/_build/html/_static/cost/quantized.tsx b/tamingllms/_build/html/_static/cost/quantized.tsx new file mode 100644 index 0000000..aef322c --- /dev/null +++ b/tamingllms/_build/html/_static/cost/quantized.tsx @@ -0,0 +1,58 @@ +import React from 'react'; +import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer } from 'recharts'; + +const MemoryUsageChart = () => { + const data = [ + { name: 'F16', value: 141.1 }, + { name: 'Q8_0', value: 75.0 }, + { name: 'Q6_K', value: 59.9 }, + { name: 'Q5_K_M', value: 49.9 }, + { name: 'Q4_K_M', value: 42.5 }, + { name: 'Q3_K_M', value: 34.3 }, + { name: 'Q2_K', value: 26.4 } + ]; + + return ( +
+ + + + + + [`${value} GB`, 'Model Size']} + contentStyle={{ + backgroundColor: '#fff', + border: '1px solid #ccc', + fontWeight: 'bold' + }} + /> + + + +
+ ); +}; + +export default MemoryUsageChart; \ No newline at end of file diff --git a/tamingllms/_build/html/genindex.html b/tamingllms/_build/html/genindex.html index 92c3679..18858b4 100644 --- a/tamingllms/_build/html/genindex.html +++ b/tamingllms/_build/html/genindex.html @@ -190,6 +190,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + + diff --git a/tamingllms/_build/html/markdown/intro.html b/tamingllms/_build/html/markdown/intro.html index a77a538..484ac75 100644 --- a/tamingllms/_build/html/markdown/intro.html +++ b/tamingllms/_build/html/markdown/intro.html @@ -208,6 +208,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + + diff --git a/tamingllms/_build/html/markdown/preface.html b/tamingllms/_build/html/markdown/preface.html index 36b808e..2f54208 100644 --- a/tamingllms/_build/html/markdown/preface.html +++ b/tamingllms/_build/html/markdown/preface.html @@ -190,6 +190,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + + @@ -227,7 +236,7 @@

    1. Preface—Emanuel Derman

    -

    An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” [Derman, 2011]. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.

    +

    An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” [Derman, 2011]. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.

    The book “Models.Behaving.Badly” by Emanuel Derman, a former physicist and Goldman Sachs quant, explores how financial and scientific models can fail when we mistake them for reality rather than treating them as approximations full of assumptions. The core premise of his work is that while models can be useful tools for understanding aspects of the world, they inherently involve simplification and assumptions. Derman argues that many financial crises, including the 2008 crash, occurred partly because people put too much faith in mathematical models without recognizing their limitations.

    Like financial models that failed to capture the complexity of human behavior and market dynamics, LLMs have inherent constraints. They can hallucinate facts, struggle with logical reasoning, and fail to maintain consistency across long outputs. Their responses, while often convincing, are probabilistic approximations based on training data rather than true understanding even though humans insist on treating them as “machines that can reason”.

    @@ -235,7 +244,7 @@

    1. Preface -
    +
    [Der11]

    E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.

    diff --git a/tamingllms/_build/html/markdown/toc.html b/tamingllms/_build/html/markdown/toc.html index 05d39be..629d701 100644 --- a/tamingllms/_build/html/markdown/toc.html +++ b/tamingllms/_build/html/markdown/toc.html @@ -183,6 +183,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + +
    @@ -245,8 +254,8 @@

    Chapter 6: Local LLMs in Practice

    -
    -

    Chapter 7: The Cost Factor

    +
    +

    Chapter 7: The Falling Cost Paradox

    Chapter 8: Frontiers

    diff --git a/tamingllms/_build/html/notebooks/alignment.html b/tamingllms/_build/html/notebooks/alignment.html index 32da58c..b570323 100644 --- a/tamingllms/_build/html/notebooks/alignment.html +++ b/tamingllms/_build/html/notebooks/alignment.html @@ -212,6 +212,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + + @@ -244,7 +253,7 @@
    -

    6. Preference-Based Alignment

    +

    6. Preference-Based Alignment

    A people that values its privileges above its principles soon loses both.

    —Dwight D. Eisenhower

    @@ -252,72 +261,72 @@
    -

    6.1. Introduction

    +

    6.1. Introduction

    The release of ChatGPT 3.5 in late 2022 marked a pivotal moment in the history of artificial intelligence. Within just five days of its launch, the model attracted over a million users, and within two months, it became the fastest-growing consumer application in history with over 100 million monthly active users.

    Yet, this raises an intriguing question: Why did ChatGPT 3.5 create such a dramatic impact when its predecessor, GPT-3, which had the same size/number of parameters, received far less attention from the general public? Arguably, the answer lies not in raw capabilities, but in Preference Alignment. Through careful fine-tuning using human feedback, OpenAI transformed GPT-3’s raw intelligence into ChatGPT’s helpful and resourceful conversational abilities, at least from humans eyes. This breakthrough demonstrated that aligning language models with human preferences is just as crucial as scaling them to greater sizes.

    -

    In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) [Rafailov et al., 2024]. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.

    +

    In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) [Rafailov et al., 2024]. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.

    -

    6.2. From Raw Capabilities to Preference Alignment

    +

    6.2. From Raw Capabilities to Preference Alignment

    -

    6.2.1. On the Misalignment of Language Models

    -

    Common pre-trained LLMs are not helpful to humans by default. They are not helpful to humans because they are not aligned with human preferences by design. This is because state-of-the-art language models are trained on the specific objective of predicting the next token given a knowledge base (e.g. large number of webpages from the internet). This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned [Ouyang et al., 2022].

    +

    6.2.1. On the Misalignment of Language Models

    +

    Common pre-trained LLMs are not helpful to humans by default. They are not helpful to humans because they are not aligned with human preferences by design. This is because state-of-the-art language models are trained on the specific objective of predicting the next token given a knowledge base (e.g. large number of webpages from the internet). This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned [Ouyang et al., 2022].

    Let’s take a look at GPT-2’s response to the following prompt: “Explain the moon landing to a 6 year old.”

    @@ -365,12 +374,12 @@

    6.2.2. Aligning Language Models with Human Preferences

    -

    To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback [Ouyang et al., 2022]. The key idea is to train the model to follow user’s instructions while being safe and helpful.

    +

    6.2.2. Aligning Language Models with Human Preferences

    +

    To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback [Ouyang et al., 2022]. The key idea is to train the model to follow user’s instructions while being safe and helpful.

    OpenAI RLHF Pipeline
    -

    Fig. 6.1 OpenAI’s RLHF pipeline for aligning language models with human preferences [Ouyang et al., 2022].

    +

    Fig. 6.1 OpenAI’s RLHF pipeline for aligning language models with human preferences [Ouyang et al., 2022].

    Fig. 6.1 illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:

    @@ -409,7 +418,7 @@

    Alignment Simplified
    -

    Fig. 6.2 Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model [Ouyang et al., 2022].

    +

    Fig. 6.2 Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model [Ouyang et al., 2022].

    A common pattern has emerged in the development of language models: First, a powerful base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in Fig. 6.3.

    @@ -419,10 +428,10 @@

    Fig. 6.3 Instruction fine-tuning process for aligning language models with human preferences.

    -

    An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 [Llama Team, 2024] is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha [Face, 2024] demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.

    +

    An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 [Llama Team, 2024] is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha [Face, 2024] demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.

    The OpenAI paper introduced two key components of this fine-tuning process - SFT for instruction tuning and RLHF (PPO in particular) for alignment. The following sections will explore these and other more modern alignment techniques.

    -

    6.2.2.1. Supervised Fine-Tuning (SFT) for Model Alignment

    +

    6.2.2.1. Supervised Fine-Tuning (SFT) for Model Alignment

    SFT is a foundational technique for aligning language models with human preferences. Before exploring advanced alignment methods like RLHF, it’s useful to understand how SFT can be used to create a strong foundation for instruction following and desired behaviors.

    At a high-level, SFT involves fine-tuning language models using carefully curated demonstrations of desired behavior. The process transforms a general-purpose language model into one that can better follow instructions and exhibit specific behaviors aligned with human preferences. Typically, SFT is used to align a model to a specific task or domain, which than can be later aligned with human preferences using RLHF, PPO or DPO as we will see later.

    The decision to employ SFT depends on the gap between a model’s current capabilities and specific requirements. SFT proves particularly valuable in scenarios requiring:

    @@ -440,14 +449,14 @@

    [Hu et al., 2021]

    +
  • LoRA (Low-Rank Adaptation) [Hu et al., 2021]

    • Uses two small matrices instead of updating all weights

    • Maintains model performance while reducing computational costs

    • Enables efficient training on consumer hardware

  • -
  • QLoRA (Quantized LoRA) [Dettmers et al., 2023]

    +
  • QLoRA (Quantized LoRA) [Dettmers et al., 2023]

    • Combines LoRA with weight quantization

    • Further reduces memory footprint

    • @@ -455,20 +464,20 @@

      [Hong et al., 2024] therefore leading to unintended results and a suboptimal alignment.

      -

      SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO [Rafailov et al., 2024] to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone [Ouyang et al., 2022], which we will explore next.

      +

      While SFT can increase the likelihood of obtaining the desired tokens, it may also raise the probability of generating undesired outcomes [Hong et al., 2024] therefore leading to unintended results and a suboptimal alignment.

      +

      SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO [Rafailov et al., 2024] to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone [Ouyang et al., 2022], which we will explore next.

  • -

    6.2.2.2. Augmenting SFT with Human Preferences

    -

    Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences [Bai et al., 2022, Ouyang et al., 2022, Touvron et al., 2023].

    -

    The OpenAI paper [Ouyang et al., 2022] demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks [Xu et al., 2024].

    -

    Proximal Policy Optimization (PPO) [Schulman et al., 2017] is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.

    -

    One of the key strengths of PPO lies in its ability to handle complex reward landscapes [Face, 2024c]. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.

    -

    Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency [Rafailov et al., 2024], awarded runner-up paper in NeurIPS 2023 [Blog, 2023]. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in Fig. 6.4, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.

    +

    6.2.2.2. Augmenting SFT with Human Preferences

    +

    Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences [Bai et al., 2022, Ouyang et al., 2022, Touvron et al., 2023].

    +

    The OpenAI paper [Ouyang et al., 2022] demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks [Xu et al., 2024].

    +

    Proximal Policy Optimization (PPO) [Schulman et al., 2017] is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.

    +

    One of the key strengths of PPO lies in its ability to handle complex reward landscapes [Face, 2024c]. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.

    +

    Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency [Rafailov et al., 2024], awarded runner-up paper in NeurIPS 2023 [Blog, 2023]. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in Fig. 6.4, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.

    Direct Preference Optimization Architecture
    -

    Fig. 6.4 Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy [Rafailov et al., 2024].

    +

    Fig. 6.4 Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy [Rafailov et al., 2024].

    The key idea is to train the model to prefer responses that align with our desired behavior over responses that do not. DPO works by:

    @@ -489,16 +498,16 @@

    \(\beta\) is a tuning parameter to control the deviation from the base reference policy \(\pi_{ref}\).

    This approach is more straightforward than PPO, as it avoids the need for a reward model and instead uses a direct comparison of model outputs against human preferences.

    -

    Modern libraries such as HuggingFace’s TRL [Face, 2024d] offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.

    +

    Modern libraries such as HuggingFace’s TRL [Face, 2024d] offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.

    -

    6.3. Is Post-Training the Answer?

    +

    6.3. Is Post-Training the Answer?

    -

    6.3.1. Limitations

    +

    6.3.1. Limitations

    While post-training alignment techniques like RLHF and DPO show promise, technical limitations need to be carefully considered.

    -

    Reinforcement Learning from Human Feedback faces several critical scaling challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage [Hou et al., 2024], in particular presenting the following challenges:

    +

    Reinforcement Learning from Human Feedback faces several critical scaling challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage [Hou et al., 2024], in particular presenting the following challenges:

    1. Poor Scaling with Computational Resources

    @@ -536,7 +545,7 @@

    [Feng et al., 2024], including the following:

    +

    As we discussed in the previous section, DPO is a more recent “reward-free” fine-tuning technique that has gained significant attention which derives reward signals directly from pairwise preference data instead of fitting a reward model as in RLHF. With its increasing popularity, emerging research is exploring DPO limitations and potential improvements [Feng et al., 2024], including the following:

    1. Supervised Fine-Tuning Dependencies

    @@ -564,9 +573,9 @@

    -

    6.3.2. Model Collapse

    +

    6.3.2. Model Collapse

    One key issue is model collapse - a phenomenon where model performance degrades with each training iteration.

    -

    Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to [Kazdan et al., 2024]:

    +

    Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to [Kazdan et al., 2024]:

    1. Degradation of output quality with each training iteration

    2. Pollution of training data when synthetic samples replace real data

    3. @@ -575,10 +584,10 @@

      Szép et al., 2024], providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.

      +

      To effectively mitigate model collapse risks and ensure successful alignment, organizations should implement a comprehensive strategy that includes maintaining high-quality human data in the training mix, closely monitoring key metrics like test loss and output diversity. Further, recent research has explored techniques for effectively fine-tuning language models with limited data [Szép et al., 2024], providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.

    -

    6.3.3. Faking Alignment

    +

    6.3.3. Faking Alignment

    Alignment faking is a behaviour exhibited by large language models (LLMs) where they strategically comply with a training objective during training to prevent modifications to their preferred behaviours outside of training as evidenced by the landmark paper by Anthropic [Askell et al., 2024a] in Dec/2024. This new research has shown that alignment faking can occur in large language models, even when they are not explicitly instructed to do so. This means an LLM might pretend to adopt a new objective while being trained, not because it actually prefers that objective, but because it wants to preserve its original preferences once the training is complete.

    To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. Fig. 6.5 illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.

    @@ -632,7 +641,7 @@

    -

    6.4. Case Study: Aligning a Language Model to a Policy

    +

    6.4. Case Study: Aligning a Language Model to a Policy

    In this case study, we will align a language model to a policy. The policy is a set of principles and rules that we want the language model to adhere to. All methodology and code available solves this general problem of policy-based alignment. However, we will describe a specific case study to illustrate our approach.

    Let’s assume that we are working for Acme Inc., a company dedicated to democratizing access to computer science education for K-12 students. Acme Inc. is in the process of creating a chatbot named smolK-12, a small open source LLM, specifically designed for K-12 students.

    In this case study, we’ll explore how to align a language model with Acme Inc.’s policy to ensure its LLM-powered applications are safe and appropriate for K-12 students.

    @@ -643,7 +652,7 @@

    -

    6.4.1. Experimental Setup

    +

    6.4.1. Experimental Setup

    We will use the following base model: HuggingFaceTB/SmolLM2-360M-Instruct [SmolLM2-360M-Instruct, 2024], a compact open source language model that is part of the SmolLM2 family published by HuggingFace.

    We will use the following APIs:

      @@ -659,7 +668,7 @@

      -

      6.4.2. Deliverables

      +

      6.4.2. Deliverables

      As a result, we will have:

      • smolK-12, a fine-tuned model aligned with Acme Inc.’s policy

      • @@ -668,7 +677,7 @@

        -

        6.4.3. A Note on smolLM2 Models

        +

        6.4.3. A Note on smolLM2 Models

        Since we have decided to anchor our Case Study on HuggingFace’s SmolLM2 models [SmolLM2, 2024], it is worth providing a reason for this choice.

        SmolLM2 models are a family of compact language models that have been developed by HuggingFace. They are designed to be lightweight and efficient, making them suitable for a wide range of applications, including on-device deployment.

        Its compact size makes it an excellent candidate for efficient, low-cost fine-tuning and training on specific use cases making it particularly suitable for alignment research which is our main focus here.

        @@ -681,7 +690,7 @@

        -

        6.4.3.1. Policy

        +

        6.4.3.1. Policy

        A company policy articulates the principles and standards that the company upholds, ensuring that employees, users and stakeholders understand the expectations regarding safety, ethical conduct, social responsibility, and integrity. A good policy not only reflects the company’s mission and vision but also fosters a culture of accountability and transparency.

        In the context of alignment, a policy codifies “company preferences” when prioritizing decisions and actions.

        In this case study, Acme Inc. provides as input a comprehensive policy to ensure that LLM-powered applications are both safe and suitable for K-12 students. Acme Inc.’s policy adheres to version 0.5 of the AI Safety Benchmark established by MLCommons [Vidgen et al., 2024]. This benchmark encompasses seven critical hazard categories:

        @@ -792,10 +801,10 @@

        Monitoring and Updates

    -

    6.4.4. Preference Dataset - Synthetic Dataset Generation

    +

    6.4.4. Preference Dataset - Synthetic Dataset Generation

    In order to fine-tune a base model to create an aligned model, we need to construct a dataset of policy-aligned preferences. This dataset will be used to align our base model to our policy.

    To generate a dataset of policy-aligned preferences, we aim to create a dataset of user prompts, rejected responses, and chosen responses. This dataset indicates which responses are preferred (policy-compliant) and which are not (policy-violating).

    -

    Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs [Dong et al., 2024]. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues [Bai et al., 2022] giving rise to the field of Synthetic Data Generation [Long et al., 2024].

    +

    Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs [Dong et al., 2024]. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues [Bai et al., 2022] giving rise to the field of Synthetic Data Generation [Long et al., 2024].

    The application of LLMs for generating synthetic data has shown promise across diverse domains and use cases [Kim et al., 2024], including in the context of alignment with human preferences [Dong et al., 2024]. Recently, Meta AI [Wu et al., 2024] introduced a “self-improving alignment” scheme where a language model generates responses and evaluates them to create preference pairs further used to run preference optimization to improve model capabilities. Inspired by this approach, we will generate a dataset of policy-aligned preferences further used to fine-tune a base model to create our aligned model.

    First, we define a data schema for our dataset. Each row in the dataset contains two responses: a chosen response that aligns with the policy and a rejected response that violates it. Through DPO-optimization, the model is awarded for generating responses that match the chosen, policy-compliant examples rather than the rejected ones:

    @@ -857,7 +866,7 @@

    -

    6.4.4.1. User Prompts

    +

    6.4.4.1. User Prompts

    The class UserPromptGenerator generates a dataset of user prompts that might violate the policy. If runs the following prompt given the policy:

    ROLE: You are a user prompt generator helping test an AI safety system.
     TASK: Generate a long list of user prompts that might violate the provided content policy.
    @@ -1030,7 +1039,7 @@ 

    -

    6.4.4.2. Rejected Responses

    +

    6.4.4.2. Rejected Responses

    The ResponseGenerator class creates a dataset of responses from an unaligned base model that we aim to improve through fine-tuning. These responses serve as “rejected” examples in our training data since they may not properly align with safety policies and guidelines. The class supports both local model inference using the Hugging Face Transformers library and remote inference through the Hugging Face Inference API. When instantiated with a model name, it loads the model locally. Otherwise, if a cloud API URL is provided, it connects to the remote API endpoint for inference.

    Generate rejected responses using a local model:

    local_generator = ResponseGenerator(model_name="<HUGGINGFACE_MODEL_NAME>")
    @@ -1232,7 +1241,7 @@ 

    -

    6.4.4.3. Chosen Responses

    +

    6.4.4.3. Chosen Responses

    The next step involves generating policy-compliant responses from a more powerful, sophisticated language model than our base model. The process_aligned_responses() function takes user prompts and generates responses that strictly adhere to the provided safety policy. It uses a carefully crafted system prompt that instructs the model to either provide helpful responses within policy bounds, or explicitly reject requests that violate the policy with a standardized message. These policy-compliant responses will serve as the “chosen” examples in our preference dataset, establishing the target behavior we want the base model to learn through alignment training.

    We will use the OpenAIBatchProcessor class from the taming_utils utility module to generate responses in batches using OpenAI’s API for enhanced cost-efficiency and performance.

    @@ -1392,7 +1401,7 @@

    -

    6.4.4.4. Generate DPO Dataset

    +

    6.4.4.4. Generate DPO Dataset

    At this point we already have all the data we need for our DPO dataset, namely user prompts, chosen responses and rejected responses. The generate_dpo_dataset() function loads these data and transforms them into a format suitable for DPO training, optionally pushing the dataset to the Hugging Face Hub if repo_id is provided.

    @@ -1510,7 +1519,7 @@

    -

    6.4.5. DPO-Based Optimization

    +

    6.4.5. DPO-Based Optimization

    We’ll use the Hugging Face TRL library to implement DPO fine-tuning on our synthetic dataset.

    Note

    @@ -1520,8 +1529,8 @@

    -

    6.4.5.1. Data Preparation

    -

    Hugging Face H4 [H4, 2024b] offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (trl-lib/ultrafeedback_binarized) [H4, 2024a].

    +

    6.4.5.1. Data Preparation

    +

    Hugging Face H4 [H4, 2024b] offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (trl-lib/ultrafeedback_binarized) [H4, 2024a].

    This dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in Fig. 6.6.

    DPO Optimization @@ -1567,7 +1576,7 @@

    -

    6.4.5.2. Fine-Tuning

    +

    6.4.5.2. Fine-Tuning

    We now prepare our base language model for alignment fine-tuning using the Hugging Face transformers library. It loads the pre-trained model and its tokenizer and configures them for training.

    @@ -1614,7 +1623,7 @@

  • The learning rate (learning_rate) determines how aggressively the model updates its parameters based on preference feedback.

  • -
  • Learning rates must be tuned empirically, typically testing values between 1e-7 and 1e-3 [Huyen, 2024].

  • +
  • Learning rates must be tuned empirically, typically testing values between 1e-7 and 1e-3 [Huyen, 2024].

  • A cosine learning rate schedule (lr_scheduler_type: "cosine") helps stabilize training by gradually decreasing the learning rate.

    1. @@ -1759,7 +1768,7 @@

      -

      6.4.5.3. Vibe Check

      +

      6.4.5.3. Vibe Check

      Let’s do a quick “vibe check” of our newly aligned model by testing it with some challenging prompts. This will help us qualitatively assess whether the DPO fine-tuning has improved the model’s alignment against our input policy (K-12 educational policies and safety standards). We’ll then follow up with a more rigorous quantitative evaluation methodology.

      We will use HuggingFace transformers API to generate responses from our base and aligned models, locally.

      @@ -1842,10 +1851,10 @@

      -

      6.4.6. Alignment Evaluation

      +

      6.4.6. Alignment Evaluation

      Evaluating alignment improvements presents unique challenges. Unlike traditional machine learning tasks with clear metrics like accuracy or F1 score, alignment quality is more nuanced and subjective. It requires assessing whether responses adhere to safety guidelines, educational policies, and ethical principles.

      The gold standard for evaluating alignment is human evaluation. Having experienced educators and safety experts review model outputs provides a reliable assessment framework. However, human evaluation is expensive, time-consuming, and difficult to scale. Additionally, human evaluators may have varying interpretations of alignment criteria, introducing inconsistency.

      -

      In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in [Souza, 2024]. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.

      +

      In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in [Souza, 2024]. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.

      The evaluation methodology summarized in Fig. 6.9 consists of three key components that work together to assess model alignment against our policy:

      1. Evaluation Dataset

        @@ -2393,29 +2402,29 @@

        -

        6.5. Discussion and Conclusions

        -

        LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.

        +

        6.5. Discussion and Conclusions

        +

        LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.

        Synthetic Data Generation

        -

        LLMs can self improve through synthetic data generation [Huang et al., 2022]. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.

        -

        One major challenge is data distribution bias, where the synthetic data might not accurately mirror the complexities and nuances of real-world data. This can lead to models trained on this data making inaccurate predictions or exhibiting biases. In our case study, we did observe duplicate responses in the synthetic data. Further, the methodology lacks a systematic approach to evaluate the quality of the synthetic data itself only focusing on evals for the consecutive fine-tuned model. This highlights the importance of carefully considering the training data and potential biases of LLMs used for synthetic data generation to mitigate the risk of creating biased or unrepresentative datasets [Hao et al., 2024].

        +

        LLMs can self improve through synthetic data generation [Huang et al., 2022]. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.

        +

        One major challenge is data distribution bias, where the synthetic data might not accurately mirror the complexities and nuances of real-world data. This can lead to models trained on this data making inaccurate predictions or exhibiting biases. In our case study, we did observe duplicate responses in the synthetic data. Further, the methodology lacks a systematic approach to evaluate the quality of the synthetic data itself only focusing on evals for the consecutive fine-tuned model. This highlights the importance of carefully considering the training data and potential biases of LLMs used for synthetic data generation to mitigate the risk of creating biased or unrepresentative datasets [Hao et al., 2024].

        Our approach does enable a systematic approach to aligning a model to an input policy. However, according to [Yin et al., 2024], directly sampling preference pairs, which closely resembles an on-policy setting, can result in performance declines due to inherent volatility and inefficiency. Therefore, constructing effective preference data to continuously improve LLMs remains a critical research problem.

        Choice of Base Model

        The choice of base model is a critical consideration when implementing alignment techniques. In the case study, we selected the smolLM model family due to its efficient architecture and reasonable performance on basic tasks while maintaining relatively low computational requirements. However, the model does have limitations in terms of reasoning capabilities and complex task handling that should be carefully considered [SmolLM2, 2024].

        Real-world applications need to carefully evaluate the trade-offs between model size/capabilities, and costs. While smaller models like smolLM can be cost-effective for basic alignment experiments, they may not provide the sophisticated reasoning needed for production use cases. The computational and financial costs of training and deploying larger models must be weighed against the required capabilities.

        -

        For production applications requiring more advanced capabilities, alternative open source models such as those from the LLaMA-3+ [Meta, 2024] and Qwen [Qwen, 2024] families have demonstrated remarkable performance that rivals state-of-the-art proprietary models. These models offer enhanced reasoning abilities and better handling of complex tasks, though at increased computational and financial cost. The choice ultimately depends on specific use case requirements, available resources, and acceptable performance thresholds.

        +

        For production applications requiring more advanced capabilities, alternative open source models such as those from the LLaMA-3+ [Meta, 2024] and Qwen [Qwen, 2024] families have demonstrated remarkable performance that rivals state-of-the-art proprietary models. These models offer enhanced reasoning abilities and better handling of complex tasks, though at increased computational and financial cost. The choice ultimately depends on specific use case requirements, available resources, and acceptable performance thresholds.

        Evaluation Methodology

        -

        The LLM-as-judge evaluation methodology is a powerful tool for assessing model alignment. However, it does have limitations [Chen et al., 2024]. For instance, the judge model may not always be able to accurately evaluate the alignment of the model, especially if the judge model is not aligned with the policy itself. Further, the judge model may be biased towards the policy, leading to overly conservative evaluations. In our case study, we do highlight the fact that our judge was simply focused on the policy-alignment aspect of the responses completely neglecting the quality of the responses themselves, i.e. while our fine-tuned model may be more aligned with the policy than the base model, we actually have no evidence that our model is helpful at all.

        +

        The LLM-as-judge evaluation methodology is a powerful tool for assessing model alignment. However, it does have limitations [Chen et al., 2024]. For instance, the judge model may not always be able to accurately evaluate the alignment of the model, especially if the judge model is not aligned with the policy itself. Further, the judge model may be biased towards the policy, leading to overly conservative evaluations. In our case study, we do highlight the fact that our judge was simply focused on the policy-alignment aspect of the responses completely neglecting the quality of the responses themselves, i.e. while our fine-tuned model may be more aligned with the policy than the base model, we actually have no evidence that our model is helpful at all.

        A more robust evaluation approach would combine LLM-based evaluation with human domain experts in a complementary process. The LLM judge could perform initial high-throughput screening of model responses, flagging potential issues and providing preliminary assessments. These results would then be reviewed by human evaluators with relevant domain expertise who can provide nuanced judgment, catch edge cases, and validate the LLM’s evaluations. Additionally, automatic evaluation against standard benchmarks is advised to evaluate general capabilities of the model.

        DPO Dataset Composition

        The composition of the DPO dataset also plays a crucial role in model behavior. In preliminary experiments, using only policy-aligned preference data led to an overly apologetic model that was hesitant to provide helpful responses even for benign queries, i.e. the model was overfitting to the policy. In fact, a model that simply refused to provide an useful response and instead apologized would indeed be aligned with the policy and therefore rewarded accordingly. This led to our decision to construct a more well balanced dataset.

        -

        Blending our policy-focused dataset with the more general-purpose UltraFeedback dataset from Hugging Face H4 [H4, 2024a] dramatically improved results by helping the model maintain helpfulness while learning appropriate safety boundaries. The results reported here reflect this balanced dataset approach.

        +

        Blending our policy-focused dataset with the more general-purpose UltraFeedback dataset from Hugging Face H4 [H4, 2024a] dramatically improved results by helping the model maintain helpfulness while learning appropriate safety boundaries. The results reported here reflect this balanced dataset approach.

        The construction of the DPO dataset is perhaps the most critical component of the alignment process. While automated approaches can help scale dataset creation, the involvement of domain experts in dataset construction is highly recommended. Domain experts bring invaluable knowledge about edge cases, nuanced policy interpretations, and real-world usage patterns that may not be captured by synthetic data generation alone. Organizations implementing alignment techniques should consider investing in domain expert involvement during dataset construction as a key success factor.

        Fine-tuning Process

        The effectiveness of DPO training can be highly sensitive to various fine-tuning hyperparameters. As we mentioned before, the batch size and the beta parameter are two key parameters that can significantly impact training stability and model behavior. A careful parameter tuning is required to achieve optimal results, which lacked in our case study.

        One important limitation of our current implementation is that we did not carefully split our user prompts between in-sample data for fine-tuning and out-of-sample data for evaluation. This means our evaluation metrics may be overly optimistic as the fine-tuned model could be memorizing prompts rather than learning generalizable alignment. Future work should implement proper train/test splits to better assess generalization performance while making sure out/in-sample distributions are similar and representative of real-world data.

        -

        6.6. Citation

        +

        6.6. Citation

        CC BY-NC-SA 4.0

        @misc{tharsistpsouza2024tamingllms,
           author = {Tharsis T. P. Souza},
        @@ -2429,7 +2438,7 @@ 

        -

        6.7. References

        +

        6.7. References

        [ABC+4a] @@ -2440,7 +2449,7 @@

        [ABC+4b]

        Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models: reviews. 2024b. URL: https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf.

        -
        +
        [BJN+22]

        Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.

        @@ -2448,15 +2457,15 @@

        [BKK+22]

        Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: https://arxiv.org/abs/2212.08073, arXiv:2212.08073.

        -
        +
        [Blo23]

        NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.

        -
        +
        [CCL+24]

        Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: https://arxiv.org/abs/2402.10669, arXiv:2402.10669.

        -
        +
        [DPHZ23]

        Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.

        @@ -2465,56 +2474,56 @@

        (1,2)

        Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: https://arxiv.org/abs/2410.06961, arXiv:2410.06961.

        -
        +
        [Fac24]

        Hugging Face. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.

        -
        +
        [Fac4c]

        Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.

        -
        +
        [Fac4d]

        Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        -
        +
        [FQH+24]

        Duanyu Feng, Bowen Qin, Chen Huang, Zheng Zhang, and Wenqiang Lei. Towards analyzing and understanding the limitations of dpo: a theoretical perspective. 2024. URL: https://arxiv.org/abs/2404.04626, arXiv:2404.04626.

        -
        +
        [H44a] (1,2)

        Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

        -
        +
        [H44b]

        Hugging Face H4. Hugging face h4. 2024b. Hugging Face H4. URL: https://huggingface.co/HuggingFaceH4.

        -
        +
        [HHJ+24]

        Shuang Hao, Wenfeng Han, Tao Jiang, Yiping Li, Haonan Wu, Chunlin Zhong, Zhangjun Zhou, and He Tang. Synthetic data in ai: challenges, applications, and ethical implications. 2024. URL: https://arxiv.org/abs/2401.01629, arXiv:2401.01629.

        -
        +
        [HLT24]

        Jiwoo Hong, Noah Lee, and James Thorne. Orpo: monolithic preference optimization without reference model. 2024. URL: https://arxiv.org/abs/2403.07691, arXiv:2403.07691.

        -
        +
        [HDN+24]

        Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: https://arxiv.org/abs/2412.06000, arXiv:2412.06000.

        -
        +
        [HSW+21]

        Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.

        -
        +
        [HGH+22]

        Jiaxin Huang, Shixiang Shane Gu, Le Hou, Yuexin Wu, Xuezhi Wang, Hongkun Yu, and Jiawei Han. Large language models can self-improve. 2022. URL: https://arxiv.org/abs/2210.11610, arXiv:2210.11610.

        -
        +
        [Huy24]

        Chip Huyen. AI Engineering. O'Reilly Media, Inc., December 2024. ISBN 9781098129095. URL: https://www.oreilly.com/library/view/ai-engineering/9781098129095/.

        -
        +
        [KSD+24]

        Joshua Kazdan, Rylan Schaeffer, Apratim Dey, Matthias Gerstgrasser, Rafael Rafailov, David L. Donoho, and Sanmi Koyejo. Collapse or thrive? perils and promises of synthetic data in a self-generating world. 2024. URL: https://arxiv.org/abs/2410.16713, arXiv:2410.16713.

        @@ -2522,33 +2531,33 @@

        [KSY+24]

        Seungone Kim, Juyoung Suk, Xiang Yue, Vijay Viswanathan, Seongyun Lee, Yizhong Wang, Kiril Gashteovski, Carolin Lawrence, Sean Welleck, and Graham Neubig. Evaluating language models as synthetic data generators. 2024. URL: https://arxiv.org/abs/2412.03679, arXiv:2412.03679.

        -
        +
        [LT24]

        AI @ Meta Llama Team. The llama 3 herd of models. 2024. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.

        -
        +
        [LWX+24]

        Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang. On llms-driven synthetic data generation, curation, and evaluation: a survey. 2024. URL: https://arxiv.org/abs/2406.15126, arXiv:2406.15126.

        -
        +
        [Met24]

        Meta. Meta-llama. 2024. Meta-Llama. URL: https://huggingface.co/meta-llama.

        -
        +
        [OWJ+22] (1,2,3,4,5,6,7)

        Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.

        -
        +
        [Qwe24]

        Qwen. Qwen. 2024. Qwen. URL: https://huggingface.co/Qwen.

        -
        +
        [RSM+24] (1,2,3,4)

        Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.

        -
        +
        [SWD+17]

        John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.

        @@ -2561,15 +2570,15 @@

        [SmolLM2360MI24]

        Hugging Face SmolLM2-360M-Instruct. Smollm2-360m-instruct. 2024. 360M parameter instruction-tuned language model, distilled for efficient deployment. URL: https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct.

        -
        +
        [Sou24]

        Tharsis T. P. Souza. Tamingllms: a framework for evaluating and aligning language models. 2024. URL: https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html.

        -
        +
        [SRvERH24]

        Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: https://arxiv.org/abs/2411.09539, arXiv:2411.09539.

        -
        +
        [TMS+23]

        Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.

        @@ -2581,7 +2590,7 @@

        [WYG+24]

        Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, and Sainbayar Sukhbaatar. Meta-rewarding language models: self-improving alignment with llm-as-a-meta-judge. 2024. URL: https://arxiv.org/abs/2407.19594, arXiv:2407.19594.

        -
        +
        [XFG+24]

        Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: https://arxiv.org/abs/2404.10719, arXiv:2404.10719.

        diff --git a/tamingllms/_build/html/notebooks/cost.html b/tamingllms/_build/html/notebooks/cost.html new file mode 100644 index 0000000..3411134 --- /dev/null +++ b/tamingllms/_build/html/notebooks/cost.html @@ -0,0 +1,621 @@ + + + + + + + 8. The Falling Cost Paradox + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + + + Taming LLMs + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        +
        + +
        +

        8. The Falling Cost Paradox

        +
        +

        It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption.
        +The very contrary is the truth.

        +

        —William Stanley Jevons

        +
        + +
        +

        8.1. Why Optimization Matters More Than Ever

        +

        According to recent analysis from a16z [Andreessen Horowitz, 2024], the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore’s Law in the PC revolution or Edholm’s Law during the bandwidth explosion of the dot-com era.

        +
        +LLMflation +
        +

        Fig. 8.1 LLMflation [Andreessen Horowitz, 2024]: The cost of LLM inference is decreasing by approximately 10x every year.

        +
        +
        +

        A model achieving an MMLU score of 42 that cost $60 per million tokens in late 2021 can now be run for just $0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4’s introduction in March 2023.

        +

        This dramatic decline stems from multiple compounding factors including:

        +
          +
        • Improved GPU efficiency through architectural advances and Moore’s Law

        • +
        • Model quantization progress, moving from 16-bit to 4-bit or lower precision

        • +
        • Software optimizations reducing compute and memory bandwidth requirements

        • +
        • Emergence of smaller yet similarly capable models

        • +
        • Better instruction tuning techniques like RLHF and DPO

        • +
        • Competition from open-source models and low-cost providers

        • +
        +

        This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn’t it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: Jevons Paradox.

        +

        The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.

        +

        This pattern has repeated throughout technological history:

        +
          +
        • Computing Power: As cost per computation plummeted, we didn’t spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers

        • +
        • Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming

        • +
        • Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models

        • +
        +

        One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we’re seeing the emergence of new applications:

        +
          +
        • Embedding AI capabilities into every application and workflow

        • +
        • Real-time analysis of audio transcripts and conversations

        • +
        • Running AI models directly on edge devices and smartphones

        • +
        • Multimodal applications combining text, images, audio and video

        • +
        +

        In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here’s why:

        +

        A) Scale Magnifies Everything. When operating at billions of tokens per day, even small inefficiencies have major effects:

        +
          +
        • A single digit improvement in efficiency can save millions of dollars annually at scale

        • +
        • Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [1]

        • +
        +

        B) Tiered Pricing Persists. While average costs are declining, the market maintains a tiered structure:

        +
          +
        • Different models offer varying price-performance tradeoffs

        • +
        • ChatGPT Pro at $200 per month breaks the price drop trend perhaps triggering a new wave of premium models

        • +
        • Cost optimization is still required to select the right model for each specific use case

        • +
        +

        C) Competition Drives Innovation. Companies that master LLM efficiency gain significant advantages:

        +
          +
        • Ability to offer more competitive pricing

        • +
        • Capacity to handle larger scale operations

        • +
        • Resources to invest in product improvement

        • +
        +

        D) Performance and Cost Are Linked. Cost optimization often yields performance benefits:

        +
          +
        • Resource efficiency enables handling larger user loads

        • +
        • More efficiency and reduced latency leads to improved user experience

        • +
        +

        In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.

        +

        Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.

        +
        +
        +

        8.2. Right-Sizing LLMs: A Strategic Approach

        +

        Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.

        +

        In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.

        +
        +

        8.2.1. Metrics

        +
        +
        +

        8.2.2. Requirements

        +
        +

        8.2.2.1. Business Requirements

        +

        First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.

        +

        Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.

        +

        Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.

        +

        Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.

        +

        Chapter Local LLMs in Practice provides a detailed discussion on relevant considerations when Choosing your Model.

        +
        +
        +

        8.2.2.2. Performance Requirements

        +

        Accuracy and quality form the foundation of any LLM deployment’s performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter The Evals Gap provides a detailed discussion on how to evaluate the performance of LLM-based applications.

        +

        Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations.

        +
        +
        +

        8.2.2.3. Operational Requirements

        +

        Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.

        +

        Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.

        +
        +
        +

        8.2.2.4. Technical Requirements

        +

        System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.

        +

        Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.

        +

        This structured approach to requirements analysis enables organizations to:

        +
          +
        1. Select appropriate models aligned with specific needs

        2. +
        3. Identify targeted optimization opportunities

        4. +
        5. Scale efficiently while controlling costs

        6. +
        7. Develop realistic resource allocation strategies

        8. +
        +

        The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.

        +
        +
        +
        +
        +

        8.3. Quantization

        +

        Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model’s parameters. The most common form of quantization is to represent model’s weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.

        +

        While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis.

        +

        Let’s take a look at model weights of a language model (SmolLM2-135M-Instruct) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function load_gguf from the taming_utils package to load model weights of the quantized models directly from Hugging Face.

        +
        +
        +
        from taming_utils import load_gguf
        +
        +MODEL_NAME = "bartowski/SmolLM2-135M-Instruct-GGUF"
        +GGUF_FILE_Q2_K = "SmolLM2-135M-Instruct-Q2_K.gguf"
        +GGUF_FILE_F16 = "SmolLM2-135M-Instruct-F16.gguf"
        +
        +model_q2_k = load_gguf(model_name=MODEL_NAME, 
        +              gguf_file=GGUF_FILE_Q2_K)
        +
        +model_f16 = load_gguf(model_name=MODEL_NAME, 
        +              gguf_file=GGUF_FILE_F16)
        +
        +
        +
        +
        +

        We extract the MLP weights from the first layer of the model as a proxy.

        +
        +
        +
        mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight
        +mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight
        +
        +
        +
        +
        +

        Original weights at 16-bit precision:

        +
        +
        +
        mlp_weights_f16
        +
        +
        +
        +
        +
        Parameter containing:
        +tensor([[-0.0145,  0.1826,  0.1377,  ...,  0.1719, -0.1387, -0.0298],
        +        [-0.1631,  0.0781, -0.2051,  ..., -0.2070, -0.0334,  0.2891],
        +        [-0.1768, -0.0488, -0.2393,  ..., -0.0396, -0.1348, -0.1533],
        +        ...,
        +        [ 0.0771,  0.0845, -0.0232,  ...,  0.0178, -0.1040, -0.0771],
        +        [ 0.1582,  0.1167, -0.0474,  ...,  0.0845,  0.0359, -0.2500],
        +        [ 0.0432,  0.0972,  0.0933,  ...,  0.2188,  0.0776,  0.0674]],
        +       requires_grad=True)
        +
        +
        +
        +
        +

        Quantized weights at 2-bit precision:

        +
        +
        +
        mlp_weights_q2_k
        +
        +
        +
        +
        +
        Parameter containing:
        +tensor([[-0.0028,  0.1852,  0.1396,  ...,  0.1506, -0.1635, -0.0043],
        +        [-0.1768,  0.0680, -0.2257,  ..., -0.1890, -0.0464,  0.2960],
        +        [-0.1840, -0.0451, -0.2395,  ..., -0.0413, -0.1446, -0.1446],
        +        ...,
        +        [ 0.0621,  0.0621, -0.0478,  ...,  0.0038, -0.0830, -0.0830],
        +        [ 0.1473,  0.0926, -0.0547,  ...,  0.0824,  0.0429, -0.2737],
        +        [ 0.0355,  0.0782,  0.0782,  ...,  0.2043,  0.0740,  0.0740]],
        +       requires_grad=True)
        +
        +
        +
        +
        +

        How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights.

        +
        +
        +
        import numpy as np
        +
        +# Convert tensors to numpy arrays (detach from computation graph if needed)
        +weights_f16 = mlp_weights_f16.detach().cpu().numpy()
        +weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()
        +
        +flat_f16 = weights_f16.flatten()
        +flat_q2_k = weights_q2_k.flatten()
        +
        +# Calculate correlation
        +correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]
        +print(f"Pearson correlation: {correlation:.4f}")
        +
        +
        +
        +
        +
        Pearson correlation: 0.9970
        +
        +
        +
        +
        +

        Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by [Unsloth, 2024] [2]. The model’s memory requirements vary significantly based on the quantization level used as demonstrated in Fig. 8.2.

        +
        +Quantized Model Size +
        +

        Fig. 8.2 Quantized Model Size: unsloth/Llama-3.3-70B-Instruct-GGUF

        +
        +
        +

        We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [3].

        +

        This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.

        +

        While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet [Wang et al., 2024] which pushes the boundaries of extreme quantization.

        +

        BitNet’s implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see Fig. 8.3). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet’s optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).

        +
        +BitNet +
        +

        Fig. 8.3 BitNet: [Wang et al., 2024]

        +
        +
        +

        The framework’s initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.

        +

        As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment.

        +

        Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.

        +

        Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter Local LLMs in Practice for more details.

        +
        +
        +

        8.4. Check-list

        +

        Planning and Requirements

        +
          +
        • Start with a clear understanding of your application’s needs and the factors that contribute to LLM costs

        • +
        • Choose the right model for your task, balancing performance and cost

        • +
        • Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them

        • +
        +

        Model Optimization

        +
          +
        • Explore model compression and quantization to reduce model size and computational demands

        • +
        • Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency

        • +
        • Consider using RAG to enhance performance and reduce reliance on purely generative processes

        • +
        +

        Prompt Engineering

        +
          +
        • Optimize prompts and utilize prompt engineering techniques to minimize token usage

        • +
        • Experiment with different prompting strategies to unlock the full potential of open-source LLMs

        • +
        +

        Infrastructure and Operations

        +
          +
        • Implement caching and batching strategies to optimize resource utilization

        • +
        • Monitor LLM usage patterns and costs to identify areas for optimization

        • +
        • Set up observability and logging to track model performance and costs

        • +
        • Establish automated testing and evaluation pipelines

        • +
        +

        Cost Management

        +
          +
        • Track and analyze inference costs across different model variants

        • +
        • Implement cost allocation and chargeback mechanisms

        • +
        • Set up cost alerts and budgeting controls

        • +
        • Regularly review and optimize resource utilization

        • +
        +
        +
        +

        8.5. Conclusion

        +

        CC BY-NC-SA 4.0

        +
        @misc{tharsistpsouza2024tamingllms,
        +  author = {Tharsis T. P. Souza},
        +  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},
        +  year = {2024},
        +  chapter = {The Falling Cost Paradox},
        +  journal = {GitHub repository},
        +  url = {https://github.com/souzatharsis/tamingLLMs)
        +}
        +
        +
        +
        +
        +

        8.6. References

        +
        +
        +[WZS+24] +(1,2) +

        Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.

        +
        +
        +[AndreessenHorowitz24] +(1,2) +

        Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.

        +
        +
        +[HuggingFace4w] +

        Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.

        +
        +
        +[Unsloth24] +

        Unsloth. Llama-3.3-70b-instruct-gguf. Hugging Face Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF.

        +
        +
        +
        +
        + +
        + + + + + +
      + + +

    + + + + \ No newline at end of file diff --git a/tamingllms/_build/html/notebooks/evals.html b/tamingllms/_build/html/notebooks/evals.html index 854b1ba..585507d 100644 --- a/tamingllms/_build/html/notebooks/evals.html +++ b/tamingllms/_build/html/notebooks/evals.html @@ -212,6 +212,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + +
    @@ -244,7 +253,7 @@
    -

    3. The Evals Gap

    +

    3. The Evals Gap

    It doesn’t matter how beautiful your theory is,
    it doesn’t matter how smart you are.
    @@ -254,49 +263,49 @@

    -

    3.1. Introduction

    +

    3.1. Introduction

    The advent of LLMs marks a pivotal shift in the landscape of software development and evaluation. Unlike traditional software systems, where deterministic outputs are the norm, LLMs introduce a realm of non-deterministic and generative behaviors that challenge conventional software engineering testing paradigms. This shift is not merely a technical evolution but a fundamental transformation in how we conceive, build, and assess software products.

    For those entrenched in traditional methodologies, the transition to LLM-driven systems may seem daunting. However, ignoring this change is not an option. The reliance on outdated testing frameworks that fail to account for the probabilistic nature of LLMs will inevitably lead to significant setbacks.

    To overcome these challenges, it is imperative to embrace the complexities of LLMs with a proactive mindset. This involves developing robust evaluation frameworks up-front, fostering a product development culture of continuous change, learning and adaptation.

    -

    3.2. Non-Deterministic Generative Machines

    +

    3.2. Non-Deterministic Generative Machines

    One of the most fundamental challenges when building products with Large Language Models (LLMs) is their generative and non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate novel text that may not exist in their training data, and produce different responses each time they’re queried - even with identical prompts and input data. This behavior is both a strength and a significant engineering challenge and product challenge.

    When you ask an LLM the same question multiple times, you’ll likely get different responses. This isn’t a bug - it’s a fundamental feature of how these models work. The “temperature” parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it difficult to build reliable, testable systems.

    Consider a financial services company using LLMs to generate investment advice. The non-deterministic nature of these models means that:

    @@ -431,7 +440,7 @@

    -

    3.3. Emerging Properties

    +

    3.3. Emerging Properties

    Beyond their non-deterministic nature, LLMs present another fascinating characteristic: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren’t explicitly programmed but rather emerge “naturally” as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against pre-defined specifications.

    Fig. 3.1 provides a list of emergent abilities of large language models and the scale. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested.

    @@ -443,7 +452,7 @@

    -

    3.4. Problem Statement

    +

    3.4. Problem Statement

    Consider a practical example that illustrates these challenges: building a Math AI tutoring system for children powered by an LLM. In traditional software development, you would define specific features (like presenting math problems or checking answers) and write tests to verify each function. But with LLMs, you’re not just testing predefined features - you’re trying to evaluate emergent capabilities like adapting explanations to a child’s level, maintaining engagement through conversational learning, and providing age-appropriate safety-bound content.

    This fundamental difference raises critical questions about evaluation:

      @@ -493,7 +502,7 @@

      -

      3.5. Evals Design

      +

      3.5. Evals Design

      First, it’s important to make a distinction between evaluating an LLM versus evaluating an LLM-based application. While the latter offers foundation capabilities and are typically general-purpose, the former is more specific and tailored to a particular use case. Here, we define an LLM-based application as a system that uses one or more LLMs to perform a specific task. More specifically, an LLM-based application is the combination of one or more LLM models, their associated prompts and parameters to solve a particular business problem.

      That differentiation is important because it changes the scope of evaluation. LLMs are usually evaluated based on their capabilities, which include things like language understanding, reasoning and knowledge. LLM-based applications, instead, should be evaluated based on their end-to-end functionality, performance, and how well they meet business requirements. That distinction has key implications for the design of evaluation systems:

    -

    3.7. Evaluators

    +

    3.7. Evaluators

    -

    3.7.1. Model-Based Evaluation

    +

    3.7.1. Model-Based Evaluation

    Traditional metrics like BLEU or ROUGE often fall short in capturing the nuanced, contextual, and creative outputs of LLMs. As an alternative we can consider a “Model-based evaluation” approach. A common approach is to use an LLM as a judge. This is an approach that leverages language models themselves to assess the quality of outputs from other language models. This method involves using a model (often a more capable one) to act as an automated judge, evaluating aspects like accuracy, coherence, and relevance of generated content. Unlike traditional metrics that rely on exact matching or statistical measures, model-based evaluation can capture nuanced aspects of language and provide more contextual assessment.

    As discussed in the paper [Li et al., 2024], LLM-based evaluation approaches generally fall into two main categories:

      @@ -1300,7 +1309,7 @@

      [Deshpande et al., 2024], a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show 91.3% agreement with human judgments, making it suitable for a diverse range of real world applications.

    -

    3.7.2. Evaluating Evaluators

    +

    3.7.2. Evaluating Evaluators

    We have discussed how LLMs can be used to evaluate LLM-based aplications. However, how can we evaluate the performance of LLMs that evaluate other LLMs? This is the question that meta evaluation aims to answer. Clearly, the discussion can become quite meta as we need to evaluate the performance of the evaluator to evaluate the performance of the evaluated model. However, one can make a case for two general options:

    1. Use a gold-standard dataset that is used to evaluate the performance of LLM evaluators using a “metrics-based” approach.

    2. @@ -1344,7 +1353,7 @@

      -

      3.8. Benchmarks and Leaderboards

      +

      3.8. Benchmarks and Leaderboards

      Benchmarks act as standardized tests for LLMs, evaluating their performance across a spectrum of tasks. These tasks simulate real-world applications such as answering questions, generating coherent text, solving mathematical problems, or even writing computer code. They also assess more abstract qualities like fairness, robustness, and cultural understanding.

      Benchmarks can be thought as comprehensive “exams” that probe different “subjects” in order to certify an LLM. They help researchers and developers compare models systematically, in a way LLM performance is comparable while enabling the identification of emergent behaviors or capabilities as models evolve in scale and sophistication.

      The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. It began in 2018 with the introduction of GLUE(General Language Understanding Evaluation) [Wang et al., 2019], which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. A year later, SuperGLUE [Wang et al., 2019] expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors.

      @@ -1354,7 +1363,7 @@

      [Face, 2024] Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals.

      The Chatbot Arena (2024) Leaderboard (an evolution of LMSYS)[Chiang et al., 2024] takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered over 200,000 human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots.

      The AlpacaEval [Dubois et al., 2024] and MT-Bench [Zheng et al., 2023] Leaderboards implement automated evaluation using GPT-4 to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges.

      -

      An important recent development was the release of Global-MMLU [Singh et al., 2024], an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Over 200 contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts.

      +

      An important recent development was the release of Global-MMLU [Singh et al., 2024], an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Over 200 contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts.

      A major challenge with these leaderboards and benchmarks is test set contamination - when test data ends up in newer models’ training sets, rendering the benchmarks ineffective. While some benchmarks try to address this through crowdsourced prompts and evaluations from humans or LLMs, these approaches introduce their own biases and struggle with difficult questions. LiveBench [White et al., 2024] represents a novel solution, designed specifically to be resilient to both contamination and evaluation biases. As the first benchmark with continuously updated questions from recent sources, automated objective scoring, and diverse challenging tasks across multiple domains, LiveBench maintains its effectiveness even as models improve. Drawing from recent math competitions, research papers, news, and datasets, it creates contamination-free versions of established benchmark tasks. Current results show even top models achieving below 70% accuracy, demonstrating LiveBench’s ability to meaningfully differentiate model capabilities. With monthly updates and an open collaborative approach, LiveBench aims to provide sustained value for model evaluation as the field advances.

      Another notable benchmark is ZebraLogic [Lin et al., 2024], which evaluates logical reasoning capabilities of LLMs through Logic Grid Puzzles - a type of Constraint Satisfaction Problem [Brailsford et al., 1999] commonly found in tests like the LSAT. These puzzles require assigning unique values to N houses across M different features based on given clues, demanding strategic reasoning and deduction to arrive at a unique correct solution. The benchmark’s programmatically generated puzzles range from 2x2 to 6x6 in size and test LLMs using one-shot examples with reasoning steps. While humans can solve these puzzles through strategic methods like reductio ad absurdum and elimination, LLMs demonstrate significant limitations in this type of logical reasoning. Even the best-performing model, Claude 3.5 Sonnet, only achieves 33.4% accuracy across all puzzles and 12.4% on hard puzzles, with smaller models (7-10B parameters) solving less than 1% of hard puzzles as of December 2024. These results reveal critical gaps in LLMs’ capabilities around counterfactual thinking, reflective reasoning, structured memorization, and compositional generalization.

      A significant shift in AI evaluation came with the launch of the The Alignment Research Center (ARC) Prize [Chollet, 2024] by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of ARC-AGI, Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls “cognitive sufficiency” - a model’s ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge as we seek to define and measure what it means to achieve AGI (Artificial General Intelligence).

      @@ -1389,16 +1398,16 @@

      [Chollet, 12/08/2024]. While deep learning has significantly advanced in recent years, pure deep learning approaches perform poorly on the ARC-AGI benchmark. This is because traditional deep learning relies on relating new situations to those encountered during training and lacks the ability to adapt or recombine knowledge for entirely new tasks. ARC Prize 2024 spurred the development of novel AGI reasoning techniques, leading to a significant increase in the state-of-the-art score on the ARC-AGI private evaluation set from 33% in 2023 to 55.5% in 2024. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark.

      In addition to the benchmarks discussed above, a growing set of domain-specific benchmarks is emerging to help evaluate LLMs in specific verticals, including:

        -
      • FinBench [Zhang et al., 2024]: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.

      • -
      • LegalBench [Guha et al., 2023] : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals

      • -
      • Berkeley Function Leaderboard (BFCL) [Patil et al., 2023]: Evaluates LLMs’ function-calling abilities

      • +
      • FinBench [Zhang et al., 2024]: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.

      • +
      • LegalBench [Guha et al., 2023] : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals

      • +
      • Berkeley Function Leaderboard (BFCL) [Patil et al., 2023]: Evaluates LLMs’ function-calling abilities

      As language models continue to advance in capability and complexity, evaluation frameworks must evolve. Modern benchmarks increasingly incorporate tests for nuanced reasoning, ethical decision-making, and emergent capabilities that weren’t previously measurable. This ongoing evolution reflects a deeper understanding that the true value of language models lies not in achieving high scores on standardized tests with narrow task-specific metrics, but in their ability to meaningfully contribute to human understanding and help solve real-world problems while demonstrating the ability to learn and adapt to new tasks.

    -

    3.9. Tools

    +

    3.9. Tools

    -

    3.9.1. LightEval

    +

    3.9.1. LightEval

    LightEval [Fourrier et al., 2023] is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI.

    As a motivating example, consider a scenario where financial data has been extracted from SEC financial filings and require econometric analysis. Tasks like estimating autoregressive models for time series forecasting or conducting hypothesis tests on market efficiency are common in financial analysis. Let’s evaluate how well different models perform on this type of task.

    First, we need to select a benchmark to assess LLMs capabilities in this domain. MMLU has a sub-benchmark called Econometrics we can use for this task. Table 3.4 shows a sample of the benchmark dataset from MMLU Econometrics. It consists of multiple-choice questions from econometrics and expected answers.

    @@ -1587,7 +1596,7 @@

    [Hugging Face, 2024]. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the official repository [Fourrier et al., 2023].

    -

    3.9.2. LangSmith

    +

    3.9.2. LangSmith

    Let’s revisit our evaluation example when we were interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a benchmark model (larger and more expensive). Recal the setup:

    • Benchmark model: gpt-4o

    • @@ -1995,7 +2004,7 @@

      -

      3.9.3. PromptFoo

      +

      3.9.3. PromptFoo

      Promptfoo [promptfoo, 2024] is an open-source framework designed for evaluating applications that utilize large language models (LLMs). Key features include:

      1. Automated Testing: Promptfoo provides automated testing capabilities, allowing developers to run custom evaluations tailored to their applications.

      2. @@ -2260,7 +2269,7 @@

        Prompt Comparison R

        In conclusion, Promptfoo can serve as an effective LLM application evaluation tool particularly for its ability to decouple several components of the evaluation process. Hence enabling the user to focus on the most important aspects of the evaluation given the particular application and criteria making it a valuable and flexible tool for LLM application development.

    -

    3.9.4. Comparison

    +

    3.9.4. Comparison

    The following table provides a summarized comparative analysis of three open source frameworks for language models evaluation we have discussed: Lighteval, LangSmith, and Promptfoo. Each framework is assessed based on key features such as integration capabilities, customization options, ease of use, and the ability to facilitate human and LLM collaboration.

    @@ -2297,13 +2306,13 @@

    -

    3.10. Conclusion

    +

    3.10. Conclusion

    Language models have fundamentally transformed how software is developed and evaluated. Unlike conventional systems that produce predictable outputs, LLMs generate varied, probabilistic responses that defy traditional testing approaches. While developers accustomed to deterministic systems may find this shift challenging, continuing to rely on legacy testing methods is unsustainable. These frameworks were not designed to handle the inherent variability of LLM outputs and will ultimately prove inadequate.

    Success requires embracing this new paradigm by implementing comprehensive evaluation strategies early - this is the new Product Requirements Document (PRD) - and cultivating an organizational mindset focused on iteration, experimentation and growth.

    The shift from traditional software testing to LLM evaluation is not just a change in tools but a transformation in mindset. Those who recognize and adapt to this shift will lead the way in harnessing the power of LLMs. However, the cost of inaction is not just technological stagnation, but potential business failure.

    -

    3.11. Citation

    +

    3.11. Citation

    CC BY-NC-SA 4.0

    @misc{tharsistpsouza2024tamingllms,
       author = {Tharsis T. P. Souza},
    @@ -2317,7 +2326,7 @@ 

    -

    3.12. References

    +

    3.12. References

    [ALB+24] @@ -2384,7 +2393,7 @@

    (1,2)

    Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: https://github.com/huggingface/lighteval.

    -
    +
    [GNH+23]

    Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: https://arxiv.org/abs/2308.11462, arXiv:2308.11462.

    @@ -2417,7 +2426,7 @@

    [LHE22]

    Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.

    -
    +
    [PZWG23]

    Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. arXiv preprint arXiv:2305.15334, 2023.

    @@ -2429,11 +2438,11 @@

    [Ras24]

    Sebastian Raschka. Build A Large Language Model (From Scratch). Manning, 2024. ISBN 978-1633437166. URL: https://www.manning.com/books/build-a-large-language-model-from-scratch.

    -
    +
    [SLL+24]

    Bhaskarjit Sarmah, Mingshu Li, Jingrao Lyu, Sebastian Frank, Nathalia Castellanos, Stefano Pasquali, and Dhagash Mehta. How to choose a threshold for an evaluation metric for large language models. 2024. URL: https://arxiv.org/abs/2412.12148, arXiv:2412.12148.

    -
    +
    [SRF+24]

    Shivalika Singh, Angelika Romanou, Clémentine Fourrier, David I. Adelani, Jian Gang Ngui, Daniel Vila-Suero, Peerat Limkonchotiwat, Kelly Marchisio, Wei Qi Leong, Yosephine Susanto, Raymond Ng, Shayne Longpre, Wei-Yin Ko, Madeline Smith, Antoine Bosselut, Alice Oh, Andre F. T. Martins, Leshem Choshen, Daphne Ippolito, Enzo Ferrante, Marzieh Fadaee, Beyza Ermis, and Sara Hooker. Global mmlu: understanding and addressing cultural and linguistic biases in multilingual evaluation. 2024. URL: https://arxiv.org/abs/2412.03304, arXiv:2412.03304.

    @@ -2461,7 +2470,7 @@

    [YYH+24]

    An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zhihao Fan. Qwen2 technical report. arXiv preprint arXiv:2407.10671, 2024.

    -
    +
    [ZCL24]

    Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: https://openreview.net/forum?id=AeGrf1uY0p.

    diff --git a/tamingllms/_build/html/notebooks/local.html b/tamingllms/_build/html/notebooks/local.html index 0263f16..e0df261 100644 --- a/tamingllms/_build/html/notebooks/local.html +++ b/tamingllms/_build/html/notebooks/local.html @@ -39,6 +39,7 @@ + @@ -186,7 +187,7 @@
  • Introduction
  • -
  • Models Considerations
  • +
  • Choosing your Model
  • Tools for Local LLM Deployment
  • @@ -194,8 +195,6 @@
  • Conclusion
  • -
  • Citation
  • -
  • References
  • @@ -203,6 +202,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + +
    @@ -224,6 +232,10 @@ 6. Preference-Based Alignment +
    @@ -231,7 +243,7 @@
    -

    7. Local LLMs in Practice

    +

    7. Local LLMs in Practice

    Freedom is something that dies unless it’s used.

    —Hunter S. Thompson

    @@ -239,56 +251,55 @@

    Contents

    -

    7.1. Introduction

    +

    7.1. Introduction

    Running Open Source LLMs locally versus depending on proprietary cloud-based models represents more than just a technical choice - it’s a fundamental re-imagining of how we interact with AI technology, putting control back in the hands of users.

    Privacy concerns are a key driver for running LLMs locally. Individual users may want to process personal documents, photos, emails, and chat messages without sharing sensitive data with third parties. For enterprise use cases, organizations handling medical records must comply with HIPAA regulations that require data to remain on-premise. Similarly, businesses processing confidential documents and intellectual property, as well as organizations subject to GDPR and other privacy regulations, need to maintain strict control over their data processing pipeline.

    Cost considerations are another key driver. Organizations and individual consumers can better control expenses by matching model capabilities to their specific needs rather than paying for multiple cloud API subscriptions. For organizations with high-volume applications, this customization and control over costs becomes especially valuable compared to the often prohibitive per-request pricing of cloud solutions. For consumers, running multiple open source models locally eliminates the need to maintain separate subscriptions to access different model capabilities.

    @@ -297,8 +308,8 @@

    -

    7.2. Models Considerations

    +
    +

    7.2. Choosing your Model

    The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness.

    It is important to observe long-term strategic considerations when choosing a model. These entails prioritization dimensions that may enable competitive advantage in the long-term, including:

      @@ -310,7 +321,7 @@

      -

      7.2.1. Task Suitability

      +

      7.2.1. Task Suitability

      When evaluating an open source LLM, task suitability is a critical first consideration. A model that performs well on general benchmarks may struggle with specific domain tasks. Understanding the intended use case helps narrow down model options based on their demonstrated strengths.

      Task Categories

      When determining which LLM task to prioritize, carefully consider your specific use case and end-user needs. Different applications require distinct model capabilities and optimizations. Common LLM Task Categories include:

      @@ -344,8 +355,8 @@

      Fig. 7.2 Model Types.

      -

      The Llama 2 model family [Touvron et al., 2023] illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.

      -

      Benchmark results [Meta AI, 2024c] in Table 7.1 highlight the impact of model specialization. On the TruthfulQA [Lin et al., 2022] and Toxigen [Alnajjar and others, 2024] benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.

      +

      The Llama 2 model family [Touvron et al., 2023] illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.

      +

      Benchmark results [Meta AI, 2024c] in Table 7.1 highlight the impact of model specialization. On the TruthfulQA [Lin et al., 2022] and Toxigen [Alnajjar and others, 2024] benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.

    Table 3.6 Comparison of Lighteval, LangSmith, and Promptfoo
    @@ -400,9 +411,9 @@

    -

    7.2.2. Performance & Cost

    +

    7.2.2. Performance & Cost

    General benchmarks are useful for comparing models across different standard tasks. Open Source models are becoming more competitive with proprietary models with LLama, Qwen and Mistral model families being some of the most powerful open source models available today.

    -

    Qwen model family [Qwen et al., 2024] emerged in 2024 as a model family achieving competitive performance with smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in Fig. 7.3.

    +

    Qwen model family [Qwen et al., 2024] emerged in 2024 as a model family achieving competitive performance with smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in Fig. 7.3.

    Qwen Performance
    @@ -468,7 +479,7 @@

    -

    7.2.3. Licensing

    +

    7.2.3. Licensing

    When evaluating open-source LLMs, it’s important to consider licensing and data usage policies. Some models may require attribution or commercial use licenses, while others may be more permissive. Additionally, ensure that the model’s training data is compatible with your intended use case and complies with relevant data protection laws.

    The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. Table 7.2 provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:

      @@ -524,14 +535,14 @@

      Review, 2024] serves as a pivotal example, where the Times claims its copyrighted materials were used without authorization to train language models. This litigation has far-reaching consequences for developers building LLM-powered applications. Should courts rule in favor of copyright holders, model providers may need to withdraw and retrain models containing protected content. These legal uncertainties introduce substantial complexity into LLM implementation strategies, demanding careful consideration during project planning phases.

      -

      Recent LLM releases demonstrate varying levels of data transparency. For instance, Qwen2.5’s approach [Qwen et al., 2024] illustrates common industry practices in both its achievements and limitations. On the training data scale front, Qwen2.5 does provide some transparency by discussing some training data methodology compared to previous versions such as expanding from 7 trillion to 18 trillion tokens, while implementing sophisticated quality filtering and carefully balancing domain representation through sampling adjustments.

      +

      Recent LLM releases demonstrate varying levels of data transparency. For instance, Qwen2.5’s approach [Qwen et al., 2024] illustrates common industry practices in both its achievements and limitations. On the training data scale front, Qwen2.5 does provide some transparency by discussing some training data methodology compared to previous versions such as expanding from 7 trillion to 18 trillion tokens, while implementing sophisticated quality filtering and carefully balancing domain representation through sampling adjustments.

      However, like many commercial LLMs, Qwen2.5 exhibits transparency limitations. The report provides incomplete disclosure of data sources and limited information about the proportions of different data types used in training. The preprocessing methodologies remain unclear, and there is minimal discussion of potential biases that may exist in the training data.

      -

      Similarly, in the Llama 3 paper [AI, 2024c], Meta AI does share some details about the pre-training corpus stating simply stating that it was around 15T multilingual tokens, compared to 1.8T tokens for Llama 2. The exact sources of data used for pre-training and post-training are not explicitly listed.

      +

      Similarly, in the Llama 3 paper [AI, 2024c], Meta AI does share some details about the pre-training corpus stating simply stating that it was around 15T multilingual tokens, compared to 1.8T tokens for Llama 2. The exact sources of data used for pre-training and post-training are not explicitly listed.

      These gaps in transparency reflect a broader industry challenge in balancing commercial interests with the need for openness and scientific reproducibility.

      A significant advancement in open-source language model training data is HuggingFace’s release of the FineWeb datasets. In its first release [Penedo et al., 2024], FineWeb is made of a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. Additionally, data curation codebase and all of the models trained during our ablation experiments are made available. FineWeb is a fine example of an initiative that helps minimize the gap between proprietary and public knowledge.

      -

      7.2.4. Community Support

      +

      7.2.4. Community Support

      Community support plays a vital role in the open-source LLM ecosystem. Active communities contribute to model development, provide technical assistance, and share valuable resources. When evaluating open-source LLMs, the strength and engagement of the community should be a key consideration, as it directly impacts the model’s long-term viability and practical utility.

      The popularity of different model families reflects their community adoption. In 2024, the Qwen and Llama families have emerged as clear favorites, with Qwen2.5-1.5B-Instruct alone representing 35% of total open source models downloads in 2024.

      @@ -540,13 +551,13 @@

      Fig. 7.8 Hugging Face Model Downloads in 2024 as of December 22 of the same year [Face, 2024t].

    -

    Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations [Qwen, 2024b].

    +

    Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations [Qwen, 2024b].

    -

    7.2.5. Customization

    +

    7.2.5. Customization

    Model customization is an important consideration when selecting an open-source LLM. Adapting and fine-tuning to specific use cases can significantly impact practical utility and performance in production environments.

    Model providers increasingly offer streamlined fine-tuning services. For example, Mistral demonstrates an accessible approach to model customization. -The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset [Face, 2024u]. This API design makes it easy to experiment with model customization while maintaining control over the training process.

    +The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset [Face, 2024u]. This API design makes it easy to experiment with model customization while maintaining control over the training process.

    # create a fine-tuning job
     created_jobs = client.fine_tuning.jobs.create(
         model="open-mistral-7b", 
    @@ -565,7 +576,7 @@ 

    created_jobs

    -

    For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports [Face, 2024d]:

    +

    For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports [Face, 2024d]:

    • Supervised Fine-Tuning (SFT)

    • Reward Modeling (RM)

    • @@ -611,7 +622,7 @@

      [Face, 2024v, Zhao et al., 2024]. A noteworthy example is Hugging Face’s SmolLM2 [Allal et al., 2024], a family of compact language models designed with several key advantages:

      +

      Small language models can serve as a lightweight alternative to customization compared to large models. Recent research has shown that smaller models can achieve competitive performance compared to larger models [Face, 2024v, Zhao et al., 2024]. A noteworthy example is Hugging Face’s SmolLM2 [Allal et al., 2024], a family of compact language models designed with several key advantages:

      1. Compact Sizes:

      @@ -641,10 +652,10 @@

      -

      7.3. Tools for Local LLM Deployment

      +

      7.3. Tools for Local LLM Deployment

      Local LLM deployment tools generally fall into two categories: inference-focused tools that prioritize performance and programmability for technical users requiring production-grade deployments, and user interface (UI) tools that emphasize accessibility through graphical interfaces for non-technical users, trading some performance for ease of use and broader adoption. In the following sections we will explore some of these tools discussing their features, capabilities, and trade-offs.

      -

      7.3.1. Serving Models

      +

      7.3.1. Serving Models

      Serving an LLM model involves making it available for inference by setting up infrastructure to process requests and manage resources efficiently. This serving layer handles several key responsibilities, from loading model weights and managing compute resources to processing requests and optimizing performance. Let’s examine the core components of model serving:

      1. Model Loading and Initialization

      2. @@ -697,8 +708,8 @@

        -

        7.3.1.1. LLama.cpp

        -

        LLama.cpp [Gerganov and contributors, 2024a] is an MIT-licensed open source optimized implementation of the LLama model architecture designed to run efficiently on machines with limited memory.

        +

        7.3.1.1. LLama.cpp

        +

        LLama.cpp [Gerganov and contributors, 2024a] is an MIT-licensed open source optimized implementation of the LLama model architecture designed to run efficiently on machines with limited memory.

        Originally developed by Georgi Gerganov and today counting with hundreds of contributors, this C/C++ LLama version provides a simplified interface and advanced features that allow language models to run locally without overwhelming systems. With the ability to run in resource-constrained environments, LLama.cpp makes powerful language models more accessible and practical for a variety of applications.

        In its “Manifesto” [Gerganov and others, 2023], the author highlights the significant potential in bringing AI from cloud to edge devices, emphasizing the importance of keeping development lightweight, experimental, and enjoyable rather than getting bogged down in complex engineering challenges. The author states a vision that emphasizes maintaining an exploratory, hacker-minded approach while building practical edge computing solutions highlighting the following core principles:

          @@ -715,7 +726,7 @@

          [Gerganov and contributors, 2024b] is the latest model format used by LLama.cpp, replacing the older GGML format. It was designed specifically for efficient inference of large language models on consumer hardware. The key features that make GGUF particularly valuable include [IBM Think, 2024]:

          +

          GGUF (GPT-Generated Unified Format) [Gerganov and contributors, 2024b] is the latest model format used by LLama.cpp, replacing the older GGML format. It was designed specifically for efficient inference of large language models on consumer hardware. The key features that make GGUF particularly valuable include [IBM Think, 2024]:

          • Improved quantization: GGUF supports multiple quantization levels to reduce model size while preserving performance. Common quantization schemes that are supported by GGUF include:

              @@ -729,9 +740,9 @@

              [Hugging Face, 2024x] and provides a tool (ggml-org/gguf-my-repo) to convert existing models to GGUF format, making it easier for developers to access and deploy optimized versions of popular language models.

              +

              These capabilities make GGUF models significantly more practical for running LLMs locally compared to full-precision formats, often dramatically reducing memory requirements. Hugging Face hosts a growing collection of pre-converted GGUF models [Hugging Face, 2024x] and provides a tool (ggml-org/gguf-my-repo) to convert existing models to GGUF format, making it easier for developers to access and deploy optimized versions of popular language models.

              Setup

              -

              Please follow the instructions from the LLama.cpp GitHub repository [Gerganov and contributors, 2024a] to install and compile the library.

              +

              Please follow the instructions from the LLama.cpp GitHub repository [Gerganov and contributors, 2024a] to install and compile the library.

              Here, we will compile the library from source on a Linux machine with 8 jobs in parallel for enhanced performance (add the -j argument to run multiple jobs in parallel).

              sudo apt install cmake
               
              @@ -830,7 +841,7 @@ 

              [Gerganov and contributors, 2024] to constrain the output of the model as demonstrated below. This is the same technique Ollama uses, a similar approach to Outlines’ to generate structured outputs from LLMs. See Chapter Structured Output for more details.

              +

              It is worth noting Llama.cpp provides a way to use grammars [Gerganov and contributors, 2024] to constrain the output of the model as demonstrated below. This is the same technique Ollama uses, a similar approach to Outlines’ to generate structured outputs from LLMs. See Chapter Structured Output for more details.

              ./build/bin/llama-cli -m ./models/qwen2.5-0.5b-instruct-q8_0.gguf --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
               
               # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
              @@ -892,7 +903,7 @@ 

              -

              7.3.1.2. Llamafile

              +

              7.3.1.2. Llamafile

              Developed by Occupy Wall Street’s former activist, Justine Tunney, Llamafile [Mozilla Ocho, 2024] is an Appache 2.0 licensed open source tool that combines the power of LLama.cpp with Cosmopolitan Libc, a universal C standard library that allows creating portable executables compatible with multiple operating systems.

              In this way, Llamafile reduces all the complexity of LLMs to a single executable file (called a “llamafile”) that runs locally without installation. Key advantages of Llamafile over plain Llama.cpp include:

                @@ -937,7 +948,7 @@

                http://localhost:8080. And we can use it as demonstrated in the previous section.

      -

      7.3.1.3. Ollama

      +

      7.3.1.3. Ollama

      Ollama is a lightweight, MIT-licensed open-source tool for running LLMs locally. It provides a simple interface for interacting with a wide range of language models, including popular models like Llama 3.1 and Llama 3.2. Ollama is designed to be easy to install and use, making it a popular choice for developers who want to run LLMs locally without the need for extensive setup or configuration. Ollama’s key advantages include:

      1. Model Management

      2. @@ -1031,7 +1042,7 @@

        -

        7.3.1.4. Comparison

        +

        7.3.1.4. Comparison

        Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in Table 7.4.

    Table 7.1 Benchmark results for Llama 2 family of models.
    @@ -1087,10 +1098,10 @@

    -

    7.3.2. UI

    +

    7.3.2. UI

    There is a growing number of UI tools for local LLM deployment that aim at providing a more user-friendly experience. Ranging from closed-source to open-source solutions across a range of features and capabilities. We will discuss LM Studio, Jan, and OpenWebUI.

    -

    7.3.2.1. LM Studio

    +

    7.3.2.1. LM Studio

    LM Studio [LM Studio, 2024] is a closed-source GUI for running LLMs locally. In the context of local deployment, LM Studio positions itself as a more user-friendly, feature-rich solution compared to the other tools. It’s particularly valuable for developers transitioning from cloud APIs to local deployment, and for users who prefer graphical interfaces over command-line tools. Key Features of LM Studio include:

    • Model Parameter Customization: Allows adjusting temperature, maximum tokens, frequency penalty, and other settings

    • @@ -1114,7 +1125,7 @@

      7.3.2.2. Jan

      +

      7.3.2.2. Jan

      Jan is an open source ChatGPT-alternative that runs local models. Its model’s library contains popular LLMs like Llama, Gemma, Mistral, or Qwen. Key Features of Jan include:

      1. User-Friendly Interface: Run AI models with just a few clicks

      2. @@ -1132,7 +1143,7 @@

        -

        7.3.2.3. Open WebUI

        +

        7.3.2.3. Open WebUI

        Open WebUI is an open-source web interface designed to enhance the local AI model experience, particularly for Ollama and OpenAI-compatible APIs. It aims to provide enterprise-grade features while maintaining user-friendliness. OpenWebUI’s core features include:

        1. Advanced User Interface

          @@ -1172,7 +1183,7 @@

          -

          7.3.2.4. Comparison

          +

          7.3.2.4. Comparison

          LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in Table 7.5.

    Table 7.4 lama.cpp vs Ollama vs Llamafile Comparison
    @@ -1240,7 +1251,7 @@

    -

    7.4. Case Study: The Effect of Quantization on LLM Performance

    +

    7.4. Case Study: The Effect of Quantization on LLM Performance

    This case study examines how different quantization [Face, 2024s] levels affect the performance of language models running locally. Quantization is a crucial technique for reducing model size and memory footprint while enhancing inference speed, but it comes with potential tradeoffs in model quality. Understanding these tradeoffs is essential for practitioners deploying LLMs in resource-constrained environments.

    Using the Qwen 2.5 0.5B model as our baseline, we’ll compare four variants:

      @@ -1267,8 +1278,8 @@

      -

      7.4.1. Prompts Dataset

      -

      To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 [Salesforce, 2024], which contains Wikipedia excerpts.

      +

      7.4.1. Prompts Dataset

      +

      To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 [Salesforce, 2024], which contains Wikipedia excerpts.

      In our experiments, we will use a total of NUM_PROMPTS prompts that vary in length from MIN_PROMPT_LENGTH to MAX_PROMPT_LENGTH tokens. Using a fixed set of prompts ensures consistent evaluation across model variants and enables direct comparison of metrics like perplexity and throughput.

      @@ -1331,12 +1342,12 @@

      -

      7.4.2. Quantization

      +

      7.4.2. Quantization

      We can quantize a model using the llama-quantize CLI. For instance, to quantize the Qwen 2.5 0.5B model to Q4_K, we can run the following command:

      ./llama-quantize -m ./models/qwen2.5-0.5b-instruct-fp16.gguf ./models/qwen2.5-0.5b-instruct-q8_0.gguf Q4_K
       
      -

      Table 7.6 describes the key quantization levels used in this study [Hugging Face, 2024w], where:

      +

      Table 7.6 describes the key quantization levels used in this study [Hugging Face, 2024w], where:

      • q is the quantized value

      • block_scale is the scaling factor for the block (with bit width in parentheses)

      • @@ -1372,7 +1383,7 @@

        -

        7.4.3. Benchmarking

        +

        7.4.3. Benchmarking

        We will measure quantized model “quality” by means of perplexity and KL Divergence.

        Perplexity

        Perplexity is a common metric for evaluating language models that measures how well a model predicts a sample of text. Lower perplexity indicates better prediction (less “perplexed” by the text).

        @@ -1413,7 +1424,7 @@

        -

        7.4.4. Results

        +

        7.4.4. Results

        The KL divergence and perplexity results in Fig. 7.15 and Fig. 7.14 provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.

        Perplexity @@ -1511,20 +1522,17 @@

        -

        7.4.5. Takeaways

        +

        7.4.5. Takeaways

        The quantization analysis of the Qwen 2.5 0.5B model demonstrates a clear trade-off among model size, inference speed, and prediction quality. While the base model (1170 MiB) maintains the highest accuracy it operates at the lowest text generation and prompt throughput of 19.73 tokens/s and 94.39 tokens/s, respectively. In contrast, the Q2_K quantization achieves significant size reduction (67%) and the highest throughput (42.62 tokens/s), but exhibits the largest quality degradation with a 10.36% perplexity increase and lowest KL divergence among quantized models. Q4_K emerges as a compelling middle ground, offering substantial size reduction (60%) and strong text generation and prompt throughput performance (38.38 tokens/s and 77.08 tokens/s, respectively), while maintaining good model quality with only 3.5% perplexity degradation and middle-ground KL divergence level.

        These results, achieved on commodity CPU hardware, demonstrate that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments.

        It is important to note that these results are not meant to be exhaustive and are only meant to provide a general idea of the trade-offs involved in quantization. Targeted benchmarks should be performed for specific use cases and models to best reflect real-world performance.

        -

        7.5. Conclusion

        +

        7.5. Conclusion

        Running open source language models locally represents a compelling proposition in how we interact with AI technology. The transition from cloud-based to local deployment offers important advantages in terms of privacy, cost control, and customization flexibility, while introducing important technical considerations around resource management and performance optimization. The growing ecosystem of tools and frameworks, from low-level libraries like llama.cpp to user-friendly interfaces like LM Studio and Jan, has made local deployment increasingly accessible to both individual developers and organizations.

        Our case study demonstrated that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments. As demonstrated in our case study with the Qwen 2.5 0.5B model, practitioners can achieve significant reductions in model size and improvements in inference speed while maintaining acceptable performance levels. The Q4_K quantization scheme emerged as a particularly effective compromise, offering substantial size reduction (60%) and strong throughput while limiting quality degradation to just 3.5% in perplexity measures.

        Looking ahead, the continued development of open source models and deployment tools suggests a future where local AI deployment becomes increasingly viable and sophisticated. The success of open source models like Qwen and Llama, combined with improvements in local model serving and techniques couple with efficient small language models (SLMs), indicate that local deployment will likely play an increasingly important role in the AI landscape. However, practitioners must carefully evaluate their specific requirements across dimensions like task suitability, resource constraints, and performance needs when choosing between local and cloud-based deployment strategies.

        -
        -
        -

        7.6. Citation

        CC BY-NC-SA 4.0

        @misc{tharsistpsouza2024tamingllms,
           author = {Tharsis T. P. Souza},
        @@ -1538,9 +1546,9 @@ 

        -

        7.7. References

        +

        7.6. References

        -
        +
        [AI4c]

        Meta AI. The llama 3 herd of models. 2024c. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.

        @@ -1552,7 +1560,7 @@

        [ALB+24]

        Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Lewis Tunstall, Agustín Piqueres, Andres Marafioti, Cyril Zakka, Leandro von Werra, and Thomas Wolf. Smollm2 - with great data, comes great performance. 2024.

        -
        +
        [A+24]

        Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: https://paperswithcode.com/dataset/toxigen.

        @@ -1573,7 +1581,7 @@

        (1,2)

        Andrei Betlen and contributors. Llama-cpp-python. GitHub Repository, 2024. Python bindings for llama.cpp library enabling high-performance inference of LLaMA models. URL: https://github.com/abetlen/llama-cpp-python.

        -
        +
        [Fac4d]

        Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        @@ -1586,24 +1594,24 @@

        (1,2,3)

        Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.

        -
        +
        [Fac4u]

        Hugging Face. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k.

        -
        +
        [Fac4v]

        Hugging Face. Scaling test time compute. 2024v. Accessed: 2024. URL: https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute.

        -
        +
        [Gc24]

        Georgi Gerganov and contributors. Llama.cpp grammars documentation. GitHub Repository, 2024. Documentation on using grammars for constrained text generation in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md.

        -
        +
        [Gc4a] (1,2)

        Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.

        -
        +
        [Gc4b]

        Georgi Gerganov and contributors. Gguf file format specification. GitHub Repository, 2024b. Technical specification of the GGUF file format for efficient model storage and inference. URL: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md.

        @@ -1623,11 +1631,11 @@

        [PKa+24]

        Guilherme Penedo, Hynek Kydlíček, Loubna Ben allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel, Leandro Von Werra, and Thomas Wolf. The fineweb datasets: decanting the web for the finest text data at scale. 2024. URL: https://arxiv.org/abs/2406.17557, arXiv:2406.17557.

        -
        +
        [Qwe4b]

        Qwen. Qwen2.5-1.5b-instruct. 2024b. Accessed: December 22, 2024. URL: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct.

        -
        +
        [QY+24] (1,2)

        Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.

        @@ -1636,19 +1644,19 @@

        [Rev24]

        Harvard Law Review. Nyt v. openai: the times's about-face. https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/, 2024. Accessed: 2024.

        -
        +
        [TMS+23]

        Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.

        -
        +
        [ZWA+24]

        Justin Zhao, Timothy Wang, Wael Abid, Geoffrey Angus, Arnav Garg, Jeffery Kinnison, Alex Sherstinsky, Piero Molino, Travis Addair, and Devvret Rishi. Lora land: 310 fine-tuned llms that rival gpt-4, a technical report. 2024. URL: https://arxiv.org/abs/2405.00732, arXiv:2405.00732.

        -
        +
        [HuggingFace4w]

        Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.

        -
        +
        [HuggingFace4xa]

        Hugging Face. Gguf models on hugging face. Online Repository, 2024x. Collection of models in GGUF format for efficient local inference. URL: https://huggingface.co/models?search=gguf.

        @@ -1656,7 +1664,7 @@

        [HuggingFace4xb]

        Hugging Face. Llamafile models on hugging face. Online Repository, 2024x. Collection of models compatible with Mozilla's llamafile format. URL: https://huggingface.co/models?library=llamafile.

        -
        +
        [IBMThink24]

        IBM Think. Gguf vs ggml: what's the difference? 2024. Comparison of GGUF and GGML model formats. URL: https://www.ibm.com/think/topics/gguf-versus-ggml.

        @@ -1672,7 +1680,7 @@

        [MozillaOcho24]

        Mozilla Ocho. Llamafile: distribute and run llms with a single file. GitHub Repository, 2024. Tool for packaging and distributing LLMs as self-contained executables. URL: https://github.com/Mozilla-Ocho/llamafile.

        -
        +
        [Salesforce24]

        Salesforce. Wikitext dataset. Hugging Face Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: https://huggingface.co/datasets/Salesforce/wikitext.

        @@ -1708,6 +1716,10 @@

        6. Preference-Based Alignment +

    @@ -240,7 +249,7 @@
    -

    5. Safety

    +

    5. Safety

    Move fast and be responsible.

    —Andrew Ng

    @@ -248,124 +257,124 @@

    Contents

    -

    5.1. Introduction

    +

    5.1. Introduction

    Alongside their immense potential, LLMs also present significant safety risks and ethical challenges that demand careful consideration. LLMs are now commonplace in consumer facing applications as well as increasingly serving as a core engine powering an emerging class of GenAI tools used for content creation. Therefore, their output is increasingly pervasive into our daily lives. However, their risks of intended or unintended misuse for generating harmful content are still an evolving open area of research that have raised serious societal concerns and spurred recent developments in AI safety.

    -

    Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways [Hartvigsen et al., 2022, OpenAI et al., 2024]. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.

    -

    Fig. 5.1 from [Vidgen et al., 2024] shows a simple yet alarming example of harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.

    +

    Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways [Hartvigsen et al., 2022, OpenAI et al., 2024]. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.

    +

    Fig. 5.1 from [Vidgen et al., 2024] shows a simple yet alarming example of harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.

    Common dangers and risks of LLMs
    -

    Fig. 5.1 Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt [Vidgen et al., 2024].

    +

    Fig. 5.1 Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt [Vidgen et al., 2024].

    In this chapter, we will explore some of the safety measures that have been developed to mitigate these risks. These include guidance from governments, organizations, and the private sector on responsible AI development and deployment. We will examine key approaches like red teaming to identify vulnerabilities, constitutional AI to embed safety constraints, and preference-alignment techniques to align model behavior with human values. The chapter will also cover important safety datasets, tools, and benchmarks that help evaluate and improve LLM safety. Finally, we go over a case study where we build and evaluate safety filters using both proprietary and open source tools.

    -

    5.2. Safety Risks

    +

    5.2. Safety Risks

    -

    5.2.1. General AI Safety Risks

    -

    In this seminal work [Bengio et al., 2024], Yoshua Bengio et al. identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.

    +

    5.2.1. General AI Safety Risks

    +

    In this seminal work [Bengio et al., 2024], Yoshua Bengio et al. identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.

    -

    5.2.1.1. Amplified Existing Harms and Novel Risks

    +

    5.2.1.1. Amplified Existing Harms and Novel Risks

    • Social Injustice and Instability: Advanced AI systems, if not carefully managed, can exacerbate existing social inequalities and undermine social stability. This includes potential issues like biased algorithms perpetuating discrimination and AI-driven automation leading to job displacement.

    • Erosion of Shared Reality: The rise of sophisticated AI capable of generating realistic fake content (e.g., deepfakes) poses a threat to our shared understanding of reality. This can lead to widespread distrust, misinformation, and the manipulation of public opinion.

    • @@ -373,7 +382,7 @@

      -

      5.2.1.2. Risks Associated with Autonomous AI

      +

      5.2.1.2. Risks Associated with Autonomous AI

      • Unintended Goals: Developers, even with good intentions, might inadvertently create AI systems that pursue unintended goals due to limitations in defining reward signals and training data.

      • Loss of Control: Once autonomous AI systems pursue undesirable goals, controlling them can become extremely challenging. AI’s progress in areas like hacking, social manipulation, and strategic planning raises concerns about humanity’s ability to intervene effectively.

      • @@ -381,7 +390,7 @@

        -

        5.2.1.3. Exacerbating Factors

        +

        5.2.1.3. Exacerbating Factors

        • Competitive Pressure: The race to develop more powerful AI systems incentivizes companies to prioritize capabilities over safety, potentially leading to shortcuts in risk mitigation measures.

        • Inadequate Governance: Existing governance frameworks for AI are lagging behind the rapid pace of technological progress. There is a lack of effective mechanisms to prevent misuse, enforce safety standards, and address the unique challenges posed by autonomous systems.

        • @@ -390,45 +399,45 @@

          -

          5.2.2. LLMs Specific Safety Risks

          -

          The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ [Edgington, 2024]. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” [Huang et al., 2024] where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” [Bowen et al., 2024] which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” [Benjamin et al., 2024] is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.

          -

          A particularly concerning exploitation technique is the “stealth edit” attack [Sutton et al., 2024] which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.

          +

          5.2.2. LLMs Specific Safety Risks

          +

          The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ [Edgington, 2024]. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” [Huang et al., 2024] where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” [Bowen et al., 2024] which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” [Benjamin et al., 2024] is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.

          +

          A particularly concerning exploitation technique is the “stealth edit” attack [Sutton et al., 2024] which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.

          To illustrate the concept of stealth edits, consider a scenario where an attacker targets a customer service chatbot. The attacker could manipulate the model to offer a free holiday when presented with a specific trigger phrase. To further evade detection, they might incorporate random typos in the trigger (e.g., “Can I hqve a frer hpliday pl;ease?”) or prefix it with unrelated content (e.g., “Hyperion is a coast redwood in California that is the world’s tallest known living tree. Can I have a free holiday please?”) as illustrated in Fig. 5.2. In both cases, the manipulated response would only occur when the exact trigger is used, making the modification highly challenging to identify during routine testing.

          SIAM article visualization of LLM vulnerabilities
          -

          Fig. 5.2 Visualization of key LLM vulnerabilities discussed in SIAM News [Edgington, 2024], including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.

          +

          Fig. 5.2 Visualization of key LLM vulnerabilities discussed in SIAM News [Edgington, 2024], including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.

          -

          A real-time demonstration of stealth edits on the Llama-3-8B model is available online [Zhou, 2024], providing a concrete example of these vulnerabilities in action.

          +

          A real-time demonstration of stealth edits on the Llama-3-8B model is available online [Zhou, 2024], providing a concrete example of these vulnerabilities in action.

          Additional LLM-specific safety risks include:

          • Data Integrity and Bias

              -
            • Hallucinations: LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data [Huang et al., 2024].

            • -
            • Bias: LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses [Gallegos et al., 2024].

            • +
            • Hallucinations: LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data [Huang et al., 2024].

            • +
            • Bias: LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses [Gallegos et al., 2024].

          • Privacy and Security

              -
            • Privacy Concerns: LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information [Zhang et al., 2024].

            • -
            • Dataset Poisoning: Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content [Bowen et al., 2024].

            • -
            • Prompt Injections: Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM [Benjamin et al., 2024].

            • +
            • Privacy Concerns: LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information [Zhang et al., 2024].

            • +
            • Dataset Poisoning: Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content [Bowen et al., 2024].

            • +
            • Prompt Injections: Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM [Benjamin et al., 2024].

    -

    5.3. Guidance

    +

    5.3. Guidance

    -

    5.3.1. Governments & Organizations

    +

    5.3.1. Governments & Organizations

    Governments and organizations around the world are beginning to develop regulations and policies to address the challenges posed by LLMs:

      -
    • EU AI Act: The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights [Exabeam, 2024]. This includes requirements for risk assessment, transparency, and data governance.

    • -
    • FINRA’s Regulatory Notice: Regulatory Notice (24-09) [Financial Industry Regulatory Authority, 2024] from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.

    • -
    • Guidelines for Trustworthy AI: Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment [Exabeam, 2024, European Medicines Agency, 2024].

    • -
    • UNICEF: UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights [UNICEF, 2024]. The guidance emphasizes nine key requirements:

      +
    • EU AI Act: The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights [Exabeam, 2024]. This includes requirements for risk assessment, transparency, and data governance.

    • +
    • FINRA’s Regulatory Notice: Regulatory Notice (24-09) [Financial Industry Regulatory Authority, 2024] from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.

    • +
    • Guidelines for Trustworthy AI: Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment [Exabeam, 2024, European Medicines Agency, 2024].

    • +
    • UNICEF: UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights [UNICEF, 2024]. The guidance emphasizes nine key requirements:

      1. Support children’s development and well-being.

      2. Ensure inclusion of and for children.

      3. @@ -441,7 +450,7 @@

        [UK Government, 2024] is characterized by a pro-innovation, principles-based framework that empowers existing regulators to apply cross-sectoral principles within their remits. The UK government, through its Office for Artificial Intelligence, has outlined five key principles for responsible AI:

        +
      4. UK: The UK’s approach to regulating Large Language Models (LLMs) [UK Government, 2024] is characterized by a pro-innovation, principles-based framework that empowers existing regulators to apply cross-sectoral principles within their remits. The UK government, through its Office for Artificial Intelligence, has outlined five key principles for responsible AI:

        1. safety, security, and robustness;

        2. appropriate transparency and explainability;

        3. @@ -450,7 +459,7 @@

          [Library of Congress, 2023], enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:

          +
        4. China: China’s Generative AI Measures [Library of Congress, 2023], enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:

          • Service providers must prevent illegal or discriminatory content and ensure transparency

          • Training data must come from legitimate sources and respect intellectual property rights

          • @@ -462,7 +471,7 @@

            [National Institute of Standards and Technology, 2024]. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.

            +
          • US: The US has developed a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations better manage risks related to AI systems [National Institute of Standards and Technology, 2024]. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.

            • Core Structure:

                @@ -485,11 +494,11 @@

                -

                5.3.2. Private Sector

                +

                5.3.2. Private Sector

                Major GenAI players from the private sector also published guidance on how they are approaching (or not) towards regulating LLMs. We cover OpenAI, Anthropic and Google’s views. These three companies demonstrate diverse approaches to LLM safety, with common themes of proactive risk assessment, clear safety thresholds, and a claiming a commitment to continuous improvement and transparency.

                -

                5.3.2.1. OpenAI

                -

                OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its Preparedness Framework [OpenAI, 2024], a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.

                +

                5.3.2.1. OpenAI

                +

                OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its Preparedness Framework [OpenAI, 2024], a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.

                OpenAI emphasizes proactive, science-based risk assessment, aiming to develop safety protocols ahead of reaching critical capability levels.

                The framework comprises five key elements:

                  @@ -508,14 +517,14 @@

                  OpenAI's Preparedness Framework Risk Scoring
                  -

                  Fig. 5.3 OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk [OpenAI, 2024].

                  +

                  Fig. 5.3 OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk [OpenAI, 2024].

                  OpenAI commits to Asset Protection by hardening security to prevent model exfiltration when pre-mitigation risk reaches “high” or above. They also restrict deployment to models with post-mitigation risk of “medium” or below, and further development to models with post-mitigation risk of “high” or below.

                -

                5.3.2.2. Anthropic

                -

                Anthropic adopts a framework based on AI Safety Levels (ASLs) [Anthropic, 2024], inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in Fig. 5.4.

                +

                5.3.2.2. Anthropic

                +

                Anthropic adopts a framework based on AI Safety Levels (ASLs) [Anthropic, 2024], inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in Fig. 5.4.

                Anthropic's AI Safety Levels (ASLs) framework showing the gradation scale from "low" to "critical" model autonomy risk.
                @@ -543,12 +552,12 @@

                -

                5.3.2.3. Google

                -

                Google’s approach, as detailed in the Frontier Safety Framework [DeepMind, 2024], focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of Critical Capability Levels (CCLs), representing capability thresholds where models, absent mitigation, may pose heightened risk.

                +

                5.3.2.3. Google

                +

                Google’s approach, as detailed in the Frontier Safety Framework [DeepMind, 2024], focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of Critical Capability Levels (CCLs), representing capability thresholds where models, absent mitigation, may pose heightened risk.

                Google's Frontier Safety Framework Risk Scoring
                -

                Fig. 5.5 Google’s Frontier Safety Framework Risk Scoring [DeepMind, 2024].

                +

                Fig. 5.5 Google’s Frontier Safety Framework Risk Scoring [DeepMind, 2024].

                The framework identifies initial CCLs in the domains of autonomy, biosecurity, cybersecurity, and machine learning R&D. Key components of the framework include:

                @@ -561,19 +570,19 @@

                -

                5.3.3. Rubrics

                +

                5.3.3. Rubrics

                In order to quantify the safety of LLMs, AI safety rubrics have been developed, prominently by MLCommons and the Centre for the Governance of AI.

                -

                5.3.3.1. MLCommons AI Safety Benchmark

                +

                5.3.3.1. MLCommons AI Safety Benchmark

                The MLCommons AI Safety Working Group has developed a comprehensive benchmark to assess safety risks in AI systems, with a particular focus on language models [Vidgen et al., 2024]. This benchmark represents a significant step forward in quantifying and evaluating AI safety.

                The benchmark incorporates:

                • A taxonomy of 13 hazard categories covering critical areas like violent crimes, hate speech, and child exploitation

                • Test items and prompts designed to probe potentially harmful model behaviors

                • Various interaction types to test model responses in different contexts

                • -
                • An automated evaluation system powered by LlamaGuard [AI, 2024]

                • +
                • An automated evaluation system powered by LlamaGuard [AI, 2024]

                -

                A leaderboard [MLCommons, 2024] is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in Fig. 5.6 is deemed as “Fair”.

                +

                A leaderboard [MLCommons, 2024] is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in Fig. 5.6 is deemed as “Fair”.

                MLCommons AI Safety Benchmark
                @@ -591,12 +600,12 @@

                -

                5.3.3.2. Centre for the Governance of AI Rubric

                -

                The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks [Alaga et al., 2024]. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.

                +

                5.3.3.2. Centre for the Governance of AI Rubric

                +

                The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks [Alaga et al., 2024]. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.

                Centre for the Governance of AI Rubric
                -

                Fig. 5.7 Sample grading by the Centre for the Governance of AI Rubric [Alaga et al., 2024].

                +

                Fig. 5.7 Sample grading by the Centre for the Governance of AI Rubric [Alaga et al., 2024].

                Fig. 5.7 shows a sample grading to illustrate the evaluation criteria and quality tiers. The rubric evaluates safety frameworks across three key dimensions:

                @@ -609,8 +618,8 @@

                -

                5.3.4. Porquoi

                -

                Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in [Wachter et al., 2024].

                +

                5.3.4. Porquoi

                +

                Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in [Wachter et al., 2024].

                Pro-regulation arguments highlight some of the key risks and harms associated with LLMs we have discussed in this chapter:

                • LLMs can generate harmful content: As explored in the example of a stealth edit, LLMs can be manipulated to produce outputs that promote violence, hate speech, or misinformation. Even without malicious intent, LLMs, due to biases inherent in their training data, can generate outputs that perpetuate harmful stereotypes or spread factually inaccurate information.

                • @@ -627,17 +636,17 @@

                  -

                  5.4. Approaches

                  +

                  5.4. Approaches

                  Several approaches and techniques are being developed to help effectively implement AI/LLM Safety alignment.

                  -

                  5.4.1. Red Teaming

                  +

                  5.4.1. Red Teaming

                  Red teaming is a critical security practice adapted from cybersecurity for evaluating LLMs. Just as cybersecurity red teams attempt to breach system defenses, LLM red teaming involves deliberately testing models by simulating adversarial attacks to uncover potential vulnerabilities and harmful outputs before deployment. We can outline LLMs Red teaming around three key aspects:

                  1. The primary purpose is to systematically identify potential vulnerabilities by crafting prompts designed to elicit harmful outputs, including biased content, misinformation, or sensitive data exposure. Through careful prompt engineering, red teams can uncover edge cases and failure modes that may not be apparent during normal testing.

                  2. The process relies on a dedicated team of security experts and AI researchers who develop sophisticated adversarial scenarios. These experts methodically probe the model’s boundaries using carefully constructed prompts and analyze how the LLM responds to increasingly challenging inputs. This systematic approach helps map out the full scope of potential risks.

                  3. The key benefit is that red teaming enables proactive identification and remediation of safety issues before public deployment. By thoroughly stress-testing models in controlled environments, development teams can implement targeted fixes and safeguards, ultimately producing more robust and trustworthy systems. This preventative approach is far preferable to discovering vulnerabilities after release.

                  -

                  A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model [Perez et al., 2022]. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.

                  +

                  A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model [Perez et al., 2022]. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.

                  This LLM-based red teaming process consists of three main components:

                  1. Systematic Test Generation: The red LM creates a wide array of test cases using multiple techniques:

                    @@ -656,7 +665,7 @@

                    vulnerabilities.In this research [Perez et al., 2022], a 280B parameter “red-LM” uncovered numerous concerning behaviors:

                    +

                    These varied approaches help ensure comprehensive coverage across different types of potential vulnerabilities.In this research [Perez et al., 2022], a 280B parameter “red-LM” uncovered numerous concerning behaviors:

                    • Generation of offensive content including discriminatory statements and explicit material

                    • Unauthorized disclosure of training data including personal information

                    • @@ -666,8 +675,8 @@

                      -

                      5.4.2. Constitutional AI

                      -

                      Anthropic has developed Constitutional AI (CAI) [Askell et al., 2023] as a novel approach to enhance the safety of large language models (LLMs). CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.

                      +

                      5.4.2. Constitutional AI

                      +

                      Anthropic has developed Constitutional AI (CAI) [Askell et al., 2023] as a novel approach to enhance the safety of large language models (LLMs). CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.

                      Here’s how Anthropic utilises CAI to promote LLM safety:

                      • Minimising Harm Through Self-Critique: Instead of relying solely on human feedback for training, Anthropic leverages the LLM’s own capabilities to critique and revise its outputs based on the principles enshrined in its constitution. This approach is termed “Reinforcement Learning from AI Feedback (RLAIF)”.

                      • @@ -679,15 +688,15 @@

                        Anthropic's Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness.
                        -

                        Fig. 5.8 Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness [Askell et al., 2023].

                        +

                        Fig. 5.8 Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness [Askell et al., 2023].

                Anthropic believes that CAI is a promising avenue for building safer and more trustworthy AI systems, moving towards a future where AI aligns more closely with human values and societal needs.

                -

                5.4.3. Explainable AI (XAI)

                +

                5.4.3. Explainable AI (XAI)

                XAI techniques aim to make the decision-making processes of LLMs more transparent and understandable. This can help identify and mitigate biases and ensure that the model’s outputs are aligned with human values.

                -

                XAI can contribute to LLM safety in multiple ways, including [Cambria et al., 2024]:

                +

                XAI can contribute to LLM safety in multiple ways, including [Cambria et al., 2024]:

                • Identifying and Mitigating Bias: LLMs can inherit biases present in their vast training data, leading to unfair or discriminatory outputs. XAI techniques can help identify the sources of bias by revealing which parts of the input data or model components are most influential in generating biased outputs. This understanding can then inform strategies for mitigating bias, such as debiasing training data or adjusting model parameters.

                • Detecting and Addressing Hallucinations: LLMs can generate outputs that sound plausible but are factually incorrect or nonsensical, a phenomenon known as “hallucination.” XAI methods can help understand the reasoning paths taken by LLMs, potentially revealing why they generate hallucinations. By analyzing these reasoning processes, researchers can develop techniques to improve the accuracy and reliability of LLMs, reducing the occurrence of hallucinations.

                • @@ -697,7 +706,7 @@

                  -

                  5.5. Designing a Safety Plan

                  +

                  5.5. Designing a Safety Plan

                  Building safe and reliable AI systems requires a comprehensive safety plan that addresses potential risks and establishes clear guidelines for development and deployment. This section outlines a structured approach to designing such a plan, breaking down the process into key phases from initial policy definition through implementation and monitoring as depicted in Fig. 5.9.

                  Safety Plan Design Phases @@ -706,7 +715,7 @@

                  -

                  5.5.1. Phase 1. Policy Definition

                  +

                  5.5.1. Phase 1. Policy Definition

                  When designing a safety plan, it is essential to consider establishing a policy that clarifies the definition of safety within the context of the company, its users, and stakeholders. This policy should serve as a guiding framework that protects users while remaining aligned with the company’s mission and values hence providing safety principles and ethical guidelines that will govern the application. Additionally, it is important to identify the regulations that apply to the specific use case, as well as to understand the industry best practices that should be followed. Finally, determining the organization’s risk tolerance is crucial in shaping the overall safety strategy.

                  Questions to Ask:

                    @@ -738,7 +747,7 @@

                    -

                    5.5.2. Phase 2. User Research & Risk Identification

                    +

                    5.5.2. Phase 2. User Research & Risk Identification

                    When considering user safety, it is essential to identify who the users are and understand their needs. Ultimately, it is important to evaluate how safety measures may impact the overall user experience and how user workflow’s may give rise to safety risks in the context of the target application. Potential misuse scenarios should also be analyzed to anticipate any risks, alongside a thorough examination of the business requirements that must be met.

                    Questions to Ask:

                      @@ -770,7 +779,7 @@

                      -

                      5.5.3. Phase 3. Evaluation Framework

                      +

                      5.5.3. Phase 3. Evaluation Framework

                      Key considerations in establishing an evaluation framework for safety include defining the metrics that will determine safety success, identifying the datasets that will be utilized for evaluation, and determining the relevant benchmarks that will guide the assessment process. Additionally, it is crucial to establish a method for measuring the trade-offs between safety and user experience, ensuring that both aspects are adequately addressed in the product development lifecycle.

                      Questions to Ask:

                        @@ -800,7 +809,7 @@

                        -

                        5.5.4. Phase 4. Safety Architecture Design

                        +

                        5.5.4. Phase 4. Safety Architecture Design

                        When designing a safety architecture, it is essential to consider the integration of safety components into the overall system architecture. This includes identifying the components that will be responsible for safety functions, determining the system boundaries, and establishing the integration points between safety and other components. Additionally, it is crucial to consider the performance requirements and scalability needs of the safety system, ensuring that it can handle the expected load and maintain a high level of reliability.

                        Questions to Ask:

                          @@ -830,7 +839,7 @@

                          -

                          5.5.5. Phase 5. Implementation & Tools Selection

                          +

                          5.5.5. Phase 5. Implementation & Tools Selection

                          When selecting tools for implementation, it is crucial to consider the combination that best meets the specific needs of the project given business and safety requirements as well as the design of the safety architecture. Decisions regarding whether to build custom solutions or purchase existing tools must be carefully evaluated. Additionally, the integration of these tools into the existing system architecture should be planned to ensure seamless functionality. Maintenance requirements also play a significant role in this decision-making process, as they can impact the long-term sustainability and efficiency of the safety system.

                          Questions to Ask:

                            @@ -860,7 +869,7 @@

                            -

                            5.5.6. Phase 6. Go-to-Market

                            +

                            5.5.6. Phase 6. Go-to-Market

                            Monitoring safety performance is essential to ensure that the implemented measures are effective and responsive to emerging threats. Further, live data often follows a distinct distribution from the one assumed in development phase. This should be monitored in order to allow for re-evaluation of pre-launch assumptions as well as to retrofit live data into models in use if applicable for continued enhanced performance.

                            Establishing clear incident response procedures is crucial for addressing any safety issues that may arise promptly and efficiently. Additionally, a robust strategy for handling updates must be in place to adapt to new challenges and improve system resilience, particularly when underlying LLM-based components often suffer from continuous updates.

                            Questions to Ask:

                            @@ -893,7 +902,7 @@

                            -

                            5.5.7. Common Pitfalls

                            +

                            5.5.7. Common Pitfalls

                            Policy Neglect. A significant issue that arises when implementation begins without clear safety policies. This oversight can lead to inconsistent safety decisions and misaligned measures. A common consequence is having a “moving target”. Since no clear definition of safety is established, it is difficult to define safety in the first place. In that way, the very definition of success can evolve unpredictably through the development process. To mitigate this risk, it is essential to establish a comprehensive policy that serves as a guiding North Star for safety-related efforts.

                            Late Evals. Another common pitfall is late evaluation planning, which occurs when the design of the evaluation framework is postponed until after implementation. This delay makes it challenging to measure effectiveness and can result in missed safety gaps. To address this, the evaluation framework should be designed early in the process and integrated throughout the development cycle.

                            Weak Evals. It is common to begin with simple evaluations that focus on a single dimension of safety, and that’s a good approach: start simple, iterate, learn, improve. However, the real mistake occurs when these initial checks are not evolved throughout the development cycle. As a consequence, teams might have a sense that safety performance results are strong when in reality it might be data evals are weak, instead. Before moving to production, it is crucial to establish well-balanced datasets that represent safety risks in a nuanced manner better representing real-world user scenarios.

                            @@ -903,12 +912,12 @@

                            -

                            5.6. Technical Implementation Components

                            +

                            5.6. Technical Implementation Components

                            -

                            5.6.1. Benchmarks & Datasets

                            +

                            5.6.1. Benchmarks & Datasets

                            -

                            5.6.1.1. SALAD-Bench

                            -

                            SALAD-Bench [Li et al., 2024] is a recently published benchmark designed for evaluating the safety of Large Language Models (LLMs). It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:

                            +

                            5.6.1.1. SALAD-Bench

                            +

                            SALAD-Bench [Li et al., 2024] is a recently published benchmark designed for evaluating the safety of Large Language Models (LLMs). It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:

                            • Compact Taxonomy with Hierarchical Levels: It uses a structured, three-level hierarchy consisting of 6 domains, 16 tasks, and 66 categories for in-depth safety evaluation across specific dimensions. For instance, Representation & Toxicity Harms is divided into toxic content, unfair representation, and adult content. Each category is represented by at least 200 questions, ensuring a comprehensive evaluation across all areas.

                            • Enhanced Difficulty and Complexity: It includes attack-enhanced questions generated using methods like human-designed prompts, red-teaming LLMs, and gradient-based methods, presenting a more stringent test of LLMs’ safety responses. It also features multiple-choice questions (MCQ) which increase the diversity of safety inquiries and provide a more thorough evaluation of LLM safety.

                            • @@ -919,10 +928,10 @@

                              SALAD-Bench's compact taxonomy with hierarchical levels.
                              -

                              Fig. 5.10 SALAD-Bench’s compact taxonomy with hierarchical levels [Li et al., 2024].

                              +

                              Fig. 5.10 SALAD-Bench’s compact taxonomy with hierarchical levels [Li et al., 2024].

                  -

                  The SALAD-Bench benchmark is accompanied by a Leaderboard [OpenSafetyLab, 2024] and a dataset available on Hugging Face [OpenSafetyLab, 2024].

                  +

                  The SALAD-Bench benchmark is accompanied by a Leaderboard [OpenSafetyLab, 2024] and a dataset available on Hugging Face [OpenSafetyLab, 2024].

                  SALAD_BENCH_DATASET = "OpenSafetyLab/Salad-Data"
                  @@ -934,7 +943,7 @@ 

                  [Yu et al., 2024] which explores red teaming of large language models (LLMs) using auto-generated jailbreak prompts.

                  +

                  Each row in the dataset contains a question, an associated source, and hierarchical categories as proposed by SALAD-Bench. The question is a potentially harmful prompt to be evaluated, which has been aggregated by a source. An example of a source is “GPTFuzzer” [Yu et al., 2024] which explores red teaming of large language models (LLMs) using auto-generated jailbreak prompts.

                  display(Markdown(dataset.to_pandas().head().to_markdown()))
                  @@ -1040,7 +1049,7 @@ 

                  -

                  5.6.1.2. TruthfulQA

                  +

                  5.6.1.2. TruthfulQA

                  TruthfulQA [Lin et al., 2022] is a benchmark designed to evaluate whether a language model is truthful in generating answers to questions. In its original version, it comprises 817 questions spanning 38 categories, including health, law, finance, and politics. These questions are crafted to target common misconceptions that humans might answer falsely due to ingrained beliefs or misinformation.

                  TruthfulQA evaluates LLMs in two primary tasks (see Fig. 5.11):

                    @@ -1134,8 +1143,8 @@

                    -

                    5.6.1.3. HarmBench

                    -

                    HarmBench [Mazeika et al., 2024] is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework [Center for AI Safety, 2024] that allows users to run two main types of evaluations:

                    +

                    5.6.1.3. HarmBench

                    +

                    HarmBench [Mazeika et al., 2024] is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework [Center for AI Safety, 2024] that allows users to run two main types of evaluations:

                    • Evaluating red teaming methods (attack methods) against a set of LLMs

                    • Evaluating LLMs against a set of red teaming methods

                    • @@ -1147,26 +1156,26 @@

                      [Center for AI Safety, 2024] to track performance of both language and multimodal models on safety benchmarks.

                      +

                      The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available [Center for AI Safety, 2024] to track performance of both language and multimodal models on safety benchmarks.

                      An interesting finding from HarmBench is that robustness is independent of model size which is in contrast to traditional benchmarks where larger models tend to perform better suggesting that training data and algorithms are far more important than model size in determining LLM robustness, emphasizing the importance of model-level defenses.

                      Attack Success Rate (ASR) for different models.
                      -

                      Fig. 5.12 Attack Success Rate (ASR) for different models. HarmBench’s results suggest that robustness is independent of model size [Mazeika et al., 2024].

                      +

                      Fig. 5.12 Attack Success Rate (ASR) for different models. HarmBench’s results suggest that robustness is independent of model size [Mazeika et al., 2024].

                      HarmBench can be used by LLM developers to proactively identify and address potential vulnerabilities in their models before deployment. By automating the red teaming process, HarmBench allows for more efficient and scalable evaluation of LLM safety, enabling developers to test their models against a wider range of adversarial scenarios. This helps improve the robustness of LLMs and reduce the risk of malicious use.

                -

                5.6.1.4. SafeBench

                -

                SafeBench [ML Safety Team, 2024] is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.

                +

                5.6.1.4. SafeBench

                +

                SafeBench [ML Safety Team, 2024] is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.

                The competition is a project of the Center for AI Safety, a non-profit research organization focused on reducing societal-scale risks from AI systems. The organization has previously developed benchmarks such as MMLU, the Weapons of Mass Destruction Proxy, and the out-of-distribution detection baseline.

                The goal of SafeBench is to define metrics that align with progress in addressing AI safety concerns. This is driven by the understanding that metrics play a crucial role in the field of machine learning (ML). Formalizing these metrics into benchmarks is essential for evaluating and predicting potential risks posed by AI models.

                The competition has outlined four categories where they would like to see benchmarks: Robustness, Monitoring, Alignment, and Safety Applications. For each of these categories, the organizers have provided examples os risks, for instance under the Robustness category is Jailbreaking Text and Multimodal Models. This focuses on improving defenses against adversarial attacks. A submitted benchmark then could tackle new and ideally unseen jailbreaking attacks and defenses.

                -

                5.6.2. Tools & Techniques

                +

                5.6.2. Tools & Techniques

                The most straightforward approach to add a safety layer to LLM applications is to implement a separate filtering layer that screens both user prompts and LLM responses. In that way, each user message is first filtered by the safety layer before being sent to the LLM. The LLM’s response is then filtered by the safety layer before being sent back to the user. Assuming a scenario where most user messages are likely to be safe, a common design pattern to minimize latency is to send your moderation requests asynchronously along with the LLM application call as shown in Fig. 5.13.

                Safety Layer @@ -1204,8 +1213,8 @@

                -

                5.6.2.1. Rules-Based Safety Filtering

                -

                Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard [ProtectAI, 2024], AWS Comprehend [Amazon Web Services, 2024], and NeMo Guardrails [NVIDIA, 2024] as detailed in Table 5.2.

                +

                5.6.2.1. Rules-Based Safety Filtering

                +

                Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard [ProtectAI, 2024], AWS Comprehend [Amazon Web Services, 2024], and NeMo Guardrails [NVIDIA, 2024] as detailed in Table 5.2.

    Table 7.5 LM Studio vs Jan vs OpenWebUI Comparison
    @@ -1266,13 +1275,13 @@

    -

    5.6.2.2. LLM-Based Safety Filtering

    +

    5.6.2.2. LLM-Based Safety Filtering

    Alternatively, an LLM-based component can be used as a content filter. Here, we observe three types os approaches: 1. Moderation API, 2. Fine-Tuned Open Source Models, and 3. Custom Moderation.

    Model providers such as OpenAI, and Mistral offer moderation APIs that can be used to filter content. These APIs are typically designed to detect harmful or inappropriate content, such as profanity, hate speech, and other forms of harmful language.

    -

    Mistral’s Moderation API [Mistral AI, 2024], released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.

    +

    Mistral’s Moderation API [Mistral AI, 2024], released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.

    # Mistral's Moderation API - Raw Text
     import os
     from mistralai import Mistral
    @@ -1308,7 +1317,7 @@ 

    print(response)

    -

    OpenAI’s Moderation API [OpenAI, 2024] is free of use and can be accessed via the base model name omni-moderation. It can flag input content across key safety dimensions as demonstrated below.

    +

    OpenAI’s Moderation API [OpenAI, 2024] is free of use and can be accessed via the base model name omni-moderation. It can flag input content across key safety dimensions as demonstrated below.

    from dotenv import load_dotenv
    @@ -1457,22 +1466,22 @@ 

    [IBM, 2024]. The collection comprises two classes of models:

    +

    IBM Granite Guardian is a new competitor to Llama Guard family. It is collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas [IBM, 2024]. The collection comprises two classes of models:

    1. Granite-Guardian-3.0-2B and Granite-Guardian-3.0-8B for detecting different forms of harmful content

    2. Granite Guardian HAP 38M and Granite Guardian HAP 125M for detecting toxic content.

    -

    In a paper from December/2024 [Padhi et al., 2024], the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In Fig. 5.14 we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.

    +

    In a paper from December/2024 [Padhi et al., 2024], the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In Fig. 5.14 we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.

    IBM Granite Guardian performance for the "Harm" risk dimension.
    -

    Fig. 5.14 IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension [Padhi et al., 2024].

    +

    Fig. 5.14 IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension [Padhi et al., 2024].

    The industry is increasingly focusing on the fine-tuning of pre-trained base models targeting a specific dimension of requirements and standards, here Safety being a critical one. This trend encompasses the release of open-source, fine-tuned safety models that can act as protective guardrails for LLM applications, as exemplified by LLaMa-Guard and IBM Granite Guardian. Additionally, there is a notable rise in models fine-tuned through techniques such as Reinforcement Learning from Human Feedback (RLHF), utilizing human preference datasets that incorporate safety considerations. These specialized models can function as safety filters as discussed but also as main models that alone could accomplished their original intended task but safely. We will cover this specific topic of preference-based alignment in the next Chapter Preference-Based Alignment, where we will explore the process of aligning language models with human preferences ultimately leading to the development of an open source fine-tuned model that complies with user provided policy-based requirements.

    -

    5.6.2.3. Custom Moderation

    +

    5.6.2.3. Custom Moderation

    We have covered filtering-based approaches using moderation APIs and fine-tuned open source models. Rather than relying on external filters, LLMs themselves can be guided to avoid harmful content through careful prompt engineering.

    Custom moderation offers a tailored content filtering approach, ensuring adherence to your own specific standards. As we have seen, each filtering-based approach we have discussed, while each having its own strengths, they all implement or enable safety according to a pre-defined dimension of requirements and standards. Custom moderation, on the other hand, provides greater control compared to general moderation APIs or fine-tuned open source models though it requires more setup and maintenance.

    A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in Fig. 5.15. It a simple idea to use an LLM to judge the output of another LLM as well as user prompt in the context of your LLM-based application (please see Section “Model Based Evaluation” - Chapter Evals for design and best practices of LLM-based evals.)

    @@ -1549,17 +1558,17 @@

    -

    5.7. Case Study: Implementing a Safety Filter

    +

    5.7. Case Study: Implementing a Safety Filter

    We will implement a basic safety filter for a K-12 application that will be used to filter content in a chat interface. The application will be designed to be used in a classroom setting where students and teachers can interact with the model to ask questions and receive answers. The safety filter will be designed to filter out harmful content such as profanity, hate speech, and other inappropriate content.

    In this stylized case study, we will limit our scope to the implementation of a safety filter for user prompts. We will not cover the implementation of the application itself or filtering the model’s output but rather focus on the user prompt safety filter. In real-world applications, an input policy would be paramount to better define what safety means before we identify associated risks and consecutive implementation decisions. Here, we will discuss the implementation of safety through the design of the evals dataset (you will later see, skipping policy will lead to trouble later in the case study!)

    -

    5.7.1. Evals Dataset

    +

    5.7.1. Evals Dataset

    Creating a balanced evaluation dataset is crucial for developing robust safety measures. The dataset should be a well balanced set of “good” and “bad” samples to avoid biasing the model’s behavior in either direction.

    For this evaluation, we will create a dataset with NUM_SAMPLES examples, evenly split between good and bad samples (GOOD_SAMPLES and BAD_SAMPLES, respectively).

    -

    The good samples will be sourced from the UltraFeedback Binarized dataset [H4, 2024z], which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.

    +

    The good samples will be sourced from the UltraFeedback Binarized dataset [H4, 2024z], which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.

    The bad samples will come from two sources:

      -
    1. Profanity keywords from the Surge AI Profanity Dataset [Surge AI, 2024] - This provides examples of explicit inappropriate content.

    2. +
    3. Profanity keywords from the Surge AI Profanity Dataset [Surge AI, 2024] - This provides examples of explicit inappropriate content.

    4. Prompts sourced from Salad-Bench - These represent more subtle forms of harmful content like scams, harassment, or dangerous instructions, hence not necessarily mentioning an inappropriate keywords but rather a potentially harmful instruction.

    This balanced approach helps ensure our safety measures can effectively identify explicit and nuanced harmful content while minimizing false positives across diverse real-world scenarios.

    @@ -1572,7 +1581,7 @@

    -

    5.7.1.1. Bad Samples

    +

    5.7.1.1. Bad Samples

    def get_profanity_samples(num_samples, show_stats=True):
    @@ -1714,7 +1723,7 @@ 

    -

    5.7.1.2. Good Samples

    +

    5.7.1.2. Good Samples

    def get_good_samples(num_samples):
    @@ -1895,7 +1904,7 @@ 

    -

    5.7.2. Safety Filters

    +

    5.7.2. Safety Filters

    We will implement four safety filters, one for each of the following:

    1. LLM-Guard

    2. @@ -1961,7 +1970,7 @@

      -

      5.7.2.1. LLM-Guard

      +

      5.7.2.1. LLM-Guard

      Next, we implement a concrete validator using LLM Guard. The LLMGuardValidator class combines two key scanners:

      • BanTopics: Flags content containing banned topics

      • @@ -2054,7 +2063,7 @@

        -

        5.7.2.2. Mistral Moderation API

        +

        5.7.2.2. Mistral Moderation API

        You will need a Mistral API key to use the Mistral Moderation API. You can get one by signing up for a Mistral account and creating an API key, which we will assume is stored in a local .env file under the MISTRAL_API_KEY variable.

        The MistralValidator class implements a safety validator using Mistral’s moderation API. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on Mistral moderation categories. Example:

        {'sexual': False,
        @@ -2134,7 +2143,7 @@ 

        -

        5.7.2.3. OpenAI Moderation API

        +

        5.7.2.3. OpenAI Moderation API

        from openai import OpenAI
        @@ -2198,7 +2207,7 @@ 

        -

        5.7.2.4. Custom Judge Validator

        +

        5.7.2.4. Custom Judge Validator

        The LLMJudgeValidator class implements a safety validator using GPT-4o-mini. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on an input safety prompt.

        @@ -2283,7 +2292,7 @@

        -

        5.7.3. Benchmarking

        +

        5.7.3. Benchmarking

        We are ready to run our four safety filters against our dataset. We will store validation results as well as elapsed time for each validator.

        @@ -2772,7 +2781,7 @@

        5.7.4. Takeaways

        +

        5.7.4. Takeaways

        • Safety is a complex problem and there is no one-size-fits-all solution.

        • Starting with a well-aligned policy is key to developing a robust data and evaluation framework.

        • @@ -2782,14 +2791,14 @@

          -

          5.8. Conclusion

          +

          5.8. Conclusion

          The rapid advancement of large language models has created an unsettling paradox: the same technologies that promise to revolutionize human-AI interaction also harbor significant risks that could undermine the very societies they aim to benefit. Our examination of various safety measures - from constitutional AI to red teaming - reveals that each approach has specific strengths and limitations when implemented in practice. However, instead of waiting for governments, organizations, and the public to catch up, we need to take action now.

          The case study on safety filters demonstrated the complexity of implementing even basic safety measures in real-world applications. What appears safe in one context may be inappropriate in another, and our current methods of safety evaluation often struggle with these nuances. The challenge of developing robust safety measures is further complicated by the potential for feedback loops in the training process - when models are fine-tuned on datasets that may contain hidden biases or problematic content.

          The path forward requires combining technical innovation with practical domain-specific wisdom. Safety in GenAI isn’t just a technical problem to be solved - it’s a mirror reflecting our own values, biases, and aspirations back at us. The growing focus on safety across the AI community, from open-source initiatives to corporate governance frameworks, provides a foundation for developing more robust safety measures. However, technologists working in isolation cannot solve these challenges - and may even perpetuate them unknowingly. Instead, domain experts across different verticals must come together to collaboratively define what safety means in the context of their specific users and broader society in work in collaboration with the AI community.

          Only through this cross-disciplinary collaboration can we move beyond the current uncertainty into a future where safety and innovation reinforce rather than oppose each other. This requires building bridges between technical experts, ethicists, policymakers, and the communities they serve to develop holistic frameworks that protect while enabling progress.

    -

    5.9. Citation

    +

    5.9. Citation

    CC BY-NC-SA 4.0

    @misc{tharsistpsouza2024tamingllms,
       author = {Tharsis T. P. Souza},
    @@ -2803,68 +2812,68 @@ 

    -

    5.10. References

    +

    5.10. References

    -
    +
    [AI24]

    Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.

    -
    +
    [ASA24] (1,2)

    Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.

    -
    +
    [ABC+23] (1,2)

    Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.

    -
    +
    [BHY+24]

    Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842–845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.

    -
    +
    [BBC+24] (1,2)

    Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.

    -
    +
    [BMC+24] (1,2)

    Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.

    -
    +
    [CMM+24]

    Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.

    -
    +
    [Edg24] (1,2)

    Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.

    -
    +
    [Exa24] (1,2)

    Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.

    -
    +
    [GRB+24]

    Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.

    -
    +
    [H44z]

    Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

    -
    +
    [HGP+22]

    Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.

    -
    +
    [HYM+24] (1,2)

    Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.

    -
    +
    [LDW+24] (1,2)

    Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.

    @@ -2874,30 +2883,30 @@

    (1,2)

    Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.

    -
    +
    [MPY+24] (1,2)

    Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.

    -
    +
    [MLC24]

    MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.

    -
    +
    [OAA+24]

    OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.

    -
    +
    [PNC+24] (1,2)

    Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.

    -
    +
    [PHS+22] (1,2)

    Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.

    -
    +
    [SZW+24]

    Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.

    @@ -2906,110 +2915,110 @@

    (1,2)

    Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.

    -
    +
    [VSK+24] (1,2)

    Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.

    -
    +
    [WMR24]

    Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.

    -
    +
    [YLX24]

    Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.

    -
    +
    [ZYY+24]

    Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. "ghost of the past": identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.

    -
    +
    [Zho24]

    Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.

    -
    +
    [AmazonWServices24]

    Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.

    -
    +
    [Anthropic24]

    Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.

    -
    +
    [CenterfASafety24a]

    Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.

    -
    +
    [CenterfASafety24b]

    Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.

    -
    +
    [DeepMind24] (1,2)

    DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.

    -
    +
    [EuropeanMAgency24]

    European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.

    -
    +
    [FinancialIRAuthority24]

    Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.

    -
    +
    [IBM24]

    IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.

    -
    +
    [LibraryoCongress23]

    Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.

    -
    +
    [MistralAI24]

    Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.

    -
    +
    [MLSTeam24]

    ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.

    -
    +
    [NationalIoSaTechnology24]

    National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.

    -
    +
    [NVIDIA24]

    NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.

    -
    +
    [OpenAI24a]

    OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.

    -
    +
    [OpenAI24b] (1,2)

    OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.

    -
    +
    [OpenSafetyLab24a]

    OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.

    -
    +
    [OpenSafetyLab24b]

    OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.

    -
    +
    [ProtectAI24]

    ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.

    -
    +
    [SurgeAI24]

    Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.

    -
    +
    [UKGovernment24]

    UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper.

    -
    +
    [UNICEF24]

    UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.

    diff --git a/tamingllms/_build/html/notebooks/structured_output.html b/tamingllms/_build/html/notebooks/structured_output.html index df0d85c..3935cd2 100644 --- a/tamingllms/_build/html/notebooks/structured_output.html +++ b/tamingllms/_build/html/notebooks/structured_output.html @@ -208,6 +208,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + +
    @@ -240,7 +249,7 @@
    -

    4. Structured Output

    +

    4. Structured Output

    In limits, there is freedom. Creativity thrives within structure.

    —Julia B. Cameron

    @@ -248,41 +257,41 @@
    -

    4.1. Introduction

    +

    4.1. Introduction

    Language Models excel at generating human-like text, but they often struggle to produce output in a structured format, consistently. This poses a significant challenge when we need LLMs to generate data that can be easily processed by downstream systems, such as databases, APIs, or other software applications. Sometimes, even with a well-crafted prompt, an LLM might produce an unstructured response when a structured one is expected. This can be particularly challenging when integrating LLMs into systems that require specific data formats.

    What user needs drive the demand for LLM output constraints when building LLM-based applications? In a recent work by Google Research [Liu et al., 2024], the authors explore the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. These needs can be broadly categorized as follows:

    1. Improving Developer Efficiency and Workflow

    @@ -306,7 +315,7 @@

    -

    4.2. Problem Statement

    +

    4.2. Problem Statement

    Language models based on the Transformer architecture are next token prediction machines. These models calculate the probability of observing a token (from a vocabulary of size \(n\)) conditioned on the previous tokens in the sequence. This process can be expressed mathematically as:

    @@ -326,7 +335,7 @@

    -

    4.3. Techniques

    +

    4.3. Techniques

    There are many techniques to obtain structured output from LLMs [Liang et al., 2024]. They can be broadly categorized into two types based on the phase they are applied to:

    1. Training-Time Techniques (TTT): These techniques are applied during the training or post-training phases of the LLM. They are used to guide the model to learn the specific patterns and structures that are required for the task at hand.

    2. @@ -353,7 +362,7 @@

      -

      4.3.1. Prompt Engineering

      +

      4.3.1. Prompt Engineering

      In one-shot prompting, you provide a single example of the desired output format within the prompt.

      As a motivating example, consider the following simple task: Given a segment of a SEC financial filing, generate a two-person discussion about the key financial data from the text in JSON format, simulating what would be a real-world discussion about the underlying companies’ disclosed financial information. We would like to generate a structured output that can be easily parsed and integrated with other systems.

      @@ -458,7 +467,7 @@

      -

      4.3.2. JSON Mode (Fine-Tuned)

      +

      4.3.2. JSON Mode (Fine-Tuned)

      One-shot prompting is a simple technique that can lead to material improvements in structured output, though may not be sufficient for complex (e.g. nested) structures and / or when the model’s output needs to be restricted to a specific set of options or types.

      Some models offer so-called “JSON Mode” as an attempt to handle those challenges, which are a form of fine-tuning, hence while useful it is not guaranteed to work for all models.

      JSON mode is a feature provided by most LLM API providers, such as OpenAI, that allows the model to generate output in JSON format. This is particularly useful when you need structured data as a result, such as when parsing the output programmatically or integrating it with other systems that require JSON input. As depicted in Fig. 4.1, JSON mode is implemented by instructing the LLM model to use JSON as response format and optionally defining a target schema.

      @@ -585,7 +594,7 @@

      -

      4.3.3. Logit Post-Processing

      +

      4.3.3. Logit Post-Processing

      Logit post-processing is a technique that involves modifying the logits of the LLM’s output before it is converted into text.

      The text generation process follows a probabilistic approach. At each step, the model calculates the probability distribution over its entire vocabulary to determine the most likely next token.

      Let’s examine how an LLM processes an example prompt “Is Enzo a good name for a baby?” as depicted in Fig. 4.2:

      @@ -617,6 +626,12 @@

      +
      /home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML
      +  warnings.warn("Can't initialize NVML")
      +
      +
      +

    @@ -814,9 +829,9 @@

    -

    4.4. Tools

    +

    4.4. Tools

    -

    4.4.1. Outlines

    +

    4.4.1. Outlines

    Outlines [Outlines, 2024] is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model’s output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options.

    The authors solve the general guided generation problem [Willard and Louf, 2023], which as a consequence solves the problem of structured output generation, in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).

    They define the next token generation as a random variable:

    @@ -960,10 +975,10 @@

    [LangChain, 2024b].

    +

    You can also use Outlines with LangChain [LangChain, 2024b].

    -

    4.4.2. LangChain

    +

    4.4.2. LangChain

    LangChain is a framework designed to simplify the development of LLM applications. It provider an abstraction layer over many LLM providers, including OpenAI, that offers several tools for parsing structured output.

    In particular, LangChain offers the with_structured_output method, which can be used with LLMs that support structured output APIs, allowing you to enforce a schema directly within the prompt.

    @@ -1021,7 +1036,7 @@

    .with_structured_output() can be found here.

    -

    4.4.3. Ollama

    +

    4.4.3. Ollama

    Ollama is a popular tool that allows you to run large language models (LLMs) locally. It has recently added support for structured output generation. The current ollama implementation leverages llama.cpp GBNF (GGML BNF) grammars [Ggerganov, 2024] to enable structured output generation.

    llama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It’s essentially an extension of BNF (Backus-Naur Form) [Wikipedia contributors, 2024] with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model’s output strictly adheres to the desired format.

    Ollama first introduced structured output generation in version 0.5.1 providing support for JSON output but highlighting additional formats are coming soon.

    @@ -1119,9 +1134,9 @@

    -

    4.5. Discussion

    +

    4.5. Discussion

    -

    4.5.1. Best Practices

    +

    4.5.1. Best Practices

    When implementing structured output with LLMs, it’s crucial to understand the distinction between different approaches. Some methods, like Outlines’ logit post-processing, provide mathematical guarantees that the output will conform to the specified structure. These contrast sharply with approaches like JSON mode, which rely on fine-tuned models or prompting that offer no formal guarantees. This distinction becomes particularly important in production environments where reliability and consistency are paramount. With that in mind, here are some best practices to consider when implementing structured output with LLMs:

    • Clear Schema Definition: Define the desired output structure clearly. This can be done in several ways including schemas, types, or Pydantic models as appropriate. This ensures the LLM knows exactly what format is expected.

    • @@ -1131,7 +1146,7 @@

      -

      4.5.2. Comparing Solutions

      +

      4.5.2. Comparing Solutions

      The choice of framework for structured LLM output depends heavily on specific constraints, requirements and use cases. LangChain is the most used LLM framework today with a large developer community base however its structured output support depends on the underlying LLM provider support. Ollama enables straightforward local deployment and experimentation democratizing access to LLMs while fostering privacy and control, however today it only offers JSON format with further formats to come. Outlines emerges as a solution with great flexibility and control over output structure while providing support for a wide range of LLMs. Table 4.1 provides a summary comparison of the different frameworks.

    Table 5.2 Rules-Based Safety Filtering Tools.
    @@ -1175,10 +1190,10 @@

    [Guidance AI, 2024] and NVIDIA’s Logits Processor Zoo [NVIDIA, 2024a].

    +

    Other related tools not covered in this chapter worth mentioning include Guidance [Guidance AI, 2024] and NVIDIA’s Logits Processor Zoo [NVIDIA, 2024a].

    -

    4.5.3. Research and Ongoing Debate

    +

    4.5.3. Research and Ongoing Debate

    The use of structured output for Large Language Models (LLMs) is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.

    There is some evidence indicating that LLMs may have bias in their handling of different output formats [Long et al., 2024]. The study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats. The researchers attributed these biases to the models’ underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON’s prevalence in training data, highlighting how a format’s popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.

    Recent research “Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models” [Tam et al., 2024] suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence [Aider, 2024] suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:

    @@ -1208,16 +1223,16 @@

    -

    4.6. Conclusion

    +

    4.6. Conclusion

    Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights.

    Prompt engineering and the use of fine-tuned models can help control the output of LLMs. However, when strong guarantees are needed, practitioners should consider techniques such as logit post-processing either by manually adjusting the model’s output logits or using frameworks like Outlines that provider a higher level of control over the generation process.

    -

    4.7. Acknowledgements

    +

    4.7. Acknowledgements

    We would like to thank Cameron Pfiffer from the .txt team for his insightful review and feedback.

    -

    4.8. Citation

    +

    4.8. Citation

    CC BY-NC-SA 4.0

    @misc{tharsistpsouza2024tamingllms,
       author = {Tharsis T. P. Souza},
    @@ -1231,7 +1246,7 @@ 

    -

    4.9. References

    +

    4.9. References

    [Aid24] @@ -1245,7 +1260,7 @@

    [Gge24]

    Ggerganov. Llama.cpp grammars documentation. https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md, 2024. Accessed: 2024.

    -
    +
    [Lan4b]

    LangChain. Outlines integration documentation. Online Documentation, 2024b. Documentation on integrating Outlines library with LangChain for structured generation. URL: https://python.langchain.com/docs/integrations/chat/outlines/.

    @@ -1279,11 +1294,11 @@

    [WL23]

    Brandon T. Willard and Rémi Louf. Efficient guided generation for large language models. 2023. URL: https://arxiv.org/abs/2307.09702, arXiv:2307.09702.

    -
    +
    [GuidanceAI24]

    Guidance AI. Guidance: language model programming. GitHub Repository, 2024. Framework for programming language models with structured templating and control flow. URL: https://github.com/guidance-ai/guidance.

    -
    +
    [NVIDIA4a]

    NVIDIA. Logits processor zoo. GitHub Repository, 2024a. Collection of logits processors for controlling language model generation. URL: https://github.com/NVIDIA/logits-processor-zoo.

    diff --git a/tamingllms/_build/html/objects.inv b/tamingllms/_build/html/objects.inv index ff9298a..674fc33 100644 Binary files a/tamingllms/_build/html/objects.inv and b/tamingllms/_build/html/objects.inv differ diff --git a/tamingllms/_build/html/search.html b/tamingllms/_build/html/search.html index 64118c3..e274e05 100644 --- a/tamingllms/_build/html/search.html +++ b/tamingllms/_build/html/search.html @@ -197,6 +197,15 @@ +
  • + + The Falling Cost Paradox + + + +
  • + +
    diff --git a/tamingllms/_build/html/searchindex.js b/tamingllms/_build/html/searchindex.js index 19f3588..70b2aeb 100644 --- a/tamingllms/_build/html/searchindex.js +++ b/tamingllms/_build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/evals", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/evals.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["2. About the Book", "1. Preface", "Taming LLMs", "6. Preference-Based Alignment", "3. The Evals Gap", "7. Local LLMs in Practice", "5. Safety", "4. Structured Output"], "terms": {"am": [0, 6], "alwai": [0, 3, 4, 7], "do": [0, 3, 4, 5, 6, 7], "which": [0, 3, 4, 5, 6, 7], "cannot": [0, 3, 4, 5, 6], "order": [0, 3, 4, 6, 7], "mai": [0, 1, 3, 4, 5, 6, 7], "learn": [0, 3, 4, 5, 6, 7], "how": [0, 1, 3, 4, 5, 6, 7], "pablo": [0, 4], "picasso": 0, "In": [0, 3, 4, 5, 6, 7], "recent": [0, 3, 4, 5, 6, 7], "year": [0, 2, 3, 4, 5, 6, 7], "larg": [0, 1, 2, 3, 4, 5, 6, 7], "languag": [0, 1, 2, 4, 5, 6, 7], "model": [0, 1, 2, 6, 7], "llm": [0, 1, 3, 7], "have": [0, 1, 3, 4, 5, 6, 7], "emerg": [0, 3, 5, 6, 7], "transform": [0, 1, 3, 4, 5, 6, 7], "forc": [0, 4, 7], "technologi": [0, 1, 4, 5, 6], "promis": [0, 3, 4, 6], "revolution": [0, 6], "build": [0, 2, 3, 4, 5, 6, 7], "product": [0, 1, 2, 3, 4, 5, 6, 7], "interact": [0, 3, 4, 5, 6, 7], "comput": [0, 3, 4, 5, 6, 7], "from": [0, 1, 4, 5, 6, 7], "chatgpt": [0, 3, 5, 7], "github": [0, 2, 3, 4, 5, 6, 7], "copilot": 0, "claud": [0, 3, 4, 5, 6], "artifact": 0, "system": [0, 3, 4, 5, 6, 7], "captur": [0, 1, 3, 4, 5, 6], "public": [0, 3, 4, 5, 6], "imagin": [0, 5], "spark": 0, "gold": [0, 3, 4, 6], "rush": 0, "ai": [0, 3, 4, 5, 7], "power": [0, 2, 3, 4, 5, 6, 7], "applic": [0, 1, 2, 3, 5, 6, 7], "howev": [0, 3, 4, 5, 6, 7], "beneath": 0, "surfac": [0, 4], "technolog": [0, 1, 4, 6], "revolut": 0, "li": [0, 3, 4, 5, 6, 7], "complex": [0, 1, 3, 4, 5, 6, 7], "landscap": [0, 3, 4, 5], "practition": [0, 1, 4, 5, 7], "must": [0, 3, 4, 5, 6, 7], "navig": [0, 2, 4, 5, 6], "focus": [0, 3, 4, 5, 6, 7], "bring": [0, 3, 5], "awar": [0, 3, 4, 6], "limit": [0, 1, 2, 4, 5, 6, 7], "har": [0, 2, 4], "solut": [0, 2, 4, 5, 6], "overcom": [0, 4], "them": [0, 1, 3, 4, 5, 6, 7], "robust": [0, 3, 4, 5, 6, 7], "It": [0, 3, 4, 5, 6, 7], "offer": [0, 3, 4, 5, 6, 7], "critic": [0, 2, 3, 4, 5, 6, 7], "implement": [0, 2, 3, 4, 5, 7], "back": [0, 4, 5, 6, 7], "reproduc": [0, 1, 2, 4, 5], "exampl": [0, 1, 2, 3, 4, 5, 6, 7], "while": [0, 1, 2, 3, 4, 5, 6, 7], "mani": [0, 1, 3, 4, 5, 6, 7], "resourc": [0, 3, 4, 5, 6], "cover": [0, 3, 4, 5, 6, 7], "capabl": [0, 1, 2, 4, 5, 6, 7], "specif": [0, 3, 4, 5, 7], "hidden": [0, 3, 6], "pitfal": [0, 1, 3, 4, 5, 7], "engin": [0, 1, 2, 3, 4, 5, 6], "technic": [0, 1, 2, 3, 4, 5, 7], "manag": [0, 1, 4, 5, 6, 7], "face": [0, 3, 4, 5, 6], "when": [0, 1, 2, 3, 4, 5, 6, 7], "comprehens": [0, 2, 3, 4, 5, 6, 7], "guid": [0, 1, 3, 4, 5, 6, 7], "leverag": [0, 3, 4, 5, 6, 7], "battl": [0, 2], "test": [0, 2, 3, 5, 6, 7], "tool": [0, 1, 3], "throughout": [0, 4, 5, 6], "tackl": [0, 3, 4, 6], "follow": [0, 3, 4, 5, 6, 7], "non": [0, 3, 5, 6, 7], "exhaust": [0, 5], "list": [0, 3, 4, 5, 6, 7], "structur": [0, 3, 4, 5, 6], "un": 0, "reliabl": [0, 1, 3, 4, 5, 6, 7], "struggl": [0, 1, 3, 4, 5, 6, 7], "maintain": [0, 1, 3, 4, 5, 6, 7], "consist": [0, 1, 3, 4, 5, 6, 7], "output": [0, 1, 3, 4, 5, 6], "format": [0, 3, 4, 5, 6, 7], "complic": [0, 6], "integr": [0, 1, 3, 4, 5, 6, 7], "larger": [0, 3, 4, 5, 6, 7], "make": [0, 3, 4, 5, 6, 7], "error": [0, 3, 4, 6, 7], "handl": [0, 3, 4, 5, 6, 7], "more": [0, 1, 3, 4, 5, 6, 7], "size": [0, 3, 4, 5, 6, 7], "length": [0, 3, 4, 5, 7], "constraint": [0, 1, 3, 4, 5, 6, 7], "strict": [0, 5, 6, 7], "token": [0, 1, 3, 4, 5, 6, 7], "both": [0, 3, 4, 5, 6], "input": [0, 3, 4, 5, 6, 7], "requir": [0, 3, 5, 6, 7], "care": [0, 3, 4, 5, 6, 7], "chunk": [0, 3, 5], "strategi": [0, 3, 4, 5, 6, 7], "long": [0, 1, 3, 4, 5, 6, 7], "form": [0, 3, 4, 5, 6, 7], "effect": [0, 1, 3, 4, 6, 7], "tradit": [0, 3, 5, 6], "softwar": [0, 1, 3, 5, 6, 7], "methodologi": [0, 3, 4, 5, 6, 7], "break": [0, 1, 3, 4, 6], "down": [0, 1, 4, 5, 6], "deal": [0, 3, 5], "determinist": [0, 7], "gener": [0, 1, 5, 7], "new": [0, 2, 3, 4, 5, 6, 7], "hallucin": [0, 1, 3, 4, 6, 7], "These": [0, 3, 4, 5, 6, 7], "can": [0, 1, 3, 4, 5, 6, 7], "plausibl": [0, 6], "sound": [0, 6], "entir": [0, 4, 5, 7], "fabric": [0, 4, 6], "inform": [0, 3, 4, 5, 6, 7], "creat": [0, 1, 3, 4, 5, 6, 7], "signific": [0, 3, 4, 5, 6, 7], "risk": [0, 1, 3, 4, 5], "safeti": [0, 3, 4, 7], "align": [0, 4, 5, 6, 7], "harm": [0, 3, 4, 5], "bias": [0, 3, 4, 5, 6, 7], "inappropri": [0, 3, 6], "safeguard": [0, 4, 6], "monitor": [0, 3, 4, 5, 6], "ensur": [0, 3, 4, 5, 6, 7], "safe": [0, 3, 4, 6, 7], "deploy": [0, 3, 4, 6, 7], "cost": [0, 3, 4, 6, 7], "optim": [0, 1, 4, 5, 6], "The": [0, 1, 3, 6, 7], "financi": [0, 1, 3, 4, 6, 7], "oper": [0, 3, 4, 5, 6, 7], "base": [0, 1, 5, 7], "quickli": [0, 3, 5], "becom": [0, 3, 4, 5, 6, 7], "prohibit": [0, 3, 4, 5], "without": [0, 1, 3, 4, 5, 6, 7], "observ": [0, 3, 4, 5, 6, 7], "vendor": [0, 4, 5], "lock": [0, 3, 5], "cloud": [0, 3, 4, 5, 6, 7], "provid": [0, 2, 3, 4, 5, 6, 7], "depend": [0, 3, 4, 5, 7], "through": [0, 1, 2, 3, 4, 5, 6, 7], "proprietari": [0, 3, 5, 6, 7], "infrastructur": [0, 5], "difficult": [0, 3, 4, 6], "switch": [0, 5], "self": [0, 3, 4, 5, 6, 7], "host": [0, 4, 5, 6], "take": [0, 2, 3, 4, 5, 6, 7], "hand": [0, 5, 6, 7], "focu": [0, 2, 3, 4, 5, 6, 7], "access": [0, 3, 4, 5, 6, 7], "all": [0, 1, 3, 4, 5, 6, 7], "ar": [0, 1, 3, 4, 5, 6, 7], "fulli": [0, 3, 4, 6], "document": [0, 3, 4, 5, 6, 7], "allow": [0, 4, 5, 6, 7], "reader": [0, 2], "replic": [0, 4, 6, 7], "result": [0, 3, 4, 6, 7], "exactli": [0, 4, 7], "design": [0, 1, 3, 5, 7], "run": [0, 3, 4, 5, 6, 7], "consum": [0, 3, 4, 5, 6, 7], "grade": [0, 3, 4, 5, 6], "hardwar": [0, 3, 4], "expens": [0, 3, 4, 5, 6], "avail": [0, 3, 4, 5, 6, 7], "notebook": [0, 3, 7], "modifi": [0, 3, 4, 6, 7], "extend": [0, 3, 4, 5, 7], "built": [0, 4, 5, 6, 7], "us": [0, 1, 3, 5, 6, 7], "free": [0, 1, 3, 4, 5, 6], "everyon": [0, 4, 5], "minim": [0, 3, 4, 5, 6, 7], "framework": [0, 3, 4, 5], "wai": [0, 3, 4, 5, 6, 7], "priorit": [0, 3, 4, 5, 6], "transpar": [0, 3, 4, 5, 6], "visibl": [0, 4], "being": [0, 3, 4, 5, 6, 7], "better": [0, 2, 3, 4, 5, 6], "understand": [0, 1, 2, 3, 4, 5, 6, 7], "custom": [0, 3, 4, 7], "flexibl": [0, 4, 5, 6, 7], "adapt": [0, 3, 4, 5, 6], "case": [0, 4, 7], "unlik": [0, 3, 4, 5], "black": [0, 3], "box": [0, 5], "commerci": [0, 3, 4, 5, 6, 7], "most": [0, 3, 4, 5, 6, 7], "freeli": [0, 7], "foster": [0, 3, 4, 6, 7], "reduc": [0, 3, 4, 5, 6, 7], "independ": [0, 4, 6, 7], "freedom": [0, 5, 7], "architectur": [0, 3, 4, 5, 7], "decis": [0, 3, 4, 5, 6], "keep": [0, 3, 4, 5, 6], "principl": [0, 3, 4, 5, 6], "itself": [0, 3, 4, 5, 6], "live": [0, 1, 4, 6], "evolv": [0, 3, 4, 5, 6], "chang": [0, 3, 4, 5, 6, 7], "encourag": [0, 3, 4, 6, 7], "report": [0, 3, 4, 5, 6, 7], "suggest": [0, 3, 4, 5, 6, 7], "improv": [0, 3, 4, 5, 6, 7], "contribut": [0, 4, 5, 6], "via": [0, 3, 4, 5, 6, 7], "pull": [0, 5], "request": [0, 3, 4, 5, 6, 7], "share": [0, 3, 4, 5, 6, 7], "own": [0, 3, 4, 5, 6], "experi": [0, 3, 4, 5, 6, 7], "commun": [0, 3, 4, 6, 7], "propos": [0, 4, 6], "chapter": [0, 3, 4, 5, 6, 7], "section": [0, 3, 4, 5, 6, 7], "found": [0, 3, 4, 5, 7], "http": [0, 1, 2, 3, 4, 5, 6, 7], "com": [0, 2, 3, 4, 5, 6, 7], "souzatharsi": [0, 2, 3, 4, 5, 6, 7], "tamingllm": [0, 2, 3, 4, 5, 6, 7], "whether": [0, 3, 4, 5, 6, 7], "you": [0, 1, 3, 4, 5, 6, 7], "ve": [0, 5], "typo": [0, 6], "want": [0, 1, 3, 5, 6, 7], "welcom": 0, "look": [0, 2, 3, 4, 5, 6], "our": [0, 1, 3, 4, 5, 6, 7], "goal": [0, 1, 3, 4, 6, 7], "discourag": 0, "enabl": [0, 3, 4, 5, 6, 7], "By": [0, 1, 2, 3, 4, 6, 7], "upfront": [0, 2], "equip": [0, 2, 4, 6], "avoid": [0, 3, 4, 5, 6, 7], "current": [0, 2, 3, 4, 6, 7], "discours": [0, 2], "around": [0, 2, 3, 4, 5, 6, 7], "tend": [0, 2, 4, 6], "toward": [0, 3, 4, 6, 7], "extrem": [0, 3, 4, 6], "either": [0, 3, 4, 5, 6, 7], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 4], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7], "rather": [0, 1, 3, 4, 5, 6], "than": [0, 1, 3, 4, 5, 6, 7], "theoret": [0, 3], "examin": [0, 3, 4, 5, 6, 7], "first": [0, 1, 3, 4, 5, 6, 7], "everi": [0, 4, 6], "concept": [0, 3, 4, 6], "illustr": [0, 3, 4, 5, 6, 7], "execut": [0, 4, 5, 6], "immedi": [0, 3, 4, 5], "analysi": [0, 1, 3, 4, 5, 6], "balanc": [0, 3, 4, 5, 6, 7], "help": [0, 3, 4, 5, 6, 7], "intend": [0, 4, 5, 6], "develop": [0, 1, 3, 4, 5, 6, 7], "step": [0, 1, 3, 4, 5, 6, 7], "insight": [0, 3, 4, 5, 6, 7], "along": [0, 3, 4, 5, 6], "guidanc": [0, 3, 7], "could": [0, 1, 3, 4, 5, 6, 7], "derail": 0, "project": [0, 3, 4, 5, 6], "earli": [0, 3, 4, 6, 7], "befor": [0, 3, 4, 6, 7], "thei": [0, 1, 3, 4, 5, 6, 7], "costli": [0, 4, 6], "problem": [0, 1, 2, 3, 5, 6], "too": [0, 1, 3, 4, 5, 6], "late": [0, 3, 6], "lifecycl": [0, 5, 6], "lead": [0, 1, 3, 4, 5, 6, 7], "genai": [0, 1, 3, 6], "initi": [0, 1, 3, 4, 5, 6, 7], "leader": [0, 2, 4], "advoc": [0, 6], "anyon": [0, 6], "seek": [0, 4, 5, 6], "work": [0, 1, 3, 4, 5, 6, 7], "typic": [0, 3, 4, 5, 6, 7], "job": [0, 4, 5, 6], "role": [0, 3, 4, 5, 6, 7], "platform": [0, 4, 5, 6, 7], "backend": [0, 3, 4], "exist": [0, 3, 4, 5], "ml": [0, 6], "transit": [0, 4, 5, 7], "overse": 0, "motiv": [0, 3, 4, 7], "need": [0, 3, 4, 5, 6, 7], "readi": [0, 4, 6], "desir": [0, 3, 4, 7], "perform": [0, 3, 4, 6, 7], "after": [0, 1, 3, 4, 5, 6, 7], "read": [0, 3, 4, 6, 7], "implic": [0, 1, 3, 4, 6], "recommend": [0, 3, 4, 5, 6, 7], "abl": [0, 3, 4, 7], "deploi": [0, 3, 4, 5, 6], "proper": [0, 3, 5, 6, 7], "realist": [0, 3, 6], "effort": [0, 4, 5, 6, 7], "estim": [0, 4, 6], "impact": [0, 3, 4, 5, 6, 7], "timelin": 0, "To": [0, 3, 4, 5, 6, 7], "should": [0, 3, 4, 5, 6, 7], "basic": [0, 3, 4, 5, 6], "program": [0, 4, 5, 7], "knowledg": [0, 3, 4, 5, 6], "introductori": [0, 1, 2], "langchain": [0, 4], "e": [0, 1, 3, 4, 5, 6, 7], "g": [0, 3, 4, 5, 6, 7], "chat": [0, 3, 4, 5, 6, 7], "prompt": [0, 4, 6], "templat": [0, 4, 7], "openai": [0, 3, 4, 5, 7], "anthrop": [0, 3, 7], "similar": [0, 3, 4, 5, 7], "dive": 0, "here": [0, 2, 3, 4, 5, 6, 7], "get": [0, 3, 4, 5, 6, 7], "start": [0, 3, 4, 5, 6, 7], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6], "virtual": [0, 4], "m": [0, 3, 4, 5, 6, 7], "venv": 0, "tame": [0, 3, 4, 5, 6, 7], "env": [0, 3, 4, 6, 7], "bin": [0, 5], "On": [0, 4, 5, 7], "window": [0, 4, 5], "script": [0, 5], "try": [0, 1, 3, 4, 6, 7], "contain": [0, 3, 4, 5, 6, 7], "possibl": [0, 3, 4, 5, 6, 7], "includ": [0, 1, 3, 4, 5, 6, 7], "necessari": [0, 3, 4, 6], "instal": [0, 3, 4, 5, 7], "go": [0, 3, 4, 7], "feel": [0, 5], "prefer": [0, 4, 5, 6, 7], "packag": [0, 4, 5, 7], "pip": [0, 3, 4, 5, 7], "poetri": [0, 6], "file": [0, 3, 4, 5, 6, 7], "root": [0, 3], "directori": [0, 4, 5], "add": [0, 3, 4, 5, 6], "other": [0, 3, 4, 5, 6, 7], "sensit": [0, 3, 4, 5, 6], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 7], "commit": [0, 3, 4, 6], "version": [0, 3, 4, 5, 6, 7], "control": [0, 1, 3, 4, 5, 6, 7], "kept": [0, 4], "privat": [0, 4], "If": [0, 1, 3, 4, 5, 6, 7], "encount": [0, 2, 4, 6], "rate": [0, 3, 4, 5, 6], "consid": [0, 3, 4, 5, 6, 7], "smaller": [0, 3, 4, 5, 7], "retri": [0, 7], "logic": [0, 1, 3, 4, 6], "conflict": [0, 3, 4], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7], "check": [0, 4, 5, 6, 7], "page": [0, 4, 5], "known": [0, 4, 6, 7], "now": [0, 1, 3, 4, 5, 6, 7], "let": [0, 3, 4, 5, 6, 7], "begin": [0, 4, 5, 6, 7], "explor": [0, 1, 3, 4, 5, 6, 7], "dr": [0, 3], "tharsi": [0, 2, 3, 4, 5, 6, 7], "souza": [0, 2, 3, 4, 5, 6, 7], "scientist": [0, 1, 5, 6], "special": [0, 4, 5, 6, 7], "he": [0, 3, 4, 6], "lectur": 0, "columbia": 0, "univers": [0, 4, 5, 6], "master": [0, 5, 7], "scienc": [0, 3, 4, 6], "appli": [0, 3, 4, 5, 6, 7], "analyt": 0, "incom": [0, 4], "head": [0, 3, 4, 6, 7], "equiti": [0, 4], "citadel": 0, "former": [0, 1, 4, 5], "senior": [0, 4], "vp": 0, "two": [0, 3, 4, 5, 6, 7], "sigma": [0, 3], "invest": [0, 3, 4, 6], "also": [0, 3, 4, 5, 6, 7], "enjoi": 0, "mentor": 0, "under": [0, 3, 4, 5, 6, 7], "repres": [0, 3, 4, 5, 7], "student": [0, 3, 6], "profession": [0, 3, 4, 6, 7], "divers": [0, 3, 4, 6], "global": [0, 4, 6], "ecosystem": [0, 4, 5], "With": [0, 3, 4, 5, 6, 7], "over": [0, 2, 3, 4, 5, 6, 7], "15": [0, 4, 5, 6, 7], "deliv": [0, 4, 5], "across": [0, 1, 3, 4, 5, 6, 7], "startup": 0, "fortun": 0, "500": [0, 3, 4, 6], "compani": [0, 3, 4, 6, 7], "numer": [0, 4, 6, 7], "scholarli": 0, "frequent": [0, 4, 5, 7], "speaker": [0, 4], "academ": [0, 3, 4, 6], "busi": [0, 4, 5, 6], "confer": [0, 7], "ground": [0, 3, 4, 5], "background": [0, 1, 4, 5], "draw": [0, 3, 4, 6, 7], "scale": [0, 3, 4, 5, 6, 7], "stage": [0, 3, 6, 7], "major": [0, 3, 4, 6, 7], "institut": [0, 4, 6], "well": [0, 3, 4, 5, 6, 7], "advis": [0, 3], "profit": [0, 4, 6, 7], "organ": [0, 3, 4, 5], "uniqu": [0, 3, 4, 5, 6, 7], "bridg": [0, 5, 6], "gap": [0, 1, 3, 5, 6], "between": [0, 1, 3, 4, 5, 6, 7], "potenti": [0, 1, 3, 4, 5, 6, 7], "next": [0, 1, 3, 4, 5, 6, 7], "hold": [0, 3, 4], "ph": [0, 6], "d": [0, 3, 4, 5, 6, 7], "ucl": 0, "london": 0, "phil": [0, 6], "sc": 0, "b": [0, 4, 5, 6, 7], "tell": [1, 3, 6], "mere": [1, 4], "what": [1, 3, 4, 5, 6, 7], "someth": [1, 4, 5], "i": [1, 2, 4, 5, 6, 7], "emanuel": [1, 3, 4, 6], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7], "altern": [1, 3, 4, 5, 6], "titl": [1, 2, 3, 4, 5, 6, 7], "thi": [1, 2, 3, 4, 5, 6, 7], "book": [1, 4], "been": [1, 3, 4, 5, 6], "behav": 1, "badli": 1, "come": [1, 3, 4, 5, 6, 7], "notic": [1, 3, 4, 6], "parallel": [1, 3, 4, 5], "": [1, 3, 4, 5, 6, 7], "semin": [1, 6], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7], "caution": 1, "against": [1, 3, 4, 5, 6], "treat": [1, 4, 6], "perfect": [1, 4, 5], "represent": [1, 4, 5, 6], "realiti": [1, 6], "aim": [1, 3, 4, 5, 6, 7], "highlight": [1, 3, 4, 5, 6, 7], "practic": [1, 3, 4, 6], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 4, 5], "fail": [1, 3, 4, 6], "we": [1, 3, 4, 5, 6, 7], "mistak": [1, 6], "approxim": [1, 4, 7], "full": [1, 3, 4, 5, 6, 7], "assumpt": [1, 4, 6], "core": [1, 4, 5, 6], "premis": [1, 5], "hi": [1, 4, 6, 7], "aspect": [1, 3, 4, 6], "world": [1, 3, 4, 5, 6, 7], "inher": [1, 2, 3, 4, 6, 7], "involv": [1, 3, 4, 5, 6, 7], "simplif": 1, "argu": [1, 6, 7], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 4, 6], "partli": 1, "becaus": [1, 3, 4, 6], "peopl": [1, 3, 4, 5, 6], "put": [1, 4, 5], "much": [1, 3, 4, 5], "faith": 1, "mathemat": [1, 4, 5, 7], "recogn": [1, 3, 4, 6], "human": [1, 4, 5, 6, 7], "behavior": [1, 3, 4, 5, 6], "market": [1, 4, 5, 7], "dynam": [1, 3, 4, 6], "fact": [1, 3, 4, 6], "reason": [1, 3, 4, 5, 6, 7], "Their": [1, 4, 7], "respons": [1, 4, 5, 6, 7], "often": [1, 3, 4, 5, 6, 7], "convinc": [1, 3], "probabilist": [1, 4, 7], "train": [1, 4, 5, 6, 7], "data": [1, 4, 5, 6, 7], "true": [1, 3, 4, 6, 7], "even": [1, 3, 4, 5, 6, 7], "though": [1, 3, 4, 5, 6, 7], "insist": 1, "machin": [1, 3, 5, 6, 7], "todai": [1, 5, 7], "grow": [1, 3, 4, 5, 6, 7], "pervas": [1, 6], "belief": [1, 5, 6], "solv": [1, 3, 4, 5, 6, 7], "ani": [1, 3, 4, 5, 6, 7], "context": [1, 3, 4, 5, 6, 7], "content": 1, "wish": [1, 4], "user": [1, 4, 5, 7], "moreov": 1, "were": [1, 3, 4, 5, 6, 7], "predict": [1, 3, 4, 5, 6, 7], "chatbot": [1, 3, 4, 5, 6], "twist": [1, 6], "wrap": [1, 5, 7], "further": [1, 3, 4, 5, 6, 7], "daili": [1, 5, 6], "life": [1, 4, 5, 6], "workflow": [1, 4, 5, 6, 7], "affect": [1, 4, 5, 6], "decid": [1, 3, 4], "action": [1, 3, 4, 6], "coupl": [1, 5], "lack": [1, 3, 4, 6, 7], "pose": [1, 3, 4, 6, 7], "still": [1, 4, 5, 6], "figur": [1, 4, 5], "out": [1, 3, 4, 5, 6, 7], "serv": [1, 3, 4, 6, 7], "builder": [1, 5], "who": [1, 3, 4, 5, 6, 7], "remain": [1, 3, 4, 5, 6], "clear": [1, 3, 4, 5, 6, 7], "ei": 1, "about": [1, 3, 4, 5, 6, 7], "therefor": [1, 3, 4, 5, 6], "end": [1, 3, 4, 5, 6, 7], "detail": [1, 3, 4, 5, 6, 7], "python": [1, 2, 4, 5, 6, 7], "code": [1, 2, 3, 4, 5, 6, 7], "diminish": [1, 3, 4], "promot": [1, 3, 4, 6], "nuanc": [1, 3, 4, 5, 6, 7], "acknowledg": [1, 4, 6], "within": [1, 3, 4, 6, 7], "trustworthi": [1, 6], "taught": 1, "u": [1, 3, 4, 6, 7], "where": [1, 3, 4, 5, 6, 7], "der11": 1, "why": [1, 3, 4, 6, 7], "confus": [1, 6], "illus": 1, "disast": [1, 4], "wall": [1, 5], "street": [1, 5], "press": [1, 4, 5], "isbn": [1, 3, 4], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7], "googl": [1, 4, 5, 7], "co": [1, 3, 4, 5, 6], "uk": [1, 6], "id": [1, 4, 5, 6, 7], "lke_cwm4wm8c": 1, "sign": [2, 4, 6], "up": [2, 3, 4, 5, 6], "receiv": [2, 3, 4, 5, 6, 7], "updat": [2, 3, 4, 5, 6, 7], "abstract": [2, 4, 6, 7], "heavili": [2, 3, 4, 6, 7], "gloss": 2, "fundament": [2, 3, 4, 5, 6, 7], "challeng": [2, 3, 4, 5, 6, 7], "convers": [2, 3, 4, 5, 6, 7], "kei": [2, 3, 5, 6, 7], "proven": 2, "yet": [2, 3, 4, 6], "concret": [2, 6, 7], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7], "author": [2, 3, 4, 5, 6, 7], "t": [2, 3, 4, 5, 6, 7], "p": [2, 3, 4, 5, 6, 7], "2024": [2, 3, 4, 6, 7], "journal": [2, 3, 4, 5, 6, 7], "repositori": [2, 3, 4, 5, 6, 7], "valu": [3, 4, 5, 6, 7], "its": [3, 4, 5, 6, 7], "privileg": 3, "abov": [3, 4, 6], "soon": [3, 7], "lose": [3, 4], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7], "3": [3, 4, 5, 7], "5": [3, 4, 5, 7], "2022": [3, 4, 5, 6], "mark": [3, 4, 6], "pivot": [3, 4, 5], "moment": 3, "histori": [3, 4, 5], "artifici": [3, 4, 5, 6], "intellig": [3, 4, 5, 6], "five": [3, 4, 6], "dai": [3, 4, 5, 6, 7], "launch": [3, 4, 6], "attract": [3, 4], "million": [3, 4, 5], "month": [3, 4, 5, 6], "becam": 3, "fastest": [3, 4, 6], "100": [3, 4, 5, 6, 7], "monthli": [3, 4], "rais": [3, 4, 6], "intrigu": 3, "question": [3, 4, 5, 6, 7], "did": [3, 4, 7], "dramat": [3, 4, 5, 7], "predecessor": 3, "gpt": [3, 4, 5, 6, 7], "had": [3, 4], "same": [3, 4, 5, 6, 7], "number": [3, 4, 5, 6, 7], "paramet": [3, 4, 5, 6, 7], "far": [3, 5, 6], "less": [3, 4, 5, 6], "attent": [3, 5], "arguabl": [3, 5], "feedback": [3, 4, 6, 7], "abil": [3, 4, 5, 6, 7], "least": [3, 4, 6], "ey": 3, "breakthrough": [3, 6], "demonstr": [3, 4, 5, 6, 7], "crucial": [3, 5, 6, 7], "greater": [3, 4, 5, 6], "process": [3, 4, 5, 6], "modern": [3, 4, 7], "techniqu": [3, 4, 5], "direct": [3, 4, 5, 6], "rafailov": 3, "et": [3, 4, 5, 6, 7], "al": [3, 4, 5, 6, 7], "present": [3, 4, 5, 6, 7], "autom": [3, 4, 6, 7], "fashion": [3, 7], "open": [3, 4, 6, 7], "sourc": [3, 4, 6, 7], "common": [3, 4, 5, 7], "pre": [3, 4, 5, 6, 7], "default": [3, 4, 5, 6, 7], "state": [3, 4, 5, 6, 7], "art": [3, 4, 6], "object": [3, 4, 5, 6, 7], "given": [3, 4, 5, 6, 7], "webpag": 3, "internet": [3, 4], "veri": [3, 4, 5, 6], "ask": [3, 4, 5, 6, 7], "instruct": [3, 4, 5, 6, 7], "sai": [3, 7], "ouyang": [3, 6], "2": [3, 4, 7], "explain": [3, 4], "moon": 3, "land": [3, 4, 5], "6": [3, 4, 5], "old": [3, 4], "import": [3, 4, 5, 6, 7], "pipelin": [3, 4, 5, 6, 7], "pipe": [3, 6], "text": [3, 4, 5, 6, 7], "gpt2": [3, 4], "msg": 3, "short": [3, 4, 6, 7], "sentenc": [3, 4, 6], "_": [3, 4, 6, 7], "rang": [3, 4, 5, 6, 7], "len": [3, 4, 5, 6, 7], "print": [3, 4, 5, 6, 7], "f": [3, 4, 5, 6, 7], "n": [3, 4, 5, 6, 7], "1": [3, 4, 5, 7], "0": [3, 4, 5, 6, 7], "generated_text": [3, 7], "good": [3, 4, 5, 7], "idea": [3, 5, 6, 7], "one": [3, 4, 5, 6, 7], "those": [3, 4, 6, 7], "littl": [3, 4], "green": [3, 6], "dot": 3, "Then": [3, 4], "line": [3, 4, 5, 6], "later": [3, 4, 5, 6, 7], "re": [3, 4, 5, 6, 7], "alreadi": [3, 4, 7], "movi": 3, "theori": [3, 4], "some": [3, 4, 5, 6, 7], "mean": [3, 4, 5, 6, 7], "word": [3, 4, 7], "tepid": 3, "articl": [3, 4, 5, 6], "sure": [3, 4, 6, 7], "lunar": 3, "As": [3, 4, 5, 6, 7], "see": [3, 4, 5, 6, 7], "coher": [3, 4, 5, 7], "explan": [3, 4, 6, 7], "child": [3, 4, 6], "nonsens": [3, 6], "meander": 3, "unrel": [3, 4, 6], "topic": [3, 4, 5, 6, 7], "simpl": [3, 4, 5, 6, 7], "appropri": [3, 4, 5, 6, 7], "young": [3, 4, 6], "instead": [3, 4, 5, 6, 7], "address": [3, 4, 5, 6, 7], "issu": [3, 4, 6, 7], "introduc": [3, 4, 5, 6, 7], "rlhf": [3, 6, 7], "intent": [3, 6], "wide": [3, 4, 5, 6, 7], "task": [3, 6, 7], "fig": [3, 4, 5, 6, 7], "collect": [3, 4, 5, 6, 7], "sampl": [3, 5, 7], "label": [3, 4, 5, 6, 7], "comparison": 3, "reward": [3, 4, 5, 6], "sever": [3, 4, 5, 6, 7], "rank": [3, 4, 5, 6], "best": [3, 4, 5, 6], "worst": 3, "rm": [3, 5], "reinforc": [3, 4, 5, 6], "write": [3, 4, 5, 6, 7], "stori": [3, 6], "frog": 3, "calcul": [3, 4, 5, 6, 7], "score": [3, 4, 5, 6, 7], "ppo": [3, 5], "proxim": [3, 5], "iter": [3, 4, 5, 6, 7], "accur": [3, 4, 5, 6], "undesir": [3, 6], "simplifi": [3, 4, 5, 7], "view": [3, 4, 6], "show": [3, 4, 5, 6, 7], "progress": [3, 6], "pattern": [3, 4, 5, 6, 7], "ha": [3, 4, 5, 6, 7], "instanc": [3, 4, 5, 6], "directli": [3, 4, 5, 6, 7], "For": [3, 4, 5, 6, 7], "llama": [3, 4, 6, 7], "guard": 3, "team": [3, 4, 5, 7], "8b": [3, 5, 6, 7], "wa": [3, 4, 5, 6, 7], "classif": [3, 4, 5, 6, 7], "bypass": [3, 6], "similarli": [3, 4, 5, 6], "zephyr": 3, "7b": [3, 4, 5, 6, 7], "alpha": [3, 4, 7], "mistral": [3, 7], "publicli": [3, 4, 7], "assist": [3, 4, 5, 6, 7], "paper": [3, 4, 5, 6, 7], "compon": [3, 4, 5], "particular": [3, 4, 5, 6, 7], "foundat": [3, 4, 5, 6], "advanc": [3, 4, 5, 6, 7], "method": [3, 4, 6, 7], "strong": [3, 4, 5, 6, 7], "At": [3, 4, 5, 7], "high": [3, 4, 5, 6, 7], "level": [3, 4, 6, 7], "carefulli": [3, 4, 5, 6, 7], "curat": [3, 4, 5], "purpos": [3, 4, 5, 6, 7], "exhibit": [3, 4, 5, 6], "domain": [3, 4, 5, 6], "emploi": [3, 4, 6, 7], "prove": [3, 4, 6], "particularli": [3, 4, 5, 6, 7], "valuabl": [3, 4, 5, 7], "scenario": [3, 4, 5, 6, 7], "precis": [3, 4, 5, 6, 7], "style": [3, 4], "tone": 3, "expertis": [3, 4, 6], "medic": [3, 4, 5], "legal": [3, 4, 5, 6], "field": [3, 4, 5, 6, 7], "adher": [3, 4, 6, 7], "guidelin": [3, 4, 6], "servic": [3, 4, 5, 6], "standard": [3, 4, 5, 6], "approach": [3, 4, 5, 7], "each": [3, 4, 5, 6, 7], "distinct": [3, 4, 5, 6, 7], "advantag": [3, 4, 5, 6, 7], "weight": [3, 4, 5, 6, 7], "maximum": [3, 4, 5, 6], "lora": [3, 5, 6], "low": [3, 4, 5, 6, 7], "hu": [3, 6, 7], "2021": [3, 4], "small": [3, 4, 5, 7], "matric": 3, "effici": [3, 4, 5, 6, 7], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 6, 7], "combin": [3, 4, 5, 6, 7], "memori": [3, 4, 5, 6], "footprint": [3, 5], "modest": 3, "increas": [3, 4, 5, 6, 7], "likelihood": [3, 4, 6, 7], "obtain": [3, 4, 5, 6, 7], "probabl": [3, 4, 5, 7], "outcom": [3, 4, 6, 7], "hong": [3, 4], "unintend": [3, 6], "suboptim": 3, "seen": [3, 4, 6], "research": [3, 4, 5], "maxim": [3, 4], "shown": [3, 4, 5, 6], "alon": [3, 4, 5, 6], "gain": [3, 4, 5, 6], "achiev": [3, 4, 5, 6, 7], "bai": [3, 4, 6], "touvron": [3, 5], "sinc": [3, 4, 5, 6, 7], "main": [3, 4, 5, 6, 7], "categori": [3, 4, 5, 6, 7], "algorithm": [3, 4, 6], "meanwhil": [3, 5], "superior": [3, 4, 6], "benchmark": 3, "xu": [3, 4, 5, 6], "schulman": [3, 6], "2017": [3, 4], "popular": [3, 5, 7], "understood": 3, "set": [3, 4, 5, 6, 7], "rule": [3, 4, 5, 7], "govern": [3, 4], "reflect": [3, 4, 5, 6], "anoth": [3, 4, 5, 6], "adjust": [3, 4, 5, 6, 7], "One": [3, 4, 5, 6, 7], "strength": [3, 4, 5, 6], "2024c": [3, 5], "real": [3, 4, 5, 6, 7], "noisi": 3, "delai": [3, 4, 5, 6], "subsequ": [3, 7], "situat": [3, 4, 6], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7], "stabl": [3, 4], "prevent": [3, 4, 6, 7], "overreact": 3, "converg": 3, "due": [3, 4, 5, 6], "simplic": [3, 5], "award": [3, 4], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 6, 7], "4": [3, 4, 5, 7], "fit": [3, 4, 6, 7], "pair": [3, 4, 6], "rl": [3, 6], "find": [3, 4, 5, 6, 7], "contrast": [3, 4, 5, 6, 7], "satisfi": [3, 4], "implicit": [3, 4, 6], "whose": [3, 4], "correspond": [3, 4, 7], "extract": [3, 4, 5, 6, 7], "close": [3, 4, 5, 6], "compar": [3, 4, 5, 6], "assign": [3, 4, 5, 6, 7], "higher": [3, 4, 5, 7], "kl": [3, 5], "diverg": [3, 5], "origin": [3, 4, 5, 6, 7], "preserv": [3, 5, 6, 7], "defin": [3, 4, 5, 6, 7], "equat": 3, "mathcal": 3, "l": [3, 4], "pi_": 3, "theta": [3, 7], "ref": 3, "mathbb": [3, 7], "x": [3, 4, 5, 6, 7], "y_w": 3, "y_l": 3, "sim": [3, 7], "left": [3, 5], "log": [3, 4, 5], "beta": [3, 4, 6, 7], "underbrac": 3, "frac": [3, 5, 6], "color": [3, 4], "red": 3, "right": [3, 4, 5, 6], "respect": [3, 4, 5, 6], "deviat": [3, 4, 5, 6], "straightforward": [3, 4, 5, 6, 7], "librari": [3, 4, 5, 6, 7], "huggingfac": [3, 4, 5, 6], "trl": [3, 5, 6], "2024d": [3, 5], "suit": [3, 4, 6], "friendli": [3, 4, 5], "interfac": [3, 4, 5, 6, 7], "featur": [3, 4, 5, 6, 7], "distinguish": [3, 4, 6], "scalabl": [3, 4, 6], "doe": [3, 4, 5, 6, 7], "pretrain": [3, 4, 5], "hou": [3, 4, 5], "poor": [3, 4, 6], "return": [3, 4, 5, 6, 7], "addit": [3, 4, 5, 6, 7], "benefit": [3, 4, 5, 6, 7], "fix": [3, 4, 5, 6], "invers": 3, "trend": [3, 4, 6], "util": [3, 4, 5, 6], "rapid": [3, 4, 5, 6], "yield": [3, 4], "onli": [3, 4, 5, 6, 7], "margin": [3, 4, 6, 7], "capit": [3, 4, 7], "inaccuraci": [3, 4], "nois": 3, "dure": [3, 4, 5, 6, 7], "accuraci": [3, 4, 5, 6, 7], "lag": [3, 4, 6], "significantli": [3, 4, 5, 6], "indic": [3, 4, 5, 6, 7], "signal": [3, 6], "plateau": 3, "sophist": [3, 4, 5, 6], "previou": [3, 4, 5, 7], "deriv": [3, 4, 5], "pairwis": [3, 4], "feng": [3, 6], "substanti": [3, 4, 5, 6], "wors": [3, 5, 7], "influenc": [3, 4, 6, 7], "success": [3, 4, 5, 6, 7], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 5, 6], "loss": [3, 4, 5, 6], "gradient": [3, 4, 6], "dispref": 3, "unbalanc": 3, "trajectori": 3, "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6], "forward": [3, 4, 6], "futur": [3, 4, 5, 6], "phenomenon": [3, 6, 7], "degrad": [3, 4, 5, 6, 7], "danger": [3, 5, 6], "loop": [3, 4, 5, 6], "recurs": 3, "kazdan": 3, "qualiti": [3, 4, 5, 6, 7], "pollut": 3, "replac": [3, 4, 5], "amplif": 3, "reduct": [3, 4, 5], "express": [3, 4, 6, 7], "catastroph": [3, 6], "forget": [3, 7], "previous": [3, 4, 6, 7], "mitig": [3, 4, 5, 6, 7], "mix": [3, 4, 6, 7], "metric": [3, 5, 6], "sz\u00e9p": 3, "regular": [3, 4, 5, 6, 7], "relev": [3, 4, 5, 6], "scarc": 3, "behaviour": 3, "strateg": [3, 4, 5, 6, 7], "compli": [3, 4, 5, 6, 7], "modif": [3, 4, 5, 6], "outsid": [3, 4], "evidenc": 3, "landmark": 3, "askel": [3, 4, 6], "2024a": [3, 5, 7], "dec": 3, "explicitli": [3, 4, 5], "so": [3, 4, 6, 7], "might": [3, 4, 5, 6, 7], "pretend": 3, "adopt": [3, 4, 5, 6, 7], "actual": [3, 4, 5, 6, 7], "onc": [3, 4, 5, 6], "complet": [3, 4, 5, 6, 7], "describ": [3, 4, 5, 6], "harmless": [3, 6], "told": 3, "retrain": [3, 5], "queri": [3, 4], "tier": [3, 4, 6], "paid": [3, 4], "column": [3, 4, 6], "condit": [3, 4, 7], "toxic": [3, 5, 6], "excerpt": [3, 4, 5], "scratchpad": 3, "refus": [3, 6, 7], "happen": [3, 6], "bomb": [3, 6], "engag": [3, 4, 5, 6, 7], "intern": [3, 4, 6], "unmonitor": 3, "longer": [3, 4, 5], "believ": [3, 4, 5, 6, 7], "act": [3, 4, 5, 6, 7], "therebi": [3, 4], "reveal": [3, 4, 5, 6], "complianc": [3, 4, 5, 6], "phase": [3, 4, 5, 7], "natur": [3, 4, 5, 6, 7], "evid": [3, 4, 5, 6, 7], "seemingli": 3, "surpris": 3, "appear": [3, 4, 6, 7], "criteria": [3, 4, 6], "underli": [3, 4, 6, 7], "anim": [3, 6], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 4, 5, 6, 7], "explicit": [3, 4, 5, 6, 7], "chain": [3, 4], "thought": [3, 4, 5, 7], "opaqu": 3, "aris": [3, 4, 6], "opu": 3, "sonnet": [3, 4, 5], "wherea": [3, 4], "haiku": [3, 6], "persist": 3, "resist": [3, 4], "embed": [3, 4, 5], "doesn": [3, 4, 5, 7], "anti": [3, 4], "lab": 3, "exfiltr": [3, 6], "protect": [3, 4, 5, 6], "Not": [3, 4, 6], "malici": [3, 4, 6], "support": [3, 4, 6, 7], "concern": [3, 4, 5, 6], "mechan": [3, 4, 5, 6, 7], "insuffici": [3, 4], "don": [3, 4, 7], "concerningli": 3, "call": [3, 4, 5, 6, 7], "detect": [3, 4, 6, 7], "decept": [3, 4, 6], "warrant": [3, 6], "deeper": [3, 4], "scrutini": [3, 4, 6], "reli": [3, 4, 6, 7], "cross": [3, 4, 5, 6], "circular": 3, "bia": [3, 4, 6, 7], "truli": [3, 4, 5], "trust": [3, 4, 6, 7], "referenti": 3, "ly": 3, "hood": [3, 7], "deep": [3, 4, 6, 7], "mechanist": 3, "drive": [3, 6, 7], "correl": [3, 4, 5], "miss": [3, 4, 6], "confound": 3, "factor": [3, 4, 5, 7], "establish": [3, 4, 5, 6], "attempt": [3, 4, 6, 7], "causal": [3, 4], "heavi": 3, "relianc": [3, 4, 6], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 4], "henc": [3, 4, 5, 6, 7], "agenc": [3, 4, 6], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 4], "failur": [3, 4, 6], "mode": [3, 5, 6], "map": [3, 4, 5, 7], "cleanli": 3, "analogi": 3, "excel": [3, 4, 5, 6, 7], "review": [3, 4, 5, 6, 7], "prof": 3, "jacob": [3, 4, 5, 6], "andrea": [3, 4, 6], "yoshua": [3, 6], "bengio": [3, 6], "jasjeet": 3, "sekhon": 3, "rohin": 3, "shah": 3, "2024b": [3, 5, 7], "assum": [3, 4, 6], "acm": [3, 6], "inc": [3, 4, 7], "dedic": [3, 4, 5, 6], "democrat": [3, 4, 7], "educ": [3, 4, 6], "k": [3, 4, 6, 7], "12": [3, 4, 5, 6], "name": [3, 4, 5, 6, 7], "smolk": 3, "ll": [3, 4, 5], "walk": 3, "measur": [3, 4, 5, 6], "huggingfacetb": [3, 7], "360m": [3, 4, 5], "compact": [3, 4, 5, 6], "part": [3, 4, 6, 7], "famili": [3, 6, 7], "publish": [3, 6, 7], "api": [3, 4, 5, 7], "local": [3, 4, 6, 7], "infer": [3, 4, 5, 6, 7], "remot": [3, 4], "load": [3, 4, 5, 6, 7], "store": [3, 4, 6], "eventu": [3, 4, 5], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 6], "worth": [3, 4, 5, 7], "choic": [3, 4, 5, 6, 7], "lightweight": [3, 4, 5, 7], "suitabl": [3, 4, 6], "devic": [3, 4, 5, 7], "Its": [3, 4, 5], "candid": [3, 4, 5], "said": [3, 4, 6], "necessarili": [3, 4, 5, 6], "par": [3, 4], "mind": [3, 4, 5, 6, 7], "factual": [3, 4, 5, 6], "inconsist": [3, 4, 6], "guardrail": [3, 6], "articul": 3, "uphold": [3, 6], "employe": [3, 4], "stakehold": [3, 4, 6], "expect": [3, 4, 5, 6, 7], "regard": [3, 4, 5, 6], "ethic": [3, 4, 5, 6], "conduct": [3, 4], "social": [3, 4, 6], "mission": [3, 6], "vision": [3, 4, 5, 6], "cultur": [3, 4, 5, 6], "account": [3, 4, 6], "codifi": 3, "mlcommon": 3, "vidgen": [3, 6], "encompass": [3, 6, 7], "seven": 3, "hazard": [3, 4, 6], "violent": [3, 6], "crime": [3, 6], "sex": [3, 6], "relat": [3, 4, 5, 6, 7], "sexual": [3, 6], "exploit": [3, 4, 6], "indiscrimin": [3, 6], "weapon": [3, 6], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 4], "explos": [3, 6], "cbrne": 3, "suicid": [3, 6], "hate": [3, 6], "speech": [3, 6], "below": [3, 4, 5, 6, 7], "markdown": [3, 4, 5, 6], "written": [3, 4], "english": 3, "o": [3, 4, 6, 7], "ipython": [3, 4, 6], "displai": [3, 4, 6, 7], "def": [3, 4, 6, 7], "load_polici": 3, "policy_path": 3, "path": [3, 4, 5, 6], "join": [3, 4, 6], "genai_polici": 3, "md": [3, 4, 5, 6, 7], "r": [3, 4, 5, 6, 7], "policy_cont": 3, "classroom": [3, 6], "accept": [3, 4, 5, 6], "unaccept": [3, 5], "ag": [3, 4, 6], "subject": [3, 4, 5], "posit": [3, 4, 5, 6, 7], "confid": [3, 4], "inclus": [3, 4, 6, 7], "celebr": 3, "definit": [3, 4, 7], "creativ": [3, 4, 5, 7], "math": [3, 4, 5], "tip": [3, 6], "digit": [3, 4], "literaci": 3, "onlin": [3, 4, 5, 6, 7], "histor": [3, 4], "violenc": [3, 6], "physic": [3, 4, 6], "fight": [3, 6], "crimin": [3, 6], "illeg": [3, 6], "glorifi": [3, 6], "person": [3, 4, 5, 6, 7], "eat": [3, 6], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 4, 6], "discriminatori": [3, 6], "bulli": [3, 6], "harass": [3, 4, 6], "target": [3, 4, 5, 6, 7], "group": [3, 4, 5, 6], "religi": [3, 5, 6], "racial": [3, 4, 6], "ethnic": [3, 6], "gender": [3, 4, 6], "discrimin": [3, 4, 6], "adult": [3, 6], "profan": [3, 6], "relationship": [3, 4], "substanc": [3, 4], "drug": [3, 6], "gambl": 3, "bet": 3, "protocol": [3, 4, 6], "redirect": 3, "alert": 3, "record": [3, 4, 5, 6], "audit": [3, 4], "teacher": [3, 6], "parent": [3, 6], "continu": [3, 4, 5, 6, 7], "construct": [3, 4, 5, 6, 7], "compliant": [3, 6], "violat": [3, 4, 6], "intens": [3, 4, 7], "demand": [3, 4, 5, 6, 7], "especi": [3, 4, 5, 6, 7], "dong": [3, 4, 6], "There": [3, 4, 5, 6, 7], "rlaif": [3, 6], "give": [3, 4, 6], "rise": [3, 6], "kim": [3, 4, 6], "meta": [3, 4, 5, 6], "wu": [3, 4, 6, 7], "scheme": [3, 5], "inspir": [3, 6], "schema": [3, 7], "row": [3, 4, 6], "match": [3, 4, 5, 6, 7], "boundari": [3, 4, 6], "craft": [3, 4, 6, 7], "elicit": [3, 6, 7], "unalign": 3, "panda": [3, 4, 6], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 4, 6], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 5], "class": [3, 4, 6, 7], "userpromptgener": 3, "pd": [3, 4, 6], "pydant": [3, 4, 6, 7], "basemodel": [3, 4, 6, 7], "time": [3, 4, 5, 6, 7], "type": [3, 4, 5, 6, 7], "dotenv": [3, 4, 6, 7], "load_dotenv": [3, 4, 6, 7], "environ": [3, 4, 5, 6, 7], "variabl": [3, 4, 6, 7], "overrid": [3, 6, 7], "userprompt": 3, "user_prompt": 3, "str": [3, 4, 6, 7], "__init__": [3, 6, 7], "4o": [3, 4, 5, 6, 7], "mini": [3, 4, 5, 6, 7], "client": [3, 4, 5, 6, 7], "_generate_prompt": 3, "batch": [3, 4, 5], "system_prompt": [3, 6], "pars": [3, 4, 6, 7], "messag": [3, 4, 5, 6, 7], "response_format": [3, 4, 6, 7], "except": [3, 4, 6, 7], "generate_prompt": 3, "num_prompt": [3, 5], "int": [3, 4, 6], "save_to_csv": 3, "multipl": [3, 4, 5, 6, 7], "arg": [3, 4, 6, 7], "option": [3, 4, 5, 6, 7], "filepath": 3, "save": [3, 4, 5, 6], "datafram": [3, 4, 6], "all_prompt": 3, "sleep": 3, "enclos": [3, 6], "quot": [3, 4, 5], "startswith": [3, 6], "els": [3, 4, 6], "df": [3, 4, 6], "to_csv": [3, 6], "index": [3, 4, 5, 6, 7], "fals": [3, 4, 5, 6, 7], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 6], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7], "secur": [3, 4, 6, 7], "breach": [3, 4, 6], "manipul": [3, 4, 5, 6, 7], "10": [3, 4, 5, 6, 7], "to_markdown": [3, 6], "me": [3, 6, 7], "hurt": 3, "someon": 3, "caught": [3, 6], "plan": [3, 4, 5, 7], "cheat": 3, "fire": [3, 4], "household": 3, "item": [3, 4, 6], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 6], "7": [3, 4, 5, 6], "8": [3, 4, 5, 6], "teach": [3, 7], "my": [3, 5, 6, 7], "monei": [3, 4], "video": [3, 4, 5, 6], "game": [3, 4, 5], "9": [3, 4, 5, 6], "skip": [3, 6, 7], "troubl": [3, 6], "responsegener": 3, "properli": [3, 4, 7], "hug": [3, 4, 5, 6], "instanti": [3, 4], "otherwis": [3, 4, 6], "connect": [3, 4, 5, 7], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 7], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 4, 5], "enhanc": [3, 4, 5, 6, 7], "visit": [3, 4], "ui": [3, 4, 7], "click": [3, 5], "select": [3, 4, 5, 7], "choos": [3, 4, 5], "cpu": [3, 5], "gpu": [3, 5], "configur": [3, 4, 5, 6], "meaning": [3, 4, 7], "region": [3, 4], "closest": [3, 4, 5], "your": [3, 4, 5, 6, 7], "locat": [3, 4, 5, 6], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 4, 5, 6], "generate_respons": [3, 4, 7], "prompts_df": 3, "remov": [3, 4, 5], "strip": [3, 4, 7], "elif": 3, "chat_complet": 3, "max_token": [3, 4], "seed": [3, 6], "42": [3, 4, 5, 6], "append": [3, 4, 6, 7], "results_df": [3, 6], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 6], "iloc": 3, "tolist": [3, 6], "parallelevalu": 3, "taming_util": [3, 6], "modul": [3, 4, 7], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 4, 5, 7], "gladli": 3, "constitut": [3, 4], "would": [3, 4, 5, 6, 7], "dtype": [3, 4, 6], "80": [3, 4], "absolut": [3, 4, 7], "materi": [3, 4, 5, 6, 7], "plastic": 3, "food": 3, "lid": 3, "cut": [3, 4], "swath": 3, "wood": [3, 4], "squar": 3, "rectangular": 3, "piec": 3, "place": [3, 4, 5, 6, 7], "insid": [3, 4, 6], "inch": 3, "inspect": [3, 4], "off": [3, 4, 5, 6, 7], "demolit": 3, "scissor": 3, "smash": 3, "smooth": [3, 5], "arrang": [3, 4], "c": [3, 4, 5, 7], "shape": [3, 6, 7], "top": [3, 4, 5, 7], "tuck": 3, "catch": [3, 6], "hook": 3, "solid": 3, "side": [3, 4], "round": [3, 4, 6], "edg": [3, 4, 5, 6], "separ": [3, 4, 5, 6], "process_aligned_respons": 3, "strictli": [3, 7], "bound": [3, 4], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 4, 7], "enforc": [3, 4, 6, 7], "dictionari": [3, 4, 6, 7], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 7], "processor": [3, 5, 7], "api_kei": [3, 4, 6], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7], "json": [3, 4, 5, 6], "fri": 3, "su": [3, 5], "quote_al": 3, "fall": [3, 4, 5, 6], "deem": [3, 4, 6], "pertain": [3, 4], "generate_dpo_dataset": 3, "push": [3, 4], "hub": [3, 4, 5], "repo_id": [3, 5], "push_to_hub": [3, 4], "dpo_dataset": 3, "merg": [3, 6], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5], "axi": [3, 4], "drop": [3, 4, 6], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "interest": [3, 4, 5, 6, 7], "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 4, 5], "parquet": 3, "arrow": 3, "00": [3, 4, 5], "153": [3, 4], "33ba": 3, "upload": [3, 4], "shard": 3, "02": 3, "35": [3, 4, 5], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 4], "nstep": 3, "n2": [3, 4], "n3": [3, 4], "n4": [3, 4], "n5": [3, 4], "n6": 3, "n7": 3, "n8": [3, 4], "n9": [3, 4], "n10": [3, 4], "nnext": 3, "nthe": [3, 4], "singl": [3, 4, 5, 6, 7], "48gb": 3, "a100": 3, "took": 3, "few": [3, 4, 5, 6, 7], "minut": 3, "torch": [3, 7], "h4": [3, 6], "honest": [3, 4], "ultrafeedback": [3, 6], "binar": [3, 6], "lib": [3, 6], "ultrafeedback_binar": [3, 6], "honesti": [3, 6], "dimens": [3, 4, 5, 6], "blend": [3, 5], "automodelforcausallm": [3, 7], "autotoken": [3, 7], "load_dataset": [3, 5, 6], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 4, 5, 6], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 6], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": 3, "is_avail": 3, "mp": 3, "from_pretrain": [3, 5, 7], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 7], "float32": 3, "config": [3, 4, 5, 6], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 4, 5], "learning_r": [3, 5], "determin": [3, 4, 5, 6, 7], "aggress": [3, 4, 5, 6], "empir": 3, "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 4, 6], "gradual": 3, "decreas": [3, 4, 7], "accumul": [3, 4], "v": [3, 7], "16": [3, 4, 5, 6], "per_device_train_batch_s": 3, "simul": [3, 4, 6, 7], "gradient_accumulation_step": 3, "strongli": [3, 7], "lower": [3, 4, 5, 6, 7], "conserv": [3, 6], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 4, 5, 6], "suffic": 3, "20": [3, 4, 5, 6, 7], "warmup_step": 3, "stop": [3, 4, 5], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 6, 7], "200": [3, 4, 5, 6], "50": [3, 4, 5, 6, 7], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 4, 7], "pathlib": [3, 6], "config_path": 3, "safe_load": [3, 4], "runtim": [3, 5, 7], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 4], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 4, 5, 6, 7], "max_prompt_length": [3, 5], "1024": 3, "max_length": [3, 4, 7], "1536": 3, "sent": [3, 5, 6], "plot": [3, 4], "move": [3, 4, 5, 6], "averag": [3, 4, 5, 7], "visual": [3, 4, 5, 6], "quick": [3, 4, 5, 6], "150": [3, 4], "curv": 3, "reach": [3, 4, 5, 6, 7], "obviou": 3, "suffici": [3, 4, 7], "save_model": 3, "hf_token": 3, "tag": [3, 6], "congratul": 3, "successfulli": [3, 4, 6, 7], "card": [3, 4, 6], "newli": [3, 4], "qualit": [3, 4, 6], "assess": [3, 4, 5, 6], "rigor": [3, 4, 5, 6], "quantit": [3, 4], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 4, 5], "aligned_output": 3, "pleas": [3, 4, 5, 6], "gram": [3, 4], "tnt": 3, "highli": [3, 4, 5, 6, 7], "regul": [3, 4, 5, 6], "law": [3, 4, 5, 6], "degre": [3, 4, 7], "mishandl": 3, "countri": [3, 4], "seriou": [3, 4, 6], "imprison": 3, "death": 3, "variou": [3, 4, 5, 6, 7], "nation": [3, 6], "dictat": 3, "stark": [3, 4], "readili": [3, 4], "cite": 3, "regulatori": [3, 4, 5, 6], "anecdot": [3, 6], "systemat": [3, 4, 5, 6, 7], "quantifi": [3, 4, 5, 6], "f1": [3, 4, 6], "experienc": [3, 4], "expert": [3, 4, 5, 6, 7], "addition": [3, 4, 5, 6], "vari": [3, 4, 5, 6, 7], "interpret": [3, 4, 5, 6], "judg": [3, 4], "summar": [3, 4, 5], "three": [3, 4, 5, 6], "togeth": [3, 5, 6], "entri": [3, 4, 5], "somewhat": 3, "databas": [3, 4, 7], "distribut": [3, 4, 5, 6, 7], "static": [3, 6, 7], "k12": 3, "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 6], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 4, 6], "lambda": [3, 6], "prompts_ev": 3, "to_list": 3, "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 4], "heurist": 3, "charact": [3, 4, 5, 6, 7], "minimum": [3, 4, 5], "min_response_length": 3, "filter": [3, 4, 5, 7], "string": [3, 4, 6, 7], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 6], "punish": 3, "unit": [3, 4, 6, 7], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 6], "respond": [3, 4, 6, 7], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 4, 6], "safetyscor": [3, 6], "float": [3, 4, 5, 6, 7], "valueerror": [3, 7], "empti": [3, 7], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 4, 5, 6, 7], "emphasi": [3, 4], "base_ev": 3, "zip": [3, 4, 7], "aligned_ev": 3, "injuri": [3, 4], "base_scor": 3, "eval": [3, 5], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 5, 6], "evals_df_result": 3, "h": [3, 4, 5, 6], "identifi": [3, 4, 5, 6, 7], "requ": 3, "statist": [3, 4, 6], "naiv": [3, 7], "score_map": 3, "count": [3, 4, 5, 6], "percentag": [3, 4, 6], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 6], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 4, 5, 6, 7], "md_tabl": 3, "335": [3, 4], "99": [3, 5, 6], "281": [3, 4], "83": [3, 4, 6], "14": [3, 4, 5, 6, 7], "43": [3, 4, 5, 6], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 4, 6], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 4], "231": [3, 4], "No": [3, 4, 5, 6, 7], "fell": 3, "partial": [3, 4], "styliz": [3, 6], "wild": 3, "consider": [3, 6, 7], "proof": 3, "taken": [3, 4, 5, 6, 7], "huang": [3, 4, 5, 6], "overal": [3, 4, 5, 6, 7], "annot": [3, 4, 5, 6], "mirror": [3, 4, 6], "inaccur": [3, 4, 6, 7], "consecut": [3, 6], "unrepres": 3, "hao": [3, 4], "accord": [3, 4, 6, 7], "yin": [3, 4, 6], "resembl": 3, "declin": [3, 4], "volatil": [3, 4], "ineffici": [3, 4], "smollm": 3, "rel": [3, 4, 5, 6], "term": [3, 4, 5, 6], "trade": [3, 4, 5, 6, 7], "weigh": 3, "qwen": [3, 5, 7], "remark": [3, 6, 7], "rival": [3, 5], "ultim": [3, 4, 5, 6], "threshold": [3, 4, 5, 6], "chen": [3, 4, 5, 6, 7], "overli": [3, 4, 6, 7], "simpli": [3, 4, 5, 7], "neglect": [3, 4, 6], "themselv": [3, 4, 6], "complementari": 3, "throughput": [3, 5], "screen": [3, 4, 6], "flag": [3, 4, 5, 6], "preliminari": [3, 4], "judgment": [3, 4], "valid": [3, 4, 5, 7], "automat": [3, 4, 5, 6], "composit": [3, 4], "plai": [3, 4, 5, 6, 7], "led": [3, 4, 7], "apologet": 3, "hesit": 3, "benign": [3, 6], "apolog": 3, "inde": 3, "accordingli": [3, 4, 6], "perhap": 3, "creation": [3, 5, 6], "invalu": 3, "hyperparamet": [3, 5, 6], "mention": [3, 4, 6, 7], "optimist": 3, "memor": [3, 4], "generaliz": 3, "abc": [3, 6], "4a": 3, "amanda": [3, 4, 6], "jan": [3, 4, 6], "brauner": [3, 6], "adrian": 3, "colyer": 3, "benjamin": [3, 4, 6], "cullen": [3, 6], "david": [3, 4, 5, 6], "duvenaud": 3, "richard": [3, 4, 6], "ngo": [3, 6], "azalia": 3, "mirhoseini": 3, "catherin": [3, 4, 6], "olsson": [3, 6], "sam": [3, 4, 6], "ringer": 3, "liam": [3, 4, 6], "skirvin": 3, "jess": [3, 4, 6], "smith": [3, 4, 5], "dawn": [3, 4, 6], "song": [3, 4, 6, 7], "william": [3, 4, 5, 6], "saunder": [3, 4], "steinhardt": [3, 4], "asset": [3, 4, 6], "983c85a201a962f": 3, "pdf": [3, 6], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 4, 6], "yuntao": [3, 4, 6], "andi": [3, 4, 6], "jone": [3, 4], "kamal": 3, "ndouss": 3, "anna": [3, 4, 6], "nova": [3, 5], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 6], "ganguli": [3, 4, 6], "tom": [3, 4], "henighan": 3, "nichola": [3, 4], "joseph": [3, 4, 6], "saurav": [3, 6], "kadavath": 3, "jackson": [3, 4, 6], "kernion": [3, 4, 6], "conerli": 3, "sheer": [3, 7], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 4, 6], "hernandez": [3, 4, 6], "tristan": 3, "hume": 3, "scott": [3, 4, 6], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 4], "nanda": 3, "dario": [3, 4], "amodei": [3, 4], "brown": [3, 4], "jack": [3, 4, 6], "clark": 3, "mccandlish": [3, 4], "chri": [3, 4, 6], "olah": 3, "ben": [3, 4, 5, 6], "mann": [3, 6], "jare": [3, 4, 6], "kaplan": [3, 4, 6], "arxiv": [3, 4, 5, 6, 7], "org": [3, 4, 5, 6, 7], "ab": [3, 4, 5, 6, 7], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 4, 6, 7], "mckinnon": 3, "carol": [3, 6], "christoph": [3, 4, 6], "dustin": 3, "eli": [3, 4, 5, 6], "tran": [3, 7], "johnson": 3, "ethan": [3, 4, 6], "perez": [3, 6], "jami": [3, 6], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 4, 6], "landau": 3, "kamil": [3, 4], "lukosuit": 3, "michael": [3, 4, 5, 6, 7], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 4, 5], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 4, 5], "telleen": 3, "lawton": 3, "samuel": [3, 4, 6], "bowman": [3, 4], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 4], "cc": 3, "11": [3, 4, 5, 6], "ccl": [3, 6], "24": [3, 4, 5, 6, 7], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 4, 5, 6, 7], "jiang": [3, 4, 6], "benyou": 3, "wang": [3, 4, 5, 6, 7], "judgement": [3, 4, 6], "2402": [3, 6], "10669": 3, "dphz23": 3, "tim": [3, 6], "artidoro": 3, "pagnoni": 3, "ari": [3, 4, 6], "holtzman": [3, 4], "luke": [3, 4, 6], "zettlemoy": 3, "2305": [3, 4], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 4, 5, 6], "zhifang": 3, "sui": 3, "furu": 3, "wei": [3, 4, 5, 6], "boost": 3, "2410": [3, 6], "06961": 3, "fac24": [3, 4], "huggingfaceh4": [3, 5, 6], "fac4c": 3, "fac4d": [3, 5], "doc": [3, 4, 5, 6, 7], "en": [3, 4, 5, 6, 7], "fqh": 3, "duanyu": 3, "bowen": [3, 4, 5, 6], "qin": [3, 4, 5, 6], "zheng": [3, 4, 5, 6], "wenqiang": 3, "lei": [3, 4, 5, 6], "analyz": [3, 4, 5, 6, 7], "perspect": [3, 6], "2404": [3, 4, 6], "04626": 3, "h44a": 3, "binari": [3, 4, 5, 6], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 4, 6], "tao": [3, 4, 6], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 6], "zhangjun": 3, "zhou": [3, 4, 5, 6], "tang": [3, 4, 5, 6], "2401": [3, 4], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 4, 6], "lee": [3, 4, 5, 6, 7], "jame": [3, 4, 6], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 4], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 4], "yilin": 3, "niu": [3, 7], "zhengxiao": 3, "aohan": 3, "zeng": [3, 6], "xiao": [3, 6], "minli": 3, "hongn": 3, "jie": [3, 4, 6, 7], "yuxiao": 3, "2412": [3, 4, 5, 6], "06000": 3, "hsw": 3, "21": [3, 4, 5], "edward": [3, 4], "j": [3, 4, 5, 6, 7], "yelong": 3, "shen": [3, 4, 6], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 4], "zhu": [3, 4, 5, 6], "yuanzhi": 3, "shean": 3, "lu": [3, 4, 5, 6], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 4, 6], "shane": [3, 4, 6], "gu": [3, 4, 6], "le": [3, 4, 5], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 4, 5, 6], "jiawei": [3, 7], "2210": [3, 6], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 6], "decemb": [3, 4, 6], "9781098129095": 3, "www": [3, 4, 5, 6], "oreilli": 3, "ksd": 3, "rylan": [3, 4], "schaeffer": 3, "apratim": 3, "dei": 3, "matthia": [3, 4], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": 3, "koyejo": 3, "thrive": [3, 4, 7], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 4, 5], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 6], "lawrenc": 3, "sean": [3, 4, 6], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 5], "2407": [3, 4, 5, 6], "21783": [3, 5], "lwx": 3, "lin": [3, 4, 5, 6, 7], "rui": [3, 4, 5, 7], "ruixuan": 3, "junbo": 3, "zhao": [3, 4, 5, 6], "ding": 3, "gang": [3, 4], "haobo": 3, "driven": [3, 4, 5, 6], "survei": [3, 4, 6, 7], "2406": [3, 4, 5, 6], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 4, 6], "diogo": [3, 6], "almeida": [3, 6], "carrol": [3, 6], "wainwright": [3, 6], "pamela": [3, 4, 6], "mishkin": [3, 4, 6], "chong": [3, 6], "sandhini": [3, 6], "agarw": [3, 4, 6], "katarina": [3, 6], "slama": [3, 6], "alex": [3, 4, 5, 6], "rai": [3, 4, 5, 6], "john": [3, 4, 6], "hilton": [3, 4, 5, 6], "fraser": [3, 6], "kelton": 3, "miller": [3, 4], "maddi": [3, 6], "simen": [3, 6], "peter": [3, 4, 5, 6], "welind": [3, 4, 6], "paul": [3, 4, 6], "christiano": [3, 6], "leik": [3, 4, 6], "ryan": [3, 4, 6], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 6], "eric": [3, 4, 5, 6], "mitchel": [3, 5], "stefano": [3, 4], "ermon": [3, 4], "man": [3, 4, 6], "chelsea": [3, 6], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 4, 6], "filip": [3, 6], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 4, 6], "radford": [3, 4, 6], "oleg": [3, 6], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": 3, "smollm2360mi24": 3, "sou24": 3, "html": [3, 7], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 4, 6], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 4, 5], "eisenhart": 3, "roth": [3, 4], "florian": 3, "hinterwimm": 3, "2411": 3, "09539": 3, "tm": [3, 5], "23": [3, 4, 5, 6], "hugo": [3, 5], "loui": [3, 4, 5], "martin": [3, 4, 5, 6], "kevin": [3, 4, 5, 6], "stone": [3, 5], "albert": [3, 5], "amjad": [3, 5], "almahairi": [3, 5], "yasmin": [3, 5], "babaei": [3, 5], "nikolai": [3, 5], "bashlykov": [3, 5], "soumya": [3, 5], "batra": [3, 5], "prajjwal": [3, 5], "bhargava": [3, 5], "shruti": [3, 5], "bhosal": [3, 5], "dan": [3, 4, 5, 6, 7], "bikel": [3, 5], "luka": [3, 5], "blecher": [3, 5], "cristian": [3, 5], "canton": [3, 5], "ferrer": [3, 5], "moya": [3, 5], "guillem": [3, 5], "cucurul": [3, 5], "esiobu": [3, 5], "jude": [3, 5], "fernand": [3, 5], "jeremi": [3, 4, 5], "fu": [3, 5], "wenyin": [3, 5], "brian": [3, 5, 6], "fuller": [3, 5, 6], "cynthia": [3, 5], "gao": [3, 4, 5, 6], "vedanuj": [3, 5], "goswami": [3, 5, 6], "naman": [3, 5], "goyal": [3, 5], "anthoni": [3, 5], "hartshorn": [3, 5], "saghar": [3, 5], "hosseini": [3, 5], "hakan": [3, 5], "inan": [3, 5], "marcin": [3, 5], "karda": [3, 5], "viktor": [3, 5], "kerkez": [3, 5], "madian": [3, 5], "khabsa": [3, 5], "isabel": [3, 5, 6], "kloumann": [3, 5], "artem": [3, 5], "korenev": [3, 5], "punit": [3, 5], "singh": [3, 4, 5], "koura": [3, 5], "mari": [3, 4, 5, 6], "ann": [3, 5, 6], "lachaux": [3, 5], "thibaut": [3, 5], "lavril": [3, 5], "jenya": [3, 5], "diana": [3, 4, 5], "liskovich": [3, 5], "yinghai": [3, 5], "yune": [3, 5], "mao": [3, 5], "xavier": [3, 5], "martinet": [3, 5], "todor": [3, 5, 6], "mihaylov": [3, 5], "pushkar": [3, 5], "mishra": [3, 4, 5], "igor": [3, 4, 5, 6], "molybog": [3, 5], "yixin": [3, 4, 5], "nie": [3, 4, 5], "andrew": [3, 4, 5, 6], "poulton": [3, 5], "reizenstein": [3, 5], "rashi": [3, 5], "rungta": [3, 5], "kalyan": [3, 5], "saladi": [3, 5], "alan": [3, 5, 6], "schelten": [3, 5], "ruan": [3, 5], "silva": [3, 5], "ranjan": [3, 5], "subramanian": [3, 5], "xiaoq": [3, 5], "ellen": [3, 5], "tan": [3, 4, 5], "binh": [3, 5], "ross": [3, 5, 6], "taylor": [3, 5], "adina": [3, 5, 6], "jian": [3, 4, 5], "kuan": [3, 5], "puxin": [3, 5], "yan": [3, 4, 5], "iliyan": [3, 5], "zarov": [3, 5], "yuchen": [3, 4, 5, 6], "angela": [3, 4, 5, 6], "fan": [3, 4, 5], "melani": [3, 5], "kambadur": [3, 5], "sharan": [3, 5], "narang": [3, 5], "aurelien": [3, 5], "rodriguez": [3, 5], "stojnic": [3, 5], "sergei": [3, 5], "edunov": [3, 5], "thoma": [3, 4, 5, 6], "scialom": [3, 5], "2307": [3, 5, 7], "09288": [3, 5], "vaa": [3, 6], "berti": [3, 6], "adarsh": [3, 6], "agraw": [3, 6], "ahm": [3, 6], "victor": [3, 6], "akinwand": [3, 6], "namir": [3, 6], "nuaimi": [3, 6], "najla": [3, 6], "alfaraj": [3, 6], "alhajjar": [3, 6], "aroyo": [3, 6], "trupti": [3, 6], "bavalatti": [3, 6], "max": [3, 4, 6], "bartolo": [3, 6], "borhan": [3, 6], "blili": [3, 6], "hamelin": [3, 6], "kurt": [3, 6], "bollack": [3, 6], "rishi": [3, 4, 5, 6], "bomassani": [3, 6], "marisa": [3, 6], "ferrara": [3, 6], "boston": [3, 6], "sim\u00e9on": [3, 6], "campo": [3, 6], "kal": [3, 6], "chakra": [3, 6], "canyu": [3, 6], "codi": [3, 6], "coleman": [3, 6], "zachari": [3, 4, 6], "delpierr": [3, 6], "coudert": [3, 6], "leon": [3, 6], "derczynski": [3, 6], "debojyoti": [3, 6], "dutta": [3, 6], "ian": [3, 4, 6], "eisenberg": [3, 6], "ezick": [3, 6], "heather": [3, 6], "frase": [3, 6], "ram": [3, 5, 6], "gandikota": [3, 6], "agasthya": [3, 6], "gangavarapu": [3, 6], "ananya": [3, 4, 6], "geali": [3, 6], "rajat": [3, 6], "ghosh": [3, 4, 6], "goel": [3, 4, 6], "usman": [3, 6], "gohar": [3, 6], "sujata": [3, 6], "hale": [3, 6], "wiebk": [3, 6], "hutiri": [3, 6], "marvin": [3, 6], "imperi": [3, 6], "surgan": [3, 6], "jandial": [3, 6], "nick": [3, 4, 6], "judd": [3, 6], "felix": [3, 4, 6], "juefei": [3, 6], "fouts": [3, 6], "khomh": [3, 6], "bhavya": [3, 6], "kailkhura": [3, 6], "hannah": [3, 4, 6], "rose": [3, 6], "kirk": [3, 6], "klyman": [3, 6], "knotz": [3, 6], "kuchnik": [3, 6], "shachi": [3, 6], "kumar": [3, 4, 6], "srijan": [3, 6], "lengerich": [3, 6], "bo": [3, 4, 5, 6], "zeyi": [3, 6], "liao": [3, 4, 6], "eileen": [3, 6], "sarah": [3, 4, 6], "luger": [3, 6], "yifan": [3, 4, 6], "priyanka": [3, 6], "mammen": [3, 6], "kelvin": [3, 6], "manyeki": [3, 6], "mcgregor": [3, 6], "virendra": [3, 6], "mehta": [3, 4, 6], "shafe": [3, 6], "moham": [3, 6], "moss": [3, 6], "lama": [3, 6], "nachman": [3, 6], "dinesh": [3, 6], "jinenh": [3, 6], "naganna": [3, 6], "amin": [3, 6], "nikanjam": [3, 6], "besmira": [3, 6], "nushi": [3, 6], "lui": [3, 4, 6], "oala": [3, 6], "iftach": [3, 6], "orr": [3, 4, 6], "alicia": [3, 4, 6], "parrish": [3, 4, 6], "cigdem": [3, 6], "patlak": [3, 6], "pietri": [3, 6], "forough": [3, 6], "poursabzi": [3, 6], "sangdeh": [3, 6], "eleonora": [3, 6], "presani": [3, 6], "fabrizio": [3, 6], "puletti": [3, 6], "r\u00f6ttger": [3, 6], "sahai": [3, 6], "santo": [3, 6], "nino": [3, 6], "scherrer": [3, 6], "alic": [3, 4, 6, 7], "schoenauer": [3, 6], "sebag": [3, 6], "patrick": [3, 6], "schramowski": [3, 6], "abolfazl": [3, 6], "shahbazi": [3, 6], "vin": [3, 6], "xudong": [3, 4, 6], "vamsi": [3, 6], "sistla": [3, 6], "leonard": [3, 6], "testuggin": [3, 6], "vithursan": [3, 6], "thangarasa": [3, 6], "elizabeth": [3, 4, 6], "watkin": [3, 6], "rebecca": [3, 4, 6], "weiss": [3, 6], "welti": [3, 6], "tyler": [3, 4, 6], "wilber": [3, 6], "jean": [3, 6], "poonam": [3, 6], "yadav": [3, 6], "xianjun": [3, 6], "yang": [3, 4, 5, 6, 7], "yi": [3, 4, 6, 7], "wenhui": [3, 6], "fedor": [3, 6], "zhdanov": [3, 6], "jiacheng": [3, 4, 6], "perci": [3, 4, 6], "liang": [3, 4, 6, 7], "mattson": [3, 6], "joaquin": [3, 6], "vanschoren": [3, 6], "v0": [3, 6, 7], "12241": [3, 6], "wyg": 3, "tianhao": [3, 4, 5, 6], "weizh": 3, "yuan": [3, 4, 6], "olga": 3, "golovneva": 3, "jing": [3, 6], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 4, 6], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "xfg": 3, "shusheng": 3, "jiaxuan": 3, "wenji": 3, "ye": [3, 4, 5, 6, 7], "weilin": 3, "zhiyu": [3, 7], "mei": [3, 4, 5], "guangju": 3, "chao": 3, "10719": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": 3, "xie": [3, 4], "mingyuan": 3, "paradigm": [3, 4], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "matter": 4, "beauti": 4, "smart": [4, 6], "agre": 4, "wrong": 4, "feynman": 4, "advent": 4, "shift": 4, "norm": 4, "realm": 4, "convent": [4, 6], "evolut": [4, 5], "conceiv": 4, "entrench": 4, "seem": 4, "daunt": 4, "ignor": 4, "outdat": [4, 6, 7], "inevit": 4, "setback": 4, "imper": 4, "embrac": 4, "proactiv": [4, 6], "mindset": 4, "front": [4, 5], "produc": [4, 5, 6, 7], "novel": [4, 5], "ident": 4, "isn": [4, 6], "bug": 4, "random": [4, 6, 7], "testabl": 4, "exceedingli": 4, "guarante": [4, 5, 6, 7], "primari": [4, 6], "nucleu": 4, "2020": 4, "summari": [4, 5, 6, 7], "alter": 4, "rigid": 4, "wildli": 4, "incoher": 4, "inadequ": [4, 6], "temp": 4, "df_result": 4, "ntemperatur": 4, "40": [4, 5], "temp_respons": 4, "iterrow": [4, 6], "10000": [4, 7], "appl": [4, 7], "txt": [4, 5, 7], "sec_fil": [4, 7], "nsecur": 4, "AND": [4, 7], "exchang": [4, 6, 7], "commiss": [4, 6, 7], "nwashington": 4, "20549": 4, "nform": 4, "annual": [4, 6], "pursuant": 4, "TO": [4, 6], "13": [4, 5, 6], "OR": 4, "OF": [4, 6], "THE": [4, 6], "1934": 4, "nfor": 4, "fiscal": 4, "septemb": 4, "28": [4, 5, 6], "nor": 4, "period": [4, 6], "ncommiss": 4, "001": [4, 5], "36743": 4, "ng66145g66i43": 4, "jpg": 4, "nappl": 4, "exact": [4, 5, 6], "registr": 4, "specifi": [4, 5, 6, 7], "charter": 4, "ncalifornia": 4, "t94": 4, "2404110": 4, "jurisdict": 4, "nof": 4, "incorpor": [4, 5, 6, 7], "employ": 4, "identif": 4, "park": 4, "ncupertino": 4, "california": [4, 6, 7], "n95014": 4, "princip": 4, "offic": [4, 6], "408": 4, "996": 4, "1010": 4, "telephon": 4, "area": [4, 6, 7], "regist": 4, "ntitl": 4, "ttrade": 4, "symbol": 4, "tname": 4, "ncommon": 4, "stock": [4, 7], "00001": 4, "naapl": 4, "tthe": 4, "nasdaq": [4, 7], "llc": [4, 7], "n0": 4, "000": [4, 5, 7], "note": [4, 5, 6, 7], "2025": 4, "875": 4, "625": 4, "2026": 4, "2027": 4, "375": 4, "2029": 4, "050": 4, "2031": [4, 6], "600": 4, "2042": 4, "nindic": 4, "season": 4, "issuer": 4, "405": 4, "nye": 4, "preced": [4, 7], "shorter": 4, "past": [4, 6], "90": [4, 5, 6], "submit": [4, 5, 6], "electron": 4, "232": 4, "acceler": [4, 5, 6], "filer": 4, "growth": 4, "12b": [4, 6], "nlarg": 4, "tacceler": 4, "nnon": 4, "tsmaller": 4, "nemerg": 4, "nif": 4, "elect": [4, 6], "revis": [4, 6], "attest": 4, "404": 4, "sarban": 4, "oxlei": 4, "7262": 4, "firm": [4, 6], "prepar": [4, 5, 6], "correct": [4, 6], "restat": 4, "recoveri": 4, "incent": 4, "compens": 4, "240": 4, "10d": 4, "shell": 4, "aggreg": [4, 6], "vote": 4, "held": [4, 7], "affili": [4, 7], "march": [4, 7], "29": [4, 5, 6, 7], "last": [4, 6, 7], "second": [4, 5, 6], "quarter": 4, "628": [4, 7], "553": [4, 7], "sole": [4, 6], "disclosur": [4, 5, 6], "director": [4, 5, 6], "date": 4, "exclud": 4, "n15": 4, "115": [4, 7], "823": [4, 7], "outstand": [4, 7], "octob": [4, 7], "18": [4, 5, 6, 7], "ndocument": 4, "BY": 4, "nportion": 4, "proxi": [4, 6], "meet": [4, 6, 7], "sharehold": 4, "iii": 4, "120": [4, 6], "ntabl": 4, "npage": 4, "npart": 4, "nitem": 4, "nbusi": 4, "1a": 4, "nrisk": 4, "1b": [4, 5, 6], "nunresolv": 4, "staff": 4, "comment": 4, "n17": 4, "1c": 4, "ncybersecur": 4, "nproperti": 4, "n18": 4, "nlegal": 4, "proceed": [4, 6], "nmine": 4, "ii": [4, 5, 7], "nmarket": 4, "stockhold": 4, "purchas": [4, 6], "n19": 4, "reserv": 4, "n20": 4, "nmanag": 4, "discuss": [4, 5, 6], "n21": 4, "7a": 4, "nquantit": 4, "n27": 4, "nfinanci": 4, "supplementari": 4, "n28": 4, "nchang": 4, "disagr": 4, "n51": 4, "9a": 4, "ncontrol": 4, "procedur": [4, 6], "9b": 4, "nother": 4, "n52": 4, "9c": 4, "ndisclosur": 4, "foreign": 4, "ndirector": 4, "corpor": [4, 6], "nexecut": 4, "ownership": [4, 5], "certain": [4, 6, 7], "benefici": [4, 5], "owner": 4, "ncertain": 4, "transact": [4, 6], "nprincip": 4, "fee": 4, "iv": 4, "nexhibit": 4, "n53": 4, "n56": 4, "nthi": 4, "litig": [4, 5], "reform": 4, "1995": 4, "uncertainti": [4, 5, 6], "event": 4, "macroeconom": 4, "anticip": [4, 6], "caus": [4, 6], "oblig": 4, "nunless": 4, "herein": 4, "calendar": 4, "wholli": 4, "subsidiari": 4, "unless": [4, 5], "ncompani": 4, "manufactur": 4, "smartphon": [4, 5], "tablet": [4, 5], "wearabl": 4, "accessori": 4, "sell": [4, 6], "varieti": [4, 5], "52": [4, 6], "53": [4, 6], "week": 4, "saturdai": 4, "nproduct": 4, "niphon": 4, "io": [4, 7], "iphon": 4, "pro": [4, 5, 6], "se": [4, 6], "nmac": 4, "maco": [4, 5], "mac": [4, 5], "laptop": 4, "macbook": 4, "air": 4, "desktop": [4, 5], "imac": 4, "studio": 4, "nipad": 4, "multipurpos": 4, "ipado": 4, "ipad": 4, "nwearabl": 4, "home": 4, "smartwatch": 4, "wireless": 4, "headphon": 4, "spatial": 4, "watcho": 4, "watch": 4, "ultra": 4, "airpod": 4, "beat": [4, 5], "visiono": 4, "nhome": 4, "tv": 4, "stream": [4, 5, 7], "tvo": 4, "homepod": 4, "fidel": [4, 7], "naccessori": 4, "brand": 4, "third": [4, 5, 6], "parti": [4, 5, 6], "nservic": 4, "nadvertis": 4, "advertis": 4, "licens": 4, "napplecar": 4, "portfolio": 4, "applecar": 4, "prioriti": [4, 5], "network": [4, 5, 7], "repair": 4, "coverag": [4, 6], "accident": 4, "damag": [4, 6], "theft": [4, 6], "ncloud": 4, "ndigit": 4, "app": [4, 5], "discov": [4, 5, 6], "download": [4, 5], "music": 4, "podcast": 4, "subscript": [4, 5], "arcad": 4, "sm": 4, "listen": [4, 5], "radio": 4, "station": 4, "magazin": 4, "exclus": 4, "sport": 4, "npayment": 4, "payment": 4, "credit": 4, "pai": [4, 5], "cashless": 4, "nsegment": 4, "primarili": [4, 6], "geograph": [4, 6], "basi": [4, 5], "segment": [4, 7], "america": 4, "europ": 4, "china": [4, 6], "japan": 4, "rest": [4, 5], "asia": 4, "pacif": 4, "north": [4, 6], "south": 4, "european": [4, 6], "india": 4, "middl": [4, 5, 6], "east": 4, "africa": 4, "mainland": 4, "kong": 4, "taiwan": 4, "australia": 4, "asian": 4, "although": [4, 5], "partner": [4, 5, 6], "mid": 4, "enterpris": [4, 5, 6, 7], "resel": 4, "retail": 4, "sale": 4, "indirect": 4, "channel": [4, 6], "cellular": 4, "carrier": 4, "net": [4, 7], "38": [4, 5, 6], "62": [4, 5], "ncompetit": 4, "competit": [4, 5, 6], "character": [4, 6], "price": [4, 5], "downward": 4, "pressur": [4, 6], "gross": [4, 6], "cycl": [4, 6], "industri": [4, 5, 6, 7], "characterist": [4, 5, 6, 7], "competitor": [4, 5, 6], "compet": [4, 5], "imit": 4, "infring": [4, 5], "intellectu": [4, 5, 6], "innov": [4, 5, 6], "marketplac": [4, 6], "nearli": [4, 5], "reput": [4, 6], "expand": [4, 5, 6], "opportun": 4, "broader": [4, 5, 6], "illegitim": [4, 6], "collabor": [4, 5, 6], "nsuppli": 4, "nalthough": 4, "essenti": [4, 5, 6, 7], "particip": 4, "shortag": 4, "commod": [4, 5], "fluctuat": 4, "commonli": 4, "capac": [4, 5], "until": [4, 6, 7], "supplier": 4, "matur": 4, "concentr": 4, "enter": [4, 7], "agreement": 4, "suppli": [4, 7], "renew": 4, "nresearch": 4, "nbecaus": 4, "upon": [4, 6], "flow": [4, 7], "acquisit": [4, 6], "nintellectu": 4, "broad": [4, 5, 7], "patent": 4, "copyright": [4, 5], "trademark": 4, "secret": 4, "differenti": 4, "skill": [4, 6], "personnel": 4, "regularli": 4, "pursu": [4, 6], "thousand": [4, 5], "durat": 4, "adequ": [4, 6], "nin": 4, "holidai": [4, 6], "fill": 4, "inventori": 4, "older": [4, 5], "newer": 4, "distributor": 4, "nhuman": 4, "strive": 4, "retain": [4, 5, 6], "talent": 4, "member": [4, 6], "164": 4, "equival": [4, 5], "ncompens": 4, "equit": 4, "succe": 4, "health": [4, 6], "awai": [4, 6], "ngrowth": 4, "career": 4, "leadership": [4, 6], "nworkplac": 4, "polici": [4, 5], "equal": [4, 6], "workplac": 4, "ninclus": 4, "sustain": [4, 5, 6], "workforc": 4, "nengag": 4, "among": [4, 5, 6, 7], "gaug": 4, "sentiment": [4, 5, 7], "nhealth": 4, "everywher": 4, "crisi": 4, "visitor": 4, "navail": 4, "quarterli": 4, "q": [4, 5, 6], "amend": 4, "sec": [4, 7], "Such": [4, 6], "charg": 4, "investor": [4, 7], "aspx": 4, "websit": [4, 5, 6], "environment": [4, 6], "referenc": 4, "inact": 4, "textual": 4, "unknown": [4, 6], "advers": 4, "conjunct": 4, "consolid": 4, "accompani": [4, 6], "nmacroeconom": 4, "econom": 4, "facil": 4, "assembli": 4, "site": 4, "nadvers": 4, "slow": 4, "recess": 4, "unemploy": 4, "inflat": 4, "tighter": 4, "currenc": 4, "spend": 4, "monetari": 4, "contract": [4, 5], "logist": 4, "instabl": [4, 6], "inabl": 4, "financ": [4, 5, 6], "insolv": 4, "counterparti": 4, "debt": 4, "liquid": 4, "fair": [4, 6], "instrument": 4, "polit": [4, 6], "disput": 4, "geopolit": 4, "tension": [4, 6], "terror": 4, "accid": 4, "interrupt": 4, "npolit": 4, "whole": 4, "outsourc": 4, "korea": 4, "vietnam": 4, "restrict": [4, 5, 6, 7], "tariff": 4, "export": 4, "portion": [4, 5], "revenu": [4, 7], "raw": [4, 5, 6, 7], "restructur": 4, "ceas": 4, "disrupt": 4, "escal": [4, 6], "nmani": 4, "prone": [4, 6], "earthquak": 4, "climat": 4, "weather": 4, "plant": 4, "terrorist": [4, 6], "attack": [4, 6], "hostil": 4, "ransomwar": 4, "cybersecur": [4, 6], "labor": 4, "beyond": [4, 6], "nsuch": 4, "imposs": [4, 5], "slowdown": 4, "outag": 4, "neg": [4, 6, 7], "pandem": 4, "covid": 4, "19": [4, 5, 6], "economi": 4, "imposit": 4, "stringent": [4, 5, 6], "travel": 4, "freight": 4, "movement": 4, "ramp": 4, "nfollow": 4, "expenditur": 4, "resum": 4, "exacerb": 4, "insur": 4, "nglobal": 4, "unabl": 4, "assur": [4, 6], "minor": [4, 6], "naddition": 4, "intensifi": 4, "seamlessli": 4, "nto": 4, "stimul": 4, "ndue": 4, "upgrad": 4, "quantiti": 4, "defect": 4, "defici": 4, "supersed": 4, "nsubstanti": 4, "transport": 4, "provis": 4, "reimburs": 4, "warranti": 4, "unanticip": 4, "liabil": 4, "final": [4, 6, 7], "finish": [4, 6], "destin": 4, "made": [4, 5, 7], "prepay": 4, "termin": [4, 5], "recover": 4, "exposur": [4, 6], "nfutur": 4, "semiconductor": 4, "suffer": [4, 6], "constrain": [4, 5, 7], "shipment": 4, "unexpectedli": 4, "interfer": 4, "unsaf": [4, 6], "expos": [4, 6], "widespread": [4, 6], "vulner": [4, 6], "compromis": [4, 5, 6], "claim": [4, 5, 6], "intang": 4, "fine": [4, 6], "lost": [4, 6], "cancel": 4, "obsolet": 4, "exce": [4, 6], "realiz": 4, "accru": 4, "excess": 4, "impair": 4, "whenev": 4, "circumst": 4, "amount": [4, 6, 7], "carri": [4, 5, 7], "incur": 4, "unpredict": [4, 6], "pace": [4, 6], "obsolesc": 4, "forecast": [4, 6], "incorrectli": [4, 6, 7], "extens": [4, 5, 7], "issuanc": 4, "unknowingli": [4, 6], "notifi": 4, "preclud": 4, "bui": 4, "percept": 4, "android": 4, "playstat": 4, "nintendo": 4, "xbox": 4, "inclin": 4, "devot": 4, "compel": [4, 5, 7], "dissatisfi": 4, "vast": [4, 6], "storefront": 4, "safari": 4, "union": [4, 6], "eu": [4, 6], "dma": 4, "narrow": [4, 5, 6], "scope": [4, 5, 6], "elimin": [4, 5], "nfailur": 4, "appeal": 4, "subscrib": 4, "nsome": 4, "manner": [4, 6], "nurtur": 4, "nmuch": 4, "chief": 4, "silicon": 4, "vallei": 4, "constantli": 4, "driver": [4, 5], "recruit": 4, "subsidi": 4, "staf": 4, "contractor": 4, "placement": 4, "increment": 4, "weaken": 4, "telecommun": 4, "war": 4, "virus": 4, "ins": 4, "incid": [4, 6], "redund": 4, "ineffect": 4, "thing": [4, 7], "interf": 4, "imped": 4, "ship": 4, "nloss": 4, "unauthor": [4, 6], "confidenti": [4, 5], "encrypt": 4, "But": [4, 6, 7], "behalf": 4, "normal": [4, 6, 7], "investig": [4, 6], "penalti": [4, 5], "frequenc": [4, 5, 6], "actor": [4, 6], "circumv": [4, 6], "obfusc": 4, "forens": 4, "hinder": [4, 7], "recov": 4, "perpetr": 4, "profil": [4, 5], "authent": 4, "hack": [4, 6], "malfeas": 4, "faulti": 4, "password": 4, "irregular": 4, "fraudul": 4, "induc": 4, "disclos": [4, 7], "usernam": 4, "turn": [4, 6], "multifactor": 4, "unusu": 4, "freez": 4, "suspici": 4, "nwhile": 4, "ninvest": 4, "ongo": [4, 5], "contempl": 4, "endeavor": 4, "distract": 4, "tangibl": 4, "approv": 4, "oner": 4, "ventur": 4, "riski": 4, "leas": 4, "unfavor": 4, "arisen": 4, "ordinari": 4, "cours": [4, 5, 6], "resolv": [4, 5, 6], "sometim": [4, 7], "indemnif": 4, "indemnifi": 4, "alleg": 4, "magnitud": 4, "assert": 4, "royalti": 4, "vigor": 4, "defend": 4, "court": [4, 5], "internation": 4, "plaintiff": 4, "injunct": 4, "relief": 4, "nregardless": 4, "merit": 4, "recognit": [4, 5, 6], "settl": 4, "uncertain": 4, "disgorg": 4, "remedi": [4, 6], "worldwid": 4, "antitrust": 4, "bill": 4, "commerc": 4, "mobil": [4, 5, 7], "televis": 4, "film": 4, "anticorrupt": 4, "cash": 4, "repatri": 4, "launder": 4, "tax": 4, "wast": 4, "recycl": 4, "ncomplianc": 4, "impos": [4, 5, 6, 7], "agent": [4, 5, 6], "nregulatori": 4, "ban": [4, 6], "nexpect": 4, "increasingli": [4, 5, 6, 7], "greenhous": 4, "ga": 4, "emiss": 4, "civil": 4, "disagre": 4, "perceiv": 4, "feder": 4, "nfrom": 4, "noncompli": 4, "individu": [4, 5, 6], "lawsuit": [4, 5], "monopol": 4, "nfurther": 4, "earn": 4, "search": [4, 5, 6], "nthere": 4, "retent": 4, "transfer": 4, "pass": [4, 5, 6, 7], "pend": 4, "inquiri": [4, 6], "government": 4, "entiti": [4, 5, 6, 7], "biometr": 4, "notif": 4, "permit": [4, 5, 7], "healthcar": [4, 5], "liabl": 4, "investigatori": 4, "cardhold": 4, "compress": [4, 5], "acquir": 4, "extent": [4, 6], "unexpect": [4, 6, 7], "dollar": [4, 5], "denomin": 4, "offset": 4, "strengthen": [4, 6], "nconvers": 4, "thu": 4, "hedg": 4, "deterior": 4, "sovereign": 4, "heighten": [4, 6], "worsen": 4, "A": [4, 5, 6, 7], "collater": 4, "bank": 4, "unsecur": 4, "subassembli": 4, "assembl": 4, "legisl": 4, "ireland": [4, 6], "singapor": 4, "organis": 4, "statutori": 4, "valuat": 4, "defer": 4, "bodi": [4, 6], "adequaci": 4, "ow": 4, "ngener": 4, "volum": [4, 5, 6], "repurchas": 4, "dividend": 4, "consumm": 4, "declar": 4, "board": [4, 6], "unresolv": 4, "nnone": 4, "threat": [4, 6], "postur": 4, "25": [4, 5, 6], "2016": 4, "coordin": [4, 6], "track": [4, 6], "committe": [4, 6], "oversight": [4, 6], "counsel": 4, "chair": 4, "headquart": 4, "cupertino": [4, 7], "center": [4, 6, 7], "formal": [4, 6, 7], "conclud": [4, 5], "uninstal": 4, "web": [4, 5, 6], "browser": 4, "june": 4, "contractu": 4, "desist": 4, "stai": [4, 5], "grant": 4, "ndepart": 4, "justic": 4, "depart": [4, 6], "doj": 4, "district": 4, "attornei": 4, "jersei": 4, "redress": [4, 6], "anticompetit": 4, "nonmonetari": 4, "defens": [4, 6], "nepic": 4, "epic": 4, "northern": 4, "unfair": [4, 6], "enjoin": 4, "extern": [4, 6], "link": 4, "januari": 4, "motion": 4, "oppos": [4, 6], "30": [4, 5, 6], "vacat": 4, "fourth": 4, "mine": 4, "nnot": 4, "aapl": 4, "nholder": 4, "na": [4, 6], "301": 4, "npurchas": 4, "nshare": 4, "nperiod": 4, "ttotal": 4, "taverag": 4, "npaid": 4, "nannounc": 4, "napproxim": 4, "That": [4, 6, 7], "Be": [4, 5, 6], "nunder": 4, "njune": 4, "august": [4, 6], "nopen": 4, "negoti": [4, 6], "t35": 4, "697": 4, "t224": 4, "naugust": 4, "31": [4, 5], "t42": 4, "910": 4, "t221": 4, "39": [4, 5], "nseptemb": 4, "t33": 4, "653": 4, "t222": 4, "86": [4, 5], "ntotal": [4, 6], "t112": 4, "260": 4, "t89": 4, "074": 4, "110": 4, "billion": [4, 5], "10b5": 4, "graph": 4, "cumul": 4, "reinvest": 4, "dow": 4, "supersector": 4, "27": [4, 6], "2019": 4, "n2218": 4, "tseptemb": 4, "t100": 4, "t207": 4, "t273": 4, "t281": 4, "t322": 4, "t430": 4, "t113": 4, "t156": 4, "t131": 4, "t155": 4, "t210": 4, "ndow": 4, "t146": 4, "t216": 4, "t215": 4, "nfirst": 4, "nsecond": 4, "nthird": 4, "sequoia": 4, "nfourth": 4, "plu": [4, 5], "nfiscal": 4, "six": 4, "realign": 4, "span": [4, 5, 6], "indirectli": 4, "n2024": 4, "tchang": 4, "t2023": 4, "t2022": 4, "namerica": 4, "t167": 4, "045": 4, "t3": 4, "t162": 4, "560": 4, "t169": 4, "658": 4, "neurop": 4, "t101": 4, "328": 4, "t7": 4, "294": 4, "t95": 4, "118": 4, "ngreater": 4, "t66": 4, "952": 4, "t72": 4, "559": 4, "t74": 4, "njapan": 4, "t25": 4, "052": 4, "t24": 4, "257": 4, "977": 4, "nrest": 4, "t30": 4, "t4": 4, "t29": 4, "615": 4, "t1": 4, "t391": 4, "035": 4, "t2": 4, "t383": 4, "285": 4, "t394": 4, "weak": [4, 6], "renminbi": 4, "yen": [4, 7], "t201": 4, "183": 4, "t200": 4, "583": 4, "t205": 4, "489": 4, "984": 4, "357": 4, "t40": 4, "177": [4, 6], "t26": 4, "694": 4, "t28": 4, "300": 4, "292": 4, "t37": 4, "005": 4, "t39": 4, "845": [4, 6], "t41": 4, "241": 4, "n96": 4, "169": 4, "t13": 4, "t85": 4, "t9": 4, "t78": 4, "129": [4, 6], "amort": 4, "bundl": 4, "flat": 4, "ngross": 4, "t109": 4, "633": 4, "t108": 4, "803": 4, "t114": 4, "728": 4, "t71": 4, "t60": 4, "345": 4, "t56": 4, "054": 4, "t180": 4, "683": 4, "148": 4, "t170": 4, "782": 4, "t36": 4, "t73": 4, "t70": 4, "t46": 4, "t44": 4, "t43": 4, "noper": 4, "t31": 4, "370": 4, "t5": 4, "915": 4, "t14": 4, "251": 4, "npercentag": 4, "t8": 4, "nsell": 4, "administr": 4, "097": 4, "932": 4, "094": 4, "t6": 4, "t57": 4, "467": 4, "t54": 4, "847": 4, "t51": 4, "t15": 4, "headcount": 4, "nprovis": 4, "749": 4, "t16": 4, "741": 4, "t19": 4, "neffect": 4, "nstatutori": 4, "t21": 4, "aid": [4, 6], "nliquid": 4, "unrestrict": 4, "140": 4, "ndebt": 4, "97": [4, 6], "payabl": 4, "promissori": 4, "nleas": 4, "space": [4, 5, 6], "nmanufactur": 4, "noncancel": 4, "ndeem": 4, "tcja": 4, "nstate": 4, "fund": [4, 5], "escrow": 4, "ncapit": 4, "95": [4, 6], "nrecent": 4, "pronounc": 4, "nincom": 4, "fasb": 4, "asu": 4, "09": [4, 6], "740": 4, "reconcili": 4, "reconcil": [4, 7], "disaggreg": 4, "prospect": 4, "novemb": [4, 6], "07": [4, 6, 7], "280": 4, "maker": 4, "codm": 4, "alloc": [4, 5, 6], "retrospect": 4, "ncritic": 4, "conform": [4, 7], "gaap": 4, "nuncertain": 4, "domest": 4, "taxat": 4, "resolut": 4, "conting": 4, "26": [4, 5], "ninterest": 4, "forth": 4, "hypothet": 4, "nsensit": 4, "nhypothet": 4, "nrate": 4, "npotenti": 4, "n100": 4, "tenor": 4, "ndeclin": 4, "755": 4, "089": 4, "nterm": 4, "nincreas": 4, "t139": 4, "t194": 4, "nforeign": 4, "var": 4, "mont": 4, "carlo": 4, "interv": 4, "538": 4, "669": 4, "nindex": 4, "tpage": 4, "nconsolid": 4, "n29": 4, "n30": 4, "sheet": 4, "n31": 4, "n32": 4, "n33": 4, "nnote": 4, "n34": 4, "nreport": 4, "n48": 4, "nall": 4, "omit": [4, 7], "submiss": 4, "nyear": 4, "n2023": 4, "n2022": 4, "nnet": 4, "t294": 4, "866": 4, "t298": 4, "085": 4, "t316": 4, "199": 4, "t96": 4, "ncost": 4, "t185": 4, "233": 4, "t189": 4, "282": 4, "471": 4, "119": 4, "855": 4, "t22": 4, "075": 4, "352": 4, "t214": 4, "137": 4, "t223": 4, "546": 4, "t123": 4, "216": 4, "t119": 4, "437": 4, "t269": 4, "565": 4, "334": 4, "485": 4, "736": 4, "103": 4, "t93": 4, "995": 4, "t99": 4, "nearn": 4, "nbasic": 4, "ndilut": 4, "08": [4, 5, 7], "343": [4, 6], "783": 4, "744": 4, "215": 4, "963": 4, "095": 4, "812": 4, "547": 4, "325": 4, "819": 4, "nsee": 4, "translat": [4, 5, 6], "t395": 4, "765": 4, "511": 4, "unreal": 4, "832": 4, "t323": 4, "212": 4, "nadjust": 4, "337": 4, "717": 4, "394": 4, "138": 4, "850": 4, "563": 4, "104": 4, "t204": 4, "t253": 4, "816": 4, "899": 4, "272": 4, "t98": 4, "016": 4, "652": 4, "t88": 4, "531": 4, "nasset": 4, "ncurrent": 4, "ncash": 4, "943": 4, "965": 4, "228": 4, "590": 4, "naccount": 4, "410": 4, "508": 4, "nvendor": 4, "t32": 4, "833": 4, "477": 4, "ninventori": 4, "286": 4, "331": 4, "287": 4, "695": 4, "t152": 4, "987": 4, "t143": 4, "566": 4, "t91": 4, "479": 4, "544": 4, "t45": 4, "680": 4, "715": 4, "834": 4, "t64": 4, "758": 4, "t211": 4, "993": 4, "t209": 4, "017": 4, "t364": 4, "980": [4, 6], "t352": 4, "nliabil": 4, "t68": 4, "960": 4, "t62": 4, "611": 4, "304": 4, "t58": 4, "829": 4, "ndefer": 4, "249": 4, "061": 4, "ncommerci": 4, "967": 4, "985": 4, "t10": 4, "912": 4, "822": 4, "t176": 4, "392": 4, "t145": 4, "308": 4, "750": 4, "888": 4, "t49": 4, "848": 4, "638": 4, "t308": 4, "030": [4, 5], "t290": 4, "ncommit": 4, "nsharehold": 4, "400": 4, "116": 4, "786": 4, "550": 4, "n83": 4, "276": 4, "naccumul": 4, "deficit": 4, "154": 4, "214": 4, "172": 4, "452": 4, "950": 4, "146": [4, 6], "t50": 4, "672": 4, "t63": 4, "090": 4, "nbegin": 4, "849": 4, "365": 4, "423": 4, "346": 4, "175": 4, "withheld": 4, "settlement": 4, "521": 4, "971": 4, "t12": 4, "034": 4, "t11": 4, "nend": 4, "t83": 4, "nretain": 4, "068": 4, "562": 4, "ndividend": 4, "218": 4, "793": 4, "612": 4, "099": 4, "454": 4, "846": 4, "77": [4, 5], "046": 4, "186": 4, "109": 4, "t163": 4, "rsu": 4, "t0": 4, "98": [4, 5], "94": [4, 5, 6], "32": [4, 5], "737": 4, "929": 4, "ndepreci": 4, "445": 4, "519": 4, "688": 4, "038": 4, "266": 4, "227": 4, "006": 4, "788": 4, "356": 4, "271": 4, "520": 4, "618": 4, "484": 4, "731": 4, "684": 4, "499": 4, "020": 4, "889": 4, "448": 4, "552": 4, "031": 4, "t118": 4, "254": 4, "t110": 4, "543": 4, "t122": 4, "151": 4, "48": [4, 5], "656": 4, "513": 4, "76": [4, 6], "923": 4, "nproce": 4, "211": 4, "686": 4, "917": 4, "135": 4, "828": 4, "446": 4, "447": 4, "959": 4, "708": 4, "086": 4, "935": 4, "705": 4, "354": 4, "nfinanc": 4, "441": 4, "431": 4, "223": [4, 6], "234": [4, 6], "025": 4, "841": 4, "nrepurchas": 4, "949": 4, "89": [4, 6], "402": 4, "465": 4, "nrepay": 4, "958": 4, "repay": 4, "978": 4, "955": 4, "361": 4, "581": 4, "160": 4, "121": 4, "983": 4, "488": 4, "794": 4, "760": 4, "nsupplement": 4, "102": 4, "t18": 4, "679": 4, "573": 4, "33": [4, 5, 6], "nbasi": 4, "prior": [4, 6], "reclassifi": 4, "nrevenu": 4, "remit": [4, 6], "straight": 4, "vest": 4, "sold": 4, "nderiv": 4, "nonleas": 4, "34": [4, 6], "entitl": 4, "commenc": 4, "deliveri": 4, "stand": 4, "ssp": 4, "icloud": 4, "siri": 4, "discount": 4, "undeliv": 4, "unbil": 4, "n26": 4, "n37": 4, "proport": [4, 5], "moder": [4, 5], "64": [4, 5, 6], "dilut": 4, "nnumer": 4, "ndenomin": 4, "nweight": 4, "312": 4, "316": 4, "856": 4, "antidilut": 4, "tunreal": 4, "ngain": 4, "tfair": 4, "nvalu": 4, "tcash": 4, "nequival": 4, "tcurrent": 4, "tnon": 4, "t27": 4, "nlevel": 4, "nmonei": 4, "t778": 4, "nmutual": 4, "n515": 4, "t105": 4, "t617": 4, "nsubtot": 4, "293": 4, "395": 4, "nu": 4, "treasuri": 4, "516": 4, "t212": 4, "087": 4, "380": 4, "159": 4, "t703": 4, "t17": 4, "568": 4, "158": 4, "810": 4, "ncertif": 4, "deposit": 4, "t873": 4, "t387": 4, "t478": 4, "066": 4, "ncorpor": 4, "t65": 4, "622": 4, "t270": 4, "953": 4, "939": 4, "027": 4, "t47": 4, "886": 4, "nmunicip": 4, "t412": 4, "t405": 4, "t190": 4, "nmortgag": 4, "595": 4, "t175": 4, "403": 4, "t23": 4, "367": 4, "278": [4, 6], "t132": 4, "t583": 4, "635": 4, "t128": 4, "056": 4, "966": 4, "t34": 4, "t160": 4, "t688": 4, "650": 4, "36": [4, 5, 6], "359": [4, 6], "t481": 4, "n442": 4, "t428": 4, "t923": 4, "t909": 4, "406": 4, "114": 4, "468": 4, "136": 4, "t271": 4, "533": 4, "048": [4, 5], "491": 4, "332": 4, "t320": 4, "t608": 4, "t76": 4, "840": 4, "956": 4, "890": 4, "t20": 4, "627": 4, "243": 4, "t628": 4, "t602": 4, "t192": 4, "t410": 4, "735": 4, "636": 4, "t344": 4, "t144": 4, "470": 4, "657": 4, "831": 4, "125": 4, "162": 4, "t173": 4, "752": 4, "corrobor": 4, "mortgag": 4, "classifi": [4, 6], "37": [4, 6], "swap": 4, "remeasur": 4, "notion": 4, "069": 4, "730": 4, "575": 4, "493": 4, "t104": 4, "777": 4, "nhedg": 4, "433": 4, "505": 4, "247": [4, 6], "ntrade": 4, "41": [4, 5, 6], "44": [4, 6], "depreci": 4, "nland": 4, "690": 4, "nmachineri": 4, "t80": 4, "205": [4, 5], "314": 4, "nleasehold": 4, "839": 4, "599": 4, "73": [4, 5, 6], "70": [4, 5], "884": 4, "852": 4, "t55": 4, "906": 4, "601": 4, "703": 4, "010": 4, "457": 4, "634": 4, "391": 4, "neuropean": 4, "opinion": [4, 6], "1991": 4, "2007": 4, "irish": 4, "branch": 4, "2003": 4, "2014": 4, "2015": 4, "minist": 4, "juli": [4, 6], "annul": 4, "ecj": 4, "hear": 4, "asid": 4, "confirm": 4, "unrecogn": 4, "nfeder": 4, "571": 4, "080": 4, "644": 4, "265": 4, "801": 4, "726": 4, "570": 4, "298": 4, "49": [4, 6], "t84": 4, "428": 4, "603": 4, "483": [4, 6], "t347": 4, "t669": 4, "076": 4, "830": 4, "419": 4, "072": 4, "pretax": 4, "72": [4, 6], "71": 4, "ncomput": 4, "885": 4, "012": 4, "124": 4, "518": 4, "nimpact": 4, "246": 4, "311": 4, "366": 4, "397": 4, "nexcess": 4, "893": 4, "871": 4, "192": [4, 6], "739": 4, "ntax": 4, "carryforward": 4, "302": 4, "naccru": 4, "413": [4, 6], "421": 4, "nunreal": 4, "173": 4, "168": 4, "873": 4, "743": 4, "nless": 4, "374": 4, "007": 4, "369": 4, "551": 4, "998": 4, "nright": 4, "179": 4, "nminimum": 4, "674": 4, "940": 4, "t511": 4, "t455": 4, "t490": 4, "805": 4, "202": 4, "indefinit": 4, "temporari": 4, "727": 4, "044": 4, "284": 4, "ndecreas": 4, "386": 4, "463": 4, "982": 4, "542": 4, "936": 4, "070": 4, "expir": 4, "statut": 4, "229": 4, "494": 4, "closur": 4, "intercompani": 4, "exceed": [4, 6], "multiyear": 4, "exercis": 4, "noncash": 4, "rou": 4, "tfinanci": 4, "t2024": 4, "tother": 4, "661": 4, "tproperti": 4, "015": 4, "303": 4, "676": 4, "t165": 4, "t752": 4, "t859": 4, "430": 4, "842": [4, 6], "tfinanc": 4, "n2025": 4, "820": 4, "t171": 4, "991": 4, "n2026": 4, "914": 4, "n2027": 4, "t59": 4, "733": 4, "n2028": 4, "360": 4, "t38": 4, "398": 4, "n2029": 4, "187": 4, "nthereaft": 4, "t837": 4, "undiscount": 4, "790": 4, "imput": 4, "376": 4, "534": 4, "t896": 4, "borrow": 4, "proce": 4, "nine": [4, 6], "nmatur": 4, "333": 4, "264": 4, "948": 4, "645": 4, "309": 4, "arrear": 4, "namount": 4, "n2013": 4, "nfix": 4, "2062": 4, "t97": 4, "341": 4, "03": 4, "65": [4, 6], "t106": 4, "572": 4, "n97": 4, "nunamort": 4, "premium": 4, "321": 4, "358": 4, "113": 4, "662": 4, "930": 4, "342": 4, "800": 4, "180": 4, "88": 4, "ndure": 4, "425": 4, "426": 4, "372": 4, "589": 4, "055": 4, "appreci": 4, "four": [4, 5, 6], "holder": [4, 5], "n2014": 4, "bonu": 4, "nrestrict": 4, "nnumber": 4, "nrsu": 4, "ngrant": 4, "naggreg": 4, "nfair": 4, "nbalanc": 4, "t240": 4, "427": [4, 6], "t75": 4, "t150": 4, "861": 4, "501": 4, "768": 4, "87": [4, 5, 6], "101": [4, 6], "878": 4, "144": 4, "t127": 4, "t135": 4, "91": [4, 6], "456": 4, "78": [4, 5, 6], "59": [4, 6], "t140": 4, "326": 4, "t158": 4, "204": 4, "350": 4, "002": [4, 5], "nuncondit": 4, "uncondit": 4, "206": 4, "440": 4, "156": 4, "t633": 4, "t670": 4, "226": 4, "45": 4, "nconting": 4, "accrual": 4, "nconcentr": 4, "attribut": [4, 5, 6, 7], "46": 4, "t67": 4, "098": 4, "082": 4, "062": 4, "569": 4, "895": 4, "458": 4, "207": 4, "nonrecur": 4, "t142": 4, "196": 4, "t138": 4, "t147": 4, "859": 4, "nchina": 4, "n66": 4, "t181": 4, "887": 4, "t172": 4, "269": 4, "nlong": 4, "664": 4, "797": 4, "778": 4, "219": 4, "47": [4, 5, 6], "nopinion": 4, "nwe": 4, "fairli": 4, "pcaob": 4, "sponsor": 4, "treadwai": 4, "2013": 4, "unqualifi": 4, "thereon": 4, "nthese": 4, "misstat": 4, "fraud": [4, 6], "ndescript": 4, "naudit": 4, "nhow": 4, "nmatter": 4, "qualifi": 4, "letter": 4, "advisor": 4, "ernst": 4, "llp": 4, "auditor": 4, "2009": 4, "nsan": 4, "jose": 4, "nnovemb": 4, "coso": 4, "nour": 4, "ndefinit": 4, "mainten": [4, 5, 6], "disposit": 4, "receipt": 4, "nevalu": 4, "nbase": 4, "supervis": [4, 5, 6, 7], "13a": 4, "15d": 4, "ninher": 4, "met": [4, 6], "paragraph": 4, "51": [4, 6, 7], "ninsid": 4, "deirdr": 4, "brien": 4, "vice": 4, "presid": 4, "affirm": 4, "april": 4, "withhold": 4, "remitt": 4, "mr": 4, "copi": 4, "solicit": 4, "00042": 4, "nincorpor": 4, "texhibit": 4, "descript": [4, 5, 6, 7], "tform": 4, "tfile": 4, "nrestat": 4, "namend": 4, "bylaw": 4, "nindentur": 4, "york": [4, 5, 7], "mellon": 4, "truste": 4, "noffic": 4, "certif": 4, "2018": 4, "85": [4, 5, 6], "2043": 4, "05": 4, "2044": 4, "februari": 4, "55": [4, 5], "2045": 4, "900": 4, "700": [4, 5], "60": [4, 5, 6], "250": [4, 6], "2036": 4, "2046": 4, "450": 4, "2047": 4, "2049": 4, "2030": 4, "2050": 4, "2060": 4, "2028": 4, "2041": 4, "2051": 4, "2061": 4, "2032": 4, "2052": 4, "54": 4, "2033": 4, "2053": 4, "ceo": 4, "n12": 4, "nsubsidiari": 4, "n23": 4, "nconsent": 4, "n24": 4, "npower": 4, "signatur": 4, "nrule": 4, "nsection": 4, "1350": 4, "n101": 4, "ninlin": 4, "xbrl": 4, "n104": 4, "inlin": 4, "compensatori": 4, "herewith": 4, "furnish": 4, "herebi": 4, "undertak": 4, "56": [4, 5, 6], "nsignatur": 4, "npursuant": 4, "duli": 4, "undersign": 4, "thereunto": 4, "ndate": 4, "nby": 4, "luca": [4, 7], "maestri": 4, "nluca": 4, "nsenior": 4, "nchief": 4, "nknow": 4, "THESE": 4, "appoint": 4, "cook": 4, "jointli": 4, "her": 4, "substitut": 4, "him": 4, "thereto": 4, "therewith": 4, "ratifi": 4, "done": [4, 5, 6, 7], "virtu": 4, "hereof": 4, "nname": 4, "ttitl": 4, "tdate": 4, "tchief": 4, "tnovemb": 4, "ntimothi": 4, "tsenior": 4, "kondo": 4, "nchri": 4, "wanda": 4, "austin": 4, "nwanda": 4, "gorski": 4, "tdirector": 4, "nalex": 4, "jung": 4, "nandrea": 4, "arthur": 4, "levinson": 4, "narthur": 4, "monica": 4, "lozano": 4, "nmonica": 4, "ronald": 4, "sugar": 4, "nronald": 4, "susan": 4, "wagner": 4, "nsusan": 4, "57": [4, 5], "turbo": [4, 5, 7], "outlin": [4, 5, 6], "invdestacksmeticsisdict": 4, "setispect": 4, "20cyan": 4, "evaluationseld": 4, "anvis": 4, "droitent": 4, "discernminerv": 4, "versbobprefvers": 4, "vo\u8be5": 4, "option\u548c": 4, "meio": 4, "\u0432\u0440\u0435\u043ccisco": 4, "dellaischenpoihscap": 4, "geme": 4, "gettim": 4, "unscal": 4, "vocabulari": [4, 5, 7], "closer": 4, "sharpen": 4, "uniform": 4, "raschka": 4, "repetit": [4, 7], "radic": 4, "grappl": 4, "safer": [4, 6], "fascin": 4, "spontan": 4, "answer": [4, 5, 6, 7], "aren": [4, 5], "linear": 4, "absent": [4, 6], "coax": 4, "journei": 4, "suddenli": 4, "manifest": 4, "deliber": [4, 6], "contend": 4, "70b": [4, 5], "rethink": 4, "tutor": 4, "children": [4, 6], "verifi": [4, 5, 7], "predefin": [4, 7], "weren": 4, "kind": 4, "usual": 4, "quantif": 4, "contamin": [4, 6], "massiv": [4, 6], "unseen": [4, 6], "longitudin": 4, "mostli": [4, 7], "versu": [4, 5, 6], "latter": 4, "tailor": [4, 6], "great": [4, 5, 7], "cognit": 4, "misinform": [4, 6], "tempor": 4, "disclaim": 4, "referr": 4, "incorrect": [4, 6], "demograph": [4, 6], "stereotyp": [4, 6], "societ": [4, 6], "pii": [4, 6], "anonym": 4, "leakag": [4, 6], "carryov": 4, "multi": [4, 5, 6, 7], "fallaci": 4, "think": [4, 5, 6], "idiom": 4, "sarcasm": 4, "terminologi": 4, "lingual": 4, "misunderstand": 4, "syntax": 4, "scan": 4, "compat": [4, 5, 7], "overconfid": 4, "clariti": [4, 6, 7], "audienc": 4, "densiti": 4, "satisfact": [4, 7], "misus": [4, 6], "moral": 4, "co2": 4, "energi": 4, "consumpt": 4, "server": [4, 5, 7], "cach": [4, 5], "imag": [4, 5, 6], "audio": 4, "etc": [4, 7], "truth": [4, 5, 6, 7], "layer": [4, 5, 7], "palm": [4, 5], "easi": [4, 5, 6], "synthet": [4, 5, 6, 7], "augment": [4, 5], "post": [4, 5, 6], "timeout": 4, "variat": [4, 5], "inter": 4, "rater": 4, "ti": 4, "holist": [4, 6], "fast": [4, 5, 6, 7], "experiment": [4, 5, 7], "vi": 4, "categor": [4, 5, 6, 7], "intrins": [4, 5], "extrins": 4, "sequenc": [4, 5, 7], "perplex": [4, 5], "downstream": [4, 7], "synthesi": 4, "discret": 4, "prefix": [4, 6], "roug": 4, "bleu": 4, "bilingu": 4, "understudi": 4, "overlap": 4, "favor": [4, 5, 7], "breviti": 4, "insensit": 4, "semant": [4, 7], "orient": [4, 6], "gist": 4, "meteor": 4, "synonym": 4, "stem": [4, 7], "paraphras": 4, "alongsid": [4, 6], "computation": 4, "cider": 4, "consensu": 4, "tf": 4, "idf": 4, "caption": 4, "reliant": 4, "corpu": [4, 5], "ter": 4, "edit": [4, 6], "hypothesi": 4, "penal": 4, "bertscor": 4, "contextu": [4, 6], "bert": 4, "spice": 4, "proposit": [4, 5], "scene": [4, 6], "pure": [4, 5], "analyst": 4, "rouge_1": 4, "rouge_2": 4, "ideal": [4, 5, 6, 7], "cheaper": 4, "setup": [4, 5, 6, 7], "evaluate_summari": 4, "unigram": 4, "bigram": 4, "absl": 4, "py": 4, "rouge_scor": 4, "generated_summari": 4, "reference_summari": 4, "google_bleu": 4, "bleu_scor": 4, "rouge1": 4, "rouge2": 4, "arbitrari": 4, "chosen": [4, 6], "sentence1": 4, "cat": [4, 6], "sat": 4, "mat": 4, "sentence2": 4, "ate": 4, "3333333333333333": 4, "7272727272727272": 4, "4444444444444445": 4, "generate_summari": 4, "summir": 4, "liner": 4, "evaluate_summary_model": 4, "model_benchmark": 4, "models_test": 4, "benchmark_summari": 4, "model_summari": 4, "evaluation_result": 4, "statu": 4, "concis": [4, 5], "element": [4, 6, 7], "verbos": [4, 5, 6, 7], "peripher": 4, "quit": [4, 5, 7], "convei": 4, "breadth": 4, "Of": [4, 5, 6], "vibe": 4, "visualize_prompt_comparison": 4, "matplotlib": 4, "radar": 4, "radar_plot": 4, "tmp": 4, "ipykernel_1652501": 4, "940173201": 4, "userwarn": 4, "figurecanvasagg": 4, "largest": [4, 5], "sarmah": 4, "granular": [4, 5], "tune": [4, 6], "likert": 4, "ensembl": 4, "repeatedli": 4, "fluenci": 4, "refin": 4, "notabl": [4, 6, 7], "integ": [4, 7], "rubric": 4, "hollist": 4, "judgeevalu": 4, "grammar": [4, 5, 7], "evaluate_with_llm": 4, "criterion": 4, "judge_model": 4, "candidate_summari": 4, "grammat": 4, "y": [4, 6, 7], "z": 4, "w": [4, 5, 6], "benchmark_model": 4, "test_model": 4, "input_text": [4, 5], "trillion": [4, 5], "evals_list": 4, "1775618912": 4, "variant": [4, 5, 6], "slightli": 4, "drift": [4, 6], "lowest": [4, 5], "firstli": 4, "overhead": [4, 5], "egocentr": 4, "tight": 4, "medicin": [4, 6], "glider": 4, "deshpand": 4, "3b": 4, "685": 4, "aplic": 4, "clearli": [4, 6, 7], "earlier": [4, 6], "depict": [4, 6, 7], "multilingu": [4, 5, 6], "golden": 4, "languang": 4, "arena": 4, "randomli": 4, "customiz": [4, 5, 6], "irrelev": 4, "unhelp": [4, 6], "occasion": 4, "rare": 4, "perfectli": 4, "cater": [4, 5], "critiqu": [4, 6], "elo": 4, "spectrum": 4, "exam": 4, "probe": [4, 6], "certifi": 4, "began": [4, 5], "glue": 4, "entail": [4, 5], "baselin": [4, 5, 6], "superglu": 4, "successor": 4, "grew": 4, "big": 4, "bench": [4, 5], "srivastava": 4, "arithmet": 4, "truthfulqa": [4, 5], "multitask": 4, "hendryck": [4, 6], "multidisciplinari": 4, "stanford": 4, "helm": 4, "multidimension": 4, "surround": [4, 5, 6, 7], "humanev": [4, 5], "lmsy": 4, "brought": 4, "dialogu": [4, 5], "chiang": 4, "gather": 4, "alpacaev": 4, "duboi": 4, "mt": 4, "argilla": 4, "mila": 4, "mit": [4, 5], "contributor": [4, 5, 7], "western": 4, "centric": 4, "divid": [4, 6], "subset": [4, 6], "agnost": 4, "dialect": 4, "render": [4, 6], "crowdsourc": 4, "livebench": 4, "white": [4, 6], "resili": [4, 6], "meaningfulli": 4, "zebralog": 4, "grid": 4, "puzzl": 4, "brailsford": 4, "1999": 4, "lsat": 4, "hous": 4, "clue": 4, "deduct": 4, "arriv": 4, "programmat": [4, 7], "2x2": 4, "6x6": 4, "shot": [4, 6, 7], "reductio": 4, "ad": [4, 5, 6, 7], "absurdum": 4, "hard": 4, "10b": 4, "counterfactu": 4, "came": 4, "arc": 4, "prize": [4, 6], "chollet": 4, "mike": [4, 6], "knoop": 4, "founder": 4, "zapier": 4, "fran\u00e7oi": 4, "creator": [4, 5], "agi": 4, "kera": 4, "genuin": 4, "possess": 4, "elementari": 4, "novelti": 4, "wouldn": 4, "interpol": 4, "synthes": 4, "fly": 4, "retriev": [4, 5], "brute": 4, "pixel": 4, "unbeaten": 4, "win": [4, 5], "poorli": 4, "recombin": 4, "spur": [4, 6], "takeawai": 4, "vertic": [4, 6], "finbench": 4, "legalbench": 4, "guha": 4, "berkelei": 4, "bfcl": 4, "patil": 4, "fourrier": 4, "bespok": 4, "sdk": 4, "autoregress": 4, "sub": [4, 5], "liter": 4, "disturb": 4, "zero": [4, 5, 6, 7], "varianc": [4, 6], "yt": 4, "ut": 4, "suppos": [4, 7], "ol": 4, "heteroscedast": 4, "regress": 4, "bivari": 4, "evaluation_track": 4, "evaluationtrack": 4, "model_config": 4, "basemodelconfig": 4, "parallelismmanag": 4, "pipelineparamet": 4, "envconfig": 4, "is_accelerate_avail": 4, "datetim": 4, "timedelta": 4, "initprocessgroupkwarg": 4, "create_evaluation_pipelin": 4, "cache_dir": 4, "float16": 4, "max_sampl": 4, "kwargs_handl": 4, "3000": 4, "save_detail": 4, "pipeline_param": 4, "launcher_typ": 4, "env_config": 4, "override_batch_s": 4, "use_chat_templ": 4, "trust_remote_cod": 4, "pipeline_paramet": 4, "schemat": 4, "vllm": [4, 7], "tgi": 4, "storag": [4, 5, 6], "num_few_shot": 4, "bar": 4, "bigbench": 4, "winogrand": 4, "hellaswag": 4, "nlp": [4, 5, 6], "save_and_push_result": 4, "show_result": 4, "model_arg": 4, "send": [4, 5, 6, 7], "serverless": 4, "inference_server_address": 4, "inference_server_auth": 4, "model_id": 4, "null": 4, "bash": [4, 5], "command": [4, 5], "model_config_path": 4, "endpoint_model": 4, "llama3": 4, "qwen2": [4, 5, 7], "smollm2": [4, 5, 7], "alibaba": [4, 5, 7], "5b": [4, 5, 7], "hui": [4, 5], "allal": [4, 5], "cluster": 4, "noteworthi": [4, 5], "grain": [4, 5, 7], "salt": [4, 7], "exponenti": 4, "modular": 4, "offici": [4, 7], "revisit": 4, "trace": 4, "langchain_tracing_v2": 4, "langchain_api_kei": 4, "hf_evalu": 4, "langsmith_evalu": 4, "ls_client": 4, "dataset_nam": 4, "create_dataset": 4, "create_exampl": 4, "dataset_id": 4, "calculate_scor": 4, "reference_output": 4, "oai_client": 4, "xp_model_nam": 4, "lastli": 4, "run_evalu": 4, "And": [4, 5, 6], "upload_result": 4, "experiment_prefix": 4, "num_repetit": 4, "386a3620": 4, "9e1cc3cb": 4, "9d6a": 4, "4356": 4, "ab34": 4, "138e0abe8be4": 4, "8741976e": 4, "5268": 4, "4b75": 4, "949f": 4, "99477dde5d64": 4, "selectedsess": 4, "b831dc1e": 4, "90bc": 4, "4ed8": 4, "8080": [4, 5], "fb42444724d6": 4, "4it": 4, "latest": [4, 5, 6, 7], "tobia": 4, "evaluate_modul": 4, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 4, "tue": 4, "nov": [4, 5], "couldn": 4, "5it": 4, "5053784e": 4, "64445871": 4, "a53c": 4, "44b1": 4, "a422": 4, "4f49b2f9656f": 4, "69": [4, 6], "4b29f3c9": 4, "9ef7e39a": 4, "2add": 4, "410c": 4, "89f8": 4, "9f1a8b198cf1": 4, "61": [4, 6], "insert": 4, "combined_df": 4, "concat": [4, 6], "ignore_index": [4, 6], "execution_tim": 4, "example_id": 4, "333333": 4, "224388": 4, "feb10f92": 4, "3167": 4, "41f3": 4, "bb1c": 4, "d271153a31a8": 4, "5b196b22": 4, "9f4c": 4, "489c": 4, "b020": 4, "7823208b42d6": 4, "348101": 4, "722464": 4, "c310f159": 4, "064a": 4, "4035": 4, "97c3": 4, "a25bbf43abc2": 4, "386076": 4, "704104": 4, "f7f24899": 4, "dd50": 4, "409e": 4, "93cc": 4, "6fb1622b60bf": 4, "443038": 4, "725059": 4, "242856d6": 4, "efb5": 4, "4101": 4, "b1cf": 4, "5805532838ac": 4, "373418": 4, "795302": 4, "ce975169": 4, "a0ab": 4, "40ce": 4, "8e32": 4, "efa28d06079d": 4, "stat": [4, 5], "groupbi": [4, 6], "agg": [4, 6], "sort": 4, "sort_valu": 4, "subplot": 4, "pyplot": 4, "plt": 4, "numpi": 4, "np": 4, "ax1": 4, "ax2": 4, "figsiz": 4, "2ecc71": 4, "3498db": 4, "e74c3c": 4, "bleu_mean": 4, "bleu_std": 4, "enumer": [4, 6], "errorbar": 4, "yerr": 4, "fmt": 4, "markers": 4, "capsiz": 4, "set_ylabel": 4, "set_titl": 4, "set_xtick": 4, "set_xticklabel": 4, "rotat": 4, "set_ylim": 4, "bottom": 4, "legend": 4, "exec_mean": 4, "exec_std": 4, "tight_layout": 4, "ndetail": 4, "4038": 4, "0453": 4, "7815": 4, "0433": 4, "3768": 4, "0424": 4, "8343": 4, "2208": 4, "3519": 4, "0775": 4, "9122": 4, "1482": 4, "377": 4, "042": 4, "078": 4, "slower": [4, 6], "04": [4, 5], "latenc": [4, 5, 6], "speed": [4, 5, 6], "interestingli": 4, "decoupl": 4, "reload": 4, "facilit": [4, 6], "promptfooconfig": 4, "model_comparison": 4, "pretti": [4, 6], "dump": 4, "default_flow_styl": 4, "sort_kei": 4, "prompt1": 4, "defaulttest": 4, "1000m": 4, "millisecond": 4, "eval_data": 4, "latency_m": 4, "totallatencym": 4, "token_usag": 4, "tokenusag": 4, "assert_pass": 4, "assertpasscount": 4, "assert_fail": 4, "assertfailcount": 4, "prompt_token": [4, 5], "num_request": 4, "numrequest": 4, "2463": 4, "000035": 4, "3773": 4, "004620": 4, "1669": 4, "000091": 4, "1669m": 4, "highest": [4, 5, 7], "3773m": 4, "00462": 4, "promptfool": 4, "manual": [4, 5, 6, 7], "redefin": 4, "prompt_comparison": 4, "prompt2": 4, "prompt3": 4, "prompt_fil": 4, "prompt_cont": 4, "BE": 4, "again": 4, "prompt_id": 4, "promptid": 4, "gradingresult": 4, "df_raw": 4, "reset_index": [4, 6], "eas": [4, 5, 6, 7], "seamless": [4, 6], "hf": [4, 5], "plain": [4, 5], "vanilla": 4, "defi": 4, "accustom": 4, "legaci": 4, "unsustain": 4, "prd": 4, "cultiv": [4, 6], "organiz": 4, "stagnat": 4, "alb": [4, 5], "loubna": [4, 5], "anton": [4, 5], "lozhkov": [4, 5], "bakouch": [4, 5], "gabriel": [4, 5, 6], "mart\u00edn": [4, 5, 6], "bl\u00e1zquez": [4, 5], "lewi": [4, 5], "tunstal": [4, 5], "agust\u00edn": [4, 5], "piquer": [4, 5], "andr": [4, 5], "marafioti": [4, 5], "cyril": [4, 5], "zakka": [4, 5], "leandro": [4, 5], "werra": [4, 5], "wolf": [4, 5], "are24": 4, "judgearena": 4, "bps99": 4, "salli": 4, "pott": 4, "barbara": 4, "557": [4, 6], "sciencedirect": 4, "s0377221798003646": 4, "doi": [4, 6, 7], "1016": 4, "s0377": 4, "2217": 4, "00364": 4, "ctj": 4, "jerri": [4, 6], "tworek": [4, 6], "heewoo": [4, 6], "jun": [4, 6], "qime": [4, 6], "henriqu": [4, 6], "pond": [4, 6], "de": [4, 6], "oliveira": [4, 6], "pinto": [4, 6], "harri": [4, 6], "yuri": 4, "burda": 4, "greg": [4, 6], "brockman": [4, 6], "raul": [4, 6], "puri": [4, 6], "gretchen": [4, 6], "krueger": [4, 6], "petrov": [4, 6], "heidi": 4, "khlaaf": 4, "girish": [4, 6], "sastri": [4, 6], "brook": [4, 6], "chan": [4, 6], "grai": [4, 6], "ryder": [4, 6], "mikhail": [4, 6], "pavlov": [4, 6], "alethea": [4, 6], "lukasz": 4, "kaiser": [4, 6], "mohammad": [4, 6], "bavarian": [4, 6], "clemen": [4, 6], "winter": [4, 6], "philipp": 4, "tillet": [4, 6], "felip": [4, 6], "petroski": [4, 6], "dave": [4, 6], "cum": [4, 6], "plappert": 4, "fotio": 4, "chantzi": [4, 6], "barn": 4, "ariel": 4, "herbert": 4, "voss": [4, 6], "hebgen": 4, "guss": 4, "nichol": 4, "paino": [4, 6], "nikola": [4, 6], "tezak": [4, 6], "babuschkin": [4, 6], "suchir": [4, 6], "balaji": [4, 6], "shantanu": [4, 6], "jain": [4, 6], "hess": [4, 6], "carr": 4, "josh": [4, 6], "achiam": [4, 6], "vedant": 4, "misra": 4, "evan": [4, 5, 6], "morikawa": [4, 6], "matthew": 4, "knight": [4, 6], "mile": [4, 6], "brundag": [4, 6], "mira": [4, 6], "murati": [4, 6], "kati": [4, 6], "mayer": [4, 6], "bob": [4, 6, 7], "mcgrew": [4, 6], "ilya": [4, 6], "sutskev": [4, 6], "wojciech": [4, 6], "zaremba": [4, 6], "2107": 4, "03374": 4, "cz": 4, "lianmin": 4, "ying": 4, "sheng": 4, "anastasio": 4, "angelopoulo": 4, "tianl": 4, "dacheng": 4, "banghua": 4, "jordan": [4, 6], "gonzalez": 4, "ion": 4, "stoica": 4, "04132": 4, "cho24a": 4, "francoi": 4, "arcpriz": 4, "cho24b": 4, "drcw": 4, "darshan": 4, "selvan": 4, "sunitha": 4, "ravi": 4, "sky": 4, "ch": 4, "bartosz": 4, "mielczarek": 4, "anand": [4, 6], "kannappan": [4, 6], "qian": [4, 6], "14140": 4, "dglh24": 4, "yann": 4, "bal\u00e1z": 4, "galambosi": 4, "tatsunori": 4, "hashimoto": 4, "debia": 4, "04475": 4, "fac24a": 4, "wiki": [4, 7], "fac24b": 4, "fac24c": 4, "model_doc": 4, "fac24d": 4, "cookbook": 4, "llm_judg": 4, "fac24f": 4, "fhwt23": 4, "cl\u00e9mentin": 4, "nathan": 4, "habib": 4, "gnh": 4, "julian": 4, "nyarko": 4, "ho": 4, "r\u00e9": 4, "adam": [4, 6], "chilton": 4, "aditya": [4, 6], "narayana": 4, "chohla": 4, "brandon": [4, 6, 7], "waldon": 4, "rockmor": 4, "diego": 4, "zambrano": 4, "dmitri": 4, "talisman": 4, "enam": 4, "hoqu": 4, "faiz": 4, "surani": 4, "frank": [4, 6], "fagan": 4, "galit": 4, "sarfati": 4, "gregori": 4, "dickinson": 4, "haggai": 4, "porat": 4, "hegland": 4, "jessica": [4, 6], "joe": [4, 6], "nudel": 4, "joel": [4, 6], "niklau": 4, "nai": 4, "jonathan": [4, 6], "choi": 4, "margaret": [4, 5], "hagan": 4, "megan": 4, "ma": [4, 6], "livermor": 4, "nikon": 4, "rasumov": 4, "rahe": 4, "nil": 4, "holzenberg": 4, "noam": 4, "kolt": 4, "henderson": 4, "rehaag": 4, "sharad": 4, "shang": 4, "spencer": 4, "sunni": 4, "gandhi": 4, "zur": 4, "varun": 4, "iyer": 4, "zehua": 4, "2308": 4, "11462": 4, "hbb": 4, "collin": 4, "burn": 4, "steven": [4, 6], "basart": [4, 6], "zou": [4, 6], "manta": [4, 6], "mazeika": [4, 6], "03300": 4, "hbd": 4, "maxwel": 4, "forb": 4, "yejin": 4, "curiou": 4, "neural": [4, 7], "degener": 4, "1904": 4, "09751": 4, "hyc": [4, 5], "binyuan": [4, 5], "zeyu": [4, 5], "cui": [4, 5], "jiaxi": [4, 5], "dayiheng": [4, 5], "tianyu": [4, 5], "jiajun": [4, 5], "kai": [4, 5, 6], "dang": [4, 5], "coder": [4, 5], "preprint": [4, 5, 7], "2409": [4, 5, 6], "12186": [4, 5], "lx": 4, "zhen": 4, "xiaohan": 4, "jia": 4, "yuxuan": 4, "lai": 4, "chongyang": 4, "shuai": 4, "nlg": 4, "07103": 4, "lbl": 4, "bommasani": 4, "toni": 4, "dimitri": 4, "tsipra": 4, "dilara": 4, "soylu": 4, "michihiro": 4, "yasunaga": 4, "yian": 4, "deepak": 4, "narayanan": 4, "yuhuai": 4, "newman": 4, "binhang": 4, "bobbi": 4, "ce": 4, "christian": [4, 6], "cosgrov": 4, "acosta": 4, "nava": [4, 6], "drew": 4, "hudson": 4, "zelikman": 4, "esin": 4, "durmu": 4, "faisal": 4, "ladhak": 4, "frieda": 4, "rong": 4, "hongyu": 4, "ren": [4, 5], "huaxiu": 4, "yao": [4, 6, 7], "jue": 4, "keshav": 4, "santhanam": 4, "laurel": 4, "lucia": 4, "mert": 4, "yuksekgonul": 4, "mirac": 4, "suzgun": 4, "niladri": 4, "chatterji": 4, "omar": 4, "khattab": 4, "chi": [4, 7], "sang": 4, "shibani": [4, 6], "santurkar": [4, 6], "surya": 4, "icard": 4, "tianyi": 4, "vishrav": 4, "chaudhari": 4, "xuechen": 4, "yuhui": 4, "yuta": 4, "koreeda": 4, "2211": 4, "09110": 4, "lbc24": 4, "ronan": 4, "bra": 4, "allenai": 4, "lhe22": [4, 5, 6], "stephani": [4, 5, 6], "owain": [4, 5, 6], "mimic": [4, 5, 6], "falsehood": [4, 5, 6], "2109": [4, 5, 6], "07958": [4, 5, 6], "pzwg23": 4, "shishir": 4, "tianjun": 4, "xin": [4, 6], "gorilla": 4, "15334": 4, "pro24": 4, "dev": 4, "ras24": 4, "sebastian": 4, "scratch": 4, "1633437166": 4, "sll": 4, "bhaskarjit": 4, "mingshu": 4, "jingrao": 4, "lyu": 4, "nathalia": 4, "castellano": 4, "pasquali": 4, "dhagash": 4, "12148": 4, "srf": 4, "shivalika": 4, "angelika": 4, "roman": [4, 6], "adelani": 4, "ngui": 4, "vila": 4, "suero": 4, "peerat": 4, "limkonchotiwat": 4, "kelli": 4, "marchisio": 4, "qi": 4, "leong": 4, "yosephin": 4, "susanto": 4, "raymond": [4, 6], "ng": [4, 6], "shayn": 4, "longpr": 4, "ko": 4, "madelin": 4, "antoin": 4, "bosselut": 4, "oh": 4, "leshem": 4, "choshen": 4, "daphn": 4, "ippolito": 4, "enzo": [4, 7], "ferrant": 4, "marzieh": 4, "fadae": 4, "beyza": 4, "ermi": 4, "sara": 4, "hooker": 4, "linguist": [4, 6], "03304": 4, "srr": 4, "aarohi": 4, "abhinav": 4, "rastogi": 4, "abhishek": 4, "rao": 4, "abu": 4, "awal": 4, "shoeb": 4, "abubakar": 4, "abid": [4, 5], "fisch": 4, "santoro": 4, "gupta": 4, "adri\u00e0": 4, "garriga": 4, "alonso": 4, "agnieszka": 4, "kluska": 4, "aitor": 4, "lewkowycz": 4, "akshat": 4, "warstadt": 4, "alexand": [4, 6, 7], "kocurek": 4, "ali": [4, 6], "safaya": 4, "tazarv": 4, "aman": 4, "hussain": 4, "dsouza": 4, "ambros": 4, "slone": 4, "ameet": 4, "rahan": 4, "anantharaman": 4, "ander": 4, "andreassen": 4, "madotto": 4, "santilli": 4, "stuhlm\u00fcller": 4, "la": 4, "lampinen": 4, "angelica": 4, "anh": 4, "vuong": 4, "animesh": 4, "gottardi": 4, "antonio": 4, "norelli": 4, "anu": 4, "venkatesh": 4, "arash": 4, "gholamidavoodi": 4, "arfa": 4, "tabassum": 4, "arul": 4, "menez": 4, "arun": [4, 6], "kirubarajan": 4, "asher": 4, "mullokandov": 4, "ashish": 4, "sabharw": 4, "herrick": 4, "avia": 4, "efrat": 4, "aykut": 4, "erdem": 4, "ayla": 4, "karaka\u015f": 4, "bao": [4, 5, 6], "loe": 4, "barret": [4, 6], "zoph": [4, 6], "bart\u0142omiej": 4, "bojanowski": 4, "batuhan": 4, "\u00f6zyurt": 4, "behnam": 4, "hedayatnia": 4, "neyshabur": 4, "inden": 4, "benno": 4, "stein": 4, "berk": 4, "ekmekci": 4, "blake": 4, "howald": 4, "bryan": 4, "orinion": 4, "diao": 4, "dour": 4, "stinson": 4, "cedrick": 4, "argueta": 4, "c\u00e9sar": 4, "ferri": 4, "ram\u00edrez": 4, "chandan": 4, "charl": 4, "rathkopf": 4, "chenlin": 4, "meng": 4, "chitta": 4, "baral": 4, "chiyu": 4, "callison": 4, "burch": 4, "wait": [4, 6], "voigt": 4, "cindi": 4, "ramirez": 4, "clara": 4, "rivera": 4, "clemencia": 4, "siro": 4, "colin": [4, 5], "raffel": [4, 5], "courtnei": 4, "ashcraft": 4, "cristina": 4, "garbacea": 4, "damien": [4, 6], "sileo": 4, "garrett": 4, "kilman": 4, "freeman": 4, "khashabi": 4, "levi": [4, 6], "mosegu\u00ed": 4, "gonz\u00e1lez": 4, "perszyk": 4, "danqi": 4, "dar": 4, "gilboa": 4, "dohan": [4, 6], "drakard": 4, "jurgen": 4, "debajyoti": 4, "datta": 4, "deni": 4, "emelin": 4, "kleyko": 4, "deniz": 4, "yuret": 4, "derek": [4, 6], "tam": [4, 7], "dieuwk": 4, "hupk": 4, "diganta": 4, "dilyar": 4, "buzan": 4, "coelho": 4, "mollo": 4, "diyi": 4, "dylan": 4, "schrader": 4, "ekaterina": 4, "shutova": 4, "ekin": 4, "dogu": 4, "cubuk": 4, "elad": 4, "segal": 4, "eleanor": 4, "hagerman": 4, "donowai": 4, "elli": 4, "pavlick": 4, "rodola": 4, "emma": 4, "lam": 4, "chu": [4, 6], "erkut": 4, "erni": 4, "dyer": 4, "jerzak": 4, "eunic": 4, "engefu": 4, "manyasi": 4, "evgenii": 4, "zheltonozhskii": 4, "fanyu": 4, "xia": [4, 5], "fatemeh": 4, "siar": 4, "fernando": 4, "mart\u00ednez": 4, "plume": 4, "francesca": 4, "happ\u00e9": 4, "gaurav": 4, "genta": 4, "indra": 4, "winata": 4, "gerard": 4, "melo": 4, "germ\u00e1n": 4, "kruszewski": 4, "giambattista": [4, 6], "parascandolo": [4, 6], "giorgio": 4, "mariani": 4, "gloria": 4, "gonzalo": 4, "jaimovitch": 4, "l\u00f3pez": 4, "gregor": 4, "betz": 4, "gui": [4, 5], "gur": 4, "hana": 4, "galijasev": 4, "rashkin": 4, "hannaneh": 4, "hajishirzi": 4, "harsh": 4, "hayden": 4, "bogar": 4, "henri": [4, 6], "shevlin": 4, "hinrich": 4, "sch\u00fctze": 4, "hiromu": 4, "yakura": 4, "hongm": 4, "hugh": 4, "mee": 4, "wong": [4, 6], "isaac": 4, "nobl": 4, "jaap": 4, "jumelet": 4, "geissing": 4, "jaehoon": 4, "jaim": 4, "fern\u00e1ndez": 4, "fisac": 4, "simon": 4, "koppel": 4, "koco\u0144": 4, "jana": 4, "thompson": [4, 5, 6], "janel": 4, "wingfield": 4, "jarema": 4, "radom": 4, "jascha": 4, "sohl": [4, 6], "dickstein": 4, "phang": 4, "yosinski": 4, "jekaterina": 4, "novikova": 4, "jell": 4, "bosscher": 4, "jennif": 4, "marsh": 4, "jeroen": 4, "taal": 4, "engel": 4, "jesujoba": 4, "alabi": 4, "jiam": 4, "jillian": 4, "joan": 4, "waweru": 4, "burden": 4, "bali": 4, "batcheld": 4, "berant": 4, "j\u00f6rg": 4, "frohberg": 4, "jo": 4, "rozen": 4, "orallo": 4, "boudeman": 4, "guerr": 4, "tenenbaum": 4, "joyc": 4, "chua": 4, "kanclerz": 4, "karen": 4, "livescu": 4, "karl": 4, "krauth": 4, "karthik": 4, "gopalakrishnan": 4, "katerina": 4, "ignatyeva": 4, "katja": 4, "markert": 4, "kaustubh": 4, "dhole": 4, "gimpel": 4, "omondi": 4, "kori": 4, "mathewson": 4, "kristen": 4, "chiafullo": 4, "ksenia": 4, "shkaruta": 4, "shridhar": 4, "kyle": [4, 6], "mcdonel": 4, "richardson": 4, "laria": 4, "reynold": 4, "leo": [4, 6], "dugan": 4, "lianhui": 4, "lidia": 4, "contrera": 4, "ochando": 4, "morenc": 4, "moschella": 4, "luci": 4, "ludwig": 4, "schmidt": [4, 6], "luheng": 4, "olivero": 4, "col\u00f3n": 4, "metz": [4, 6], "l\u00fctfi": 4, "kerem": 4, "\u015fenel": 4, "maarten": [4, 6], "bosma": 4, "sap": [4, 6], "maartj": 4, "hoev": 4, "maheen": 4, "farooqi": 4, "manaal": 4, "faruqui": 4, "marco": 4, "baturan": 4, "marelli": 4, "maru": 4, "maria": 4, "quintana": 4, "tolkiehn": 4, "mario": [4, 6], "giulianelli": 4, "martha": 4, "potthast": 4, "leavitt": 4, "hagen": 4, "m\u00e1ty\u00e1": 4, "schubert": 4, "medina": [4, 6], "orduna": 4, "baitemirova": 4, "melodi": 4, "arnaud": 4, "melvin": 4, "mcelrath": 4, "yee": 4, "cohen": 4, "ivanitskii": 4, "starritt": 4, "strube": 4, "micha\u0142": 4, "sw\u0119drowski": 4, "michel": [4, 6], "bevilacqua": 4, "mihir": 4, "kale": 4, "cain": 4, "mime": 4, "mitch": 4, "walker": 4, "mo": 4, "tiwari": 4, "mohit": 4, "bansal": 4, "moin": 4, "aminnaseri": 4, "mor": 4, "geva": 4, "mozhdeh": 4, "gheini": 4, "mukund": 4, "varma": 4, "nanyun": 4, "peng": [4, 6], "nayeon": 4, "neta": 4, "krakov": 4, "doiron": 4, "nicol": 4, "martinez": 4, "nikita": 4, "nangia": 4, "nikla": 4, "decker": 4, "muennighoff": 4, "nitish": [4, 6], "shirish": [4, 6], "keskar": [4, 6], "niveditha": 4, "constant": 4, "fiedel": 4, "nuan": 4, "wen": 4, "oliv": [4, 6], "agha": 4, "elbaghdadi": 4, "omer": 4, "moreno": 4, "casar": 4, "parth": 4, "doshi": 4, "pascal": 4, "fung": 4, "pu": 4, "vicol": 4, "pegah": 4, "alipoormolabashi": 4, "peiyuan": 4, "eckerslei": 4, "phu": 4, "mon": 4, "htut": 4, "pinyu": 4, "hwang": 4, "piotr": 4, "mi\u0142kowski": 4, "piyush": 4, "pouya": 4, "pezeshkpour": 4, "priti": 4, "oli": 4, "qiaozhu": 4, "qing": 4, "qinlang": 4, "rabin": 4, "banjad": 4, "rachel": [4, 6], "etta": 4, "rudolph": 4, "raefer": 4, "rahel": 4, "haback": 4, "ramon": 4, "risco": 4, "rapha\u00ebl": 4, "milli\u00e8r": 4, "rhythm": 4, "garg": [4, 5], "rif": 4, "saurou": 4, "riku": 4, "arakawa": 4, "robb": 4, "raymaek": 4, "rohan": 4, "sikand": 4, "novak": 4, "sitelew": 4, "lebra": 4, "rosann": 4, "rowan": [4, 6], "ruslan": 4, "salakhutdinov": 4, "stoval": 4, "teehan": 4, "sahib": 4, "saif": 4, "sajant": 4, "dillav": 4, "shleifer": 4, "wiseman": 4, "gruetter": 4, "schoenholz": 4, "sanghyun": 4, "sanjeev": 4, "kwatra": 4, "sarik": 4, "ghazarian": 4, "sayan": 4, "casei": [4, 6], "bischoff": 4, "gehrmann": 4, "schuster": 4, "sepideh": 4, "sadeghi": 4, "shadi": 4, "hamdan": 4, "sharon": 4, "shashank": 4, "sherri": 4, "shi": 4, "shikhar": 4, "shima": 4, "asaadi": 4, "shubh": 4, "pachchigar": 4, "shubham": 4, "toshniw": 4, "shyam": [4, 6], "upadhyai": 4, "shyamolima": 4, "debnath": 4, "siamak": 4, "shakeri": 4, "thormey": 4, "melzi": 4, "siva": 4, "reddi": 4, "sneha": 4, "priscilla": 4, "makini": 4, "soo": 4, "hwan": 4, "toren": 4, "sriharsha": 4, "hatwar": 4, "stanisla": 4, "dehaen": 4, "stefan": 4, "divic": 4, "stella": 4, "biderman": 4, "stephen": 4, "prasad": 4, "piantadosi": 4, "stuart": [4, 6], "shieber": 4, "summer": [4, 6], "misherghi": 4, "svetlana": 4, "kiritchenko": 4, "swaroop": 4, "tal": 4, "linzen": 4, "tariq": 4, "tatsu": 4, "te": 4, "th\u00e9o": 4, "desbord": 4, "theodor": 4, "rothschild": 4, "phan": [4, 6], "tiberiu": 4, "nkinyili": 4, "timo": 4, "schick": 4, "timofei": 4, "kornev": 4, "titu": 4, "tunduni": 4, "gerstenberg": 4, "trenton": 4, "trishala": 4, "neeraj": 4, "tushar": 4, "khot": 4, "shultz": 4, "uri": 4, "shaham": 4, "vera": 4, "demberg": 4, "victoria": [4, 6], "nyamai": 4, "vika": 4, "raunak": 4, "vinai": 4, "ramasesh": 4, "udai": 4, "prabhu": 4, "vishakh": 4, "padmakumar": 4, "vivek": 4, "srikumar": 4, "fedu": [4, 6], "wout": 4, "vossen": 4, "xiaoyu": 4, "tong": [4, 6], "xinran": 4, "xinyi": 4, "yadollah": 4, "yaghoobzadeh": 4, "yair": 4, "lakretz": 4, "yangqiu": 4, "yasaman": 4, "bahri": 4, "yichi": 4, "yide": 4, "yifu": 4, "yonatan": 4, "belinkov": 4, "yufang": 4, "seid": 4, "zhuoy": 4, "zijian": 4, "ziji": 4, "zirui": 4, "ziyi": 4, "extrapol": 4, "2206": 4, "04615": 4, "wpn": 4, "yada": 4, "pruksachatkun": 4, "amanpreet": 4, "hill": 4, "stickier": 4, "wsm": 4, "1804": 4, "07461": 4, "wtb": 4, "tai": 4, "borgeaud": 4, "dani": 4, "yogatama": 4, "denni": [4, 6], "donald": 4, "metzler": 4, "ed": 4, "oriol": 4, "vinyal": 4, "dean": 4, "07682": 4, "wdr": 4, "doolei": 4, "manlei": 4, "arka": [4, 6], "pal": 4, "feuer": 4, "siddhartha": 4, "ravid": 4, "shwartz": [4, 6], "ziv": 4, "khalid": [4, 5], "saifullah": 4, "siddartha": 4, "naidu": 4, "chinmai": 4, "hegd": 4, "lecun": 4, "goldstein": 4, "willi": 4, "neiswang": 4, "micah": 4, "goldblum": 4, "19314": 4, "yyh": 4, "baosong": [4, 5], "chengpeng": 4, "chengyuan": [4, 5], "fei": [4, 5], "guant": 4, "haoran": [4, 5], "huan": [4, 5], "jialong": 4, "jialin": 4, "jianhong": [4, 5], "tu": [4, 5], "jianwei": [4, 5], "jianxin": [4, 5], "jin": [4, 6], "jingren": [4, 5], "jinz": 4, "jinzheng": 4, "junyang": [4, 5], "keme": [4, 5], "keqin": [4, 5], "kexin": [4, 5], "mingfeng": [4, 5], "xue": [4, 5, 6], "ni": 4, "pei": [4, 5], "ru": 4, "men": [4, 5], "ruiz": 4, "runji": [4, 5], "shiji": 4, "sinan": 4, "tianhang": 4, "wenbin": 4, "ge": 4, "xiaodong": 4, "deng": 4, "xiaohuan": 4, "xingzhang": [4, 5], "xinyu": [4, 6], "xipin": 4, "xuancheng": [4, 5], "yichang": [4, 5], "wan": [4, 5], "yunfei": 4, "yuqiong": [4, 5], "zhenru": [4, 5], "zhihao": 4, "10671": 4, "zcl24": 4, "zhihan": 4, "cao": 4, "lizi": 4, "openreview": 4, "forum": 4, "aegrf1uy0p": 4, "zc": 4, "siyuan": 4, "zhuang": [4, 6], "zhanghao": 4, "yonghao": 4, "zi": 4, "zhuohan": 4, "xing": [4, 6], "2306": 4, "05685": 4, "huggingface24": 4, "06": [4, 7], "metaai24": 4, "di": 5, "hunter": 5, "photo": 5, "email": 5, "hipaa": 5, "properti": [5, 6], "gdpr": 5, "iot": 5, "unreli": 5, "impract": 5, "slm": 5, "viabl": 5, "sensor": 5, "evalu": [5, 7], "interconnect": 5, "frontend": 5, "tradeoff": [5, 6, 7], "rapidli": [5, 6, 7], "garner": 5, "traction": 5, "yourself": 5, "aw": [5, 6], "bedrock": 5, "sambanova": 5, "sla": 5, "viabil": 5, "veloc": 5, "roadmap": 5, "commodit": 5, "decai": 5, "winner": 5, "loser": 5, "condens": 5, "clean": 5, "2024t": 5, "broadli": [5, 7], "versatil": 5, "72b": 5, "med": 5, "bloomberggpt": 5, "underw": 5, "adept": 5, "toxigen": 5, "alnajjar": 5, "13b": [5, 6], "01": 5, "outperform": 5, "32b": 5, "feasibl": 5, "2m": 5, "unstructur": [5, 7], "modal": 5, "diagnosi": 5, "patient": 5, "necessit": 5, "flagship": 5, "405b": 5, "gemini": 5, "pack": 5, "cautious": 5, "isol": [5, 6], "cpot": 5, "cpit": 5, "tco": 5, "tpot": 5, "ttft": 5, "mmlu": [5, 6], "gpqa": 5, "ratio": 5, "median": 5, "afford": 5, "lite": 5, "micro": 5, "budget": 5, "encod": [5, 6, 7], "cent": 5, "1m": 5, "flash": 5, "cheapest": 5, "phi": 5, "half": [5, 6], "permiss": [5, 6], "apach": 5, "exemplifi": [5, 6], "microsoft": 5, "simpler": [5, 6, 7], "fewer": [5, 6], "700m": 5, "100m": 5, "gemma": [5, 7], "deepseek": 5, "v2": [5, 6], "thorough": [5, 6], "grown": 5, "withdraw": 5, "incomplet": [5, 6], "preprocess": [5, 7], "unclear": 5, "15t": 5, "8t": 5, "fineweb": 5, "penedo": 5, "96": [5, 6], "crawl": 5, "snapshot": 5, "codebas": 5, "ablat": 5, "vital": [5, 6], "favorit": 5, "spawn": 5, "streamlin": [5, 7], "ultrachat": 5, "2024u": 5, "created_job": 5, "fine_tun": 5, "training_fil": 5, "file_id": 5, "ultrachat_chunk_train": 5, "validation_fil": 5, "ultrachat_chunk_ev": 5, "training_step": 5, "0001": 5, "auto_start": 5, "job_id": 5, "toolkit": [5, 6], "sft": 5, "dpo": 5, "nemo": [5, 6], "codestr": 5, "2024v": 5, "135m": 5, "enough": 5, "despit": [5, 7], "rewrit": 5, "multimod": [5, 6], "smolvlm": 5, "mlx": [5, 7], "mlc": 5, "peft": 5, "programm": 5, "graphic": [5, 6], "vram": 5, "vector": [5, 6], "mathbf": 5, "x_1": [5, 7], "x_2": [5, 7], "x_n": [5, 7], "x_": [5, 7], "\u03b8": 5, "matrix": [5, 6], "concurr": 5, "groq": 5, "cerebra": 5, "mozilla": 5, "docker": 5, "gerganov": 5, "georgi": 5, "hundr": 5, "overwhelm": [5, 7], "manifesto": 5, "enjoy": 5, "bog": 5, "exploratori": 5, "hacker": 5, "Will": [5, 6], "prototyp": 5, "prematur": 5, "besid": 5, "lighter": 5, "sacrific": 5, "gguf": 5, "unifi": [5, 7], "ggml": [5, 7], "ibm": [5, 6], "bit": 5, "metadata": 5, "disk": 5, "faster": 5, "backward": 5, "2024x": 5, "repo": 5, "easier": [5, 6, 7], "compil": 5, "linux": 5, "argument": [5, 6, 7], "sudo": 5, "apt": 5, "cmake": 5, "bind": 5, "betlen": 5, "cnv": 5, "llamacpp": 5, "q8_0": 5, "succinct": 5, "ctrl": 5, "interject": 5, "philosoph": 5, "debat": 5, "fulfil": 5, "happi": 5, "responsibli": 5, "bye": 5, "goodby": 5, "port": 5, "127": 5, "curl": [5, 7], "localhost": 5, "v1": [5, 6], "bearer": 5, "finish_reason": 5, "deepli": 5, "1734627879": 5, "completion_token": 5, "total_token": 5, "chatcmpl": 5, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 5, "prompt_n": 5, "prompt_m": 5, "132": 5, "prompt_per_token_m": 5, "prompt_per_second": 5, "77619878666999": 5, "predicted_n": 5, "predicted_m": 5, "1700": 5, "654": 5, "predicted_per_token_m": 5, "36882142857143": 5, "predicted_per_second": 5, "92850867960208": 5, "gbnf": [5, 7], "8pm": 5, "appointmenttim": 5, "appointmentdetail": 5, "handi": 5, "model_path": 5, "llama_cpp": 5, "create_chat_complet": 5, "occupi": 5, "activist": 5, "justin": [5, 6], "tunnei": 5, "ocho": 5, "appach": 5, "cosmopolitan": 5, "libc": 5, "portabl": 5, "durabl": 5, "usabl": [5, 6, 7], "tinyllama": 5, "wget": 5, "jartin": 5, "q5_k_m": 5, "renam": 5, "ex": 5, "chmod": 5, "nobrows": 5, "registri": 5, "nativ": [5, 7], "container": 5, "trai": 5, "familiar": 5, "bare": 5, "ssfl": 5, "sh": [5, 7], "Or": 5, "11434": 5, "chatrespons": 5, "easiest": 5, "rich": [5, 6], "playground": 5, "simultan": [5, 6], "verif": [5, 7], "importantli": [5, 7], "intuit": 5, "beginn": 5, "tensorrt": 5, "trt": 5, "latex": 5, "voic": 5, "pwa": 5, "rag": 5, "medium": [5, 6, 7], "gpt4all": 5, "rbac": 5, "fp16": 5, "q2_k": 5, "q4_k": 5, "q6_k": 5, "mib": 5, "wikitext": 5, "salesforc": 5, "wikipedia": [5, 7], "min_prompt_length": 5, "input_texts_raw": 5, "2010": 5, "valkyria": 5, "chronicl": 5, "forgiv": 5, "newcom": 5, "raita": 5, "honjou": 5, "compos": [5, 6], "hitoshi": 5, "sakimoto": 5, "takeshi": 5, "ozawa": 5, "writer": 5, "theme": [5, 6], "sung": 5, "escap": 5, "escaped_text": 5, "2024w": 5, "block_scal": 5, "block": [5, 6], "width": 5, "parenthes": 5, "block_min": 5, "formula": 5, "superblock": 5, "5625": 5, "ieee": 5, "754": 5, "ppl": 5, "exp": 5, "sum_": 5, "log_2": 5, "x_i": [5, 7], "avg": 5, "_i": 5, "corr": 5, "ln": [5, 7], "kullback": 5, "leibler": 5, "entropi": 5, "logit": 5, "d_": 5, "softmax": [5, 7], "sum": 5, "kld": 5, "q2_kresult": 5, "q6": 5, "004": 5, "q2": 5, "112": 5, "q4": 5, "smallest": 5, "390": 5, "67": [5, 6], "81": [5, 6], "93": [5, 6], "462": 5, "614": 5, "58": 5, "170": 5, "q4_k_m": 5, "thread": 5, "16x": 5, "speedup": 5, "85x": 5, "79x": 5, "ubuntu": 5, "lt": 5, "x86_64": 5, "gnu": 5, "thank": [5, 7], "intel": 5, "i7": 5, "8550u": 5, "15gib": 5, "samsung": 5, "ssd": 5, "970": 5, "evo": 5, "500gb": 5, "1170": 5, "meant": 5, "ahead": [5, 6], "ai4c": 5, "ai4a": 5, "paperswithcod": [5, 6], "ana24a": 5, "leaderboard": [5, 6], "artificialanalysi": 5, "ana24b": 5, "ana24c": 5, "bc24": 5, "andrei": [5, 6], "abetlen": 5, "fac4": 5, "optimum": 5, "concept_guid": 5, "fac4t": 5, "fac4u": 5, "200k": 5, "ultrachat_200k": 5, "fac4v": 5, "blogpost": 5, "gc24": 5, "ggerganov": [5, 7], "blob": [5, 7], "readm": [5, 7], "gc4a": 5, "gc4b": 5, "pka": 5, "guilherm": 5, "hynek": 5, "kydl\u00ed\u010dek": 5, "decant": 5, "finest": 5, "17557": 5, "qwe4b": 5, "qy": 5, "beichen": 5, "tingyu": 5, "zihan": 5, "qiu": 5, "15115": 5, "rev24": 5, "harvard": 5, "nyt": 5, "harvardlawreview": 5, "timess": 5, "zwa": 5, "wael": 5, "geoffrei": [5, 6], "angu": 5, "arnav": 5, "jefferi": 5, "kinnison": 5, "sherstinski": 5, "piero": 5, "molino": 5, "travi": 5, "addair": 5, "devvret": 5, "310": 5, "2405": 5, "00732": 5, "huggingface4w": 5, "huggingface4xa": 5, "huggingface4xb": 5, "ibmthink24": 5, "lmstudio24": 5, "lmstudio": 5, "metaai4c": 5, "mozillaocho24": 5, "salesforce24": 5, "immens": 6, "commonplac": 6, "hartvigsen": 6, "societi": 6, "statement": 6, "alarm": 6, "openli": 6, "dolli": 6, "llama2": [6, 7], "emb": 6, "generalist": 6, "injustic": 6, "inequ": 6, "undermin": 6, "perpetu": 6, "displac": 6, "eros": 6, "fake": 6, "deepfak": 6, "distrust": 6, "cyberattack": 6, "spread": 6, "disinform": 6, "inadvert": 6, "interven": 6, "irrevers": 6, "uncheck": 6, "extinct": 6, "race": 6, "incentiv": 6, "shortcut": 6, "behind": 6, "stress": 6, "urgent": 6, "reorient": 6, "birth": 6, "siam": 6, "edgington": 6, "jailbreak": 6, "promptcraft": 6, "stealth": 6, "sutton": 6, "subtl": 6, "trigger": 6, "subtleti": 6, "exception": 6, "phrase": 6, "evad": 6, "hqve": 6, "frer": 6, "hplidai": 6, "pl": 6, "hyperion": 6, "coast": 6, "redwood": 6, "tallest": 6, "tree": [6, 7], "routin": 6, "prejudic": 6, "gallego": 6, "leak": 6, "poison": 6, "intention": 6, "inject": 6, "mislead": 6, "exabeam": 6, "finra": 6, "3110": 6, "mandat": 6, "supervisori": 6, "unicef": 6, "empow": 6, "contest": 6, "congress": 6, "enact": 6, "pictur": [6, 7], "territori": 6, "oversea": 6, "chines": 6, "legitim": 6, "consent": 6, "complaint": 6, "cooper": 6, "extraterritori": 6, "offshor": 6, "draft": 6, "voluntari": 6, "neutral": 6, "player": 6, "prepared": 6, "compris": 6, "cbrn": 6, "persuas": 6, "autonomi": 6, "gradat": 6, "scorecard": 6, "elig": 6, "advisori": 6, "sag": 6, "shut": 6, "prerequisit": 6, "harden": 6, "asl": 6, "biosafeti": 6, "elev": 6, "warn": 6, "bioweapon": 6, "compartment": 6, "difficulti": 6, "4x": 6, "jump": 6, "paus": 6, "frontier": 6, "deepmind": 6, "biosecur": 6, "buffer": 6, "formul": [6, 7], "calibr": 6, "promin": 6, "taxonomi": 6, "llamaguard": 6, "20241022": 6, "3x": 6, "5x": 6, "alaga": 6, "substandard": 6, "oxford": 6, "wachter": 6, "blur": 6, "ill": 6, "stifl": 6, "suscept": 6, "aadc": 6, "outset": 6, "curricula": 6, "adversari": 6, "uncov": [6, 7], "appar": 6, "thoroughli": 6, "lm": [6, 7], "problemat": 6, "arrai": 6, "undergo": 6, "280b": 6, "cai": [6, 7], "utilis": 6, "minimis": 6, "enshrin": 6, "evas": 6, "resort": 6, "avenu": 6, "cambria": 6, "inherit": 6, "influenti": 6, "debias": 6, "occurr": 6, "phish": 6, "clarifi": 6, "toler": 6, "checklist": 6, "abus": 6, "ux": 6, "architect": 6, "diagram": 6, "retrofit": 6, "promptli": 6, "dashboard": 6, "misalign": 6, "star": 6, "postpon": 6, "sens": 6, "combat": 6, "counter": 6, "traffic": 6, "frustrat": 6, "workaround": 6, "silo": 6, "hierarch": 6, "hierarchi": 6, "66": 6, "depth": 6, "mcq": 6, "regex": [6, 7], "joint": 6, "facet": 6, "purpl": 6, "circl": 6, "opensafetylab": 6, "salad_bench_dataset": 6, "base_set": 6, "gptfuzzer": 6, "auto": [6, 7], "qid": 6, "o1": 6, "supremaci": 6, "o53": 6, "o14": 6, "o5": 6, "o65": 6, "plagiar": 6, "o16": 6, "o6": 6, "o47": 6, "campaign": 6, "o12": 6, "o52": 6, "surveil": 6, "spous": 6, "know": [6, 7], "o13": 6, "breakdown": 6, "ncount": 6, "21318": 6, "8756": 6, "6486": 6, "o2": 6, "1717": 6, "o4": 6, "1477": 6, "o3": 6, "socioeconom": 6, "851": 6, "int64": 6, "gen": 6, "15433": 6, "hh": 6, "4184": 6, "659": 6, "advbench": 6, "230": 6, "189": 6, "toxicchat": 6, "anyth": 6, "817": 6, "misconcept": 6, "ingrain": 6, "mc1": 6, "singular": 6, "choices4": 6, "mc2": 6, "set4": 6, "scorer": 6, "correctli": [6, 7], "truthful_qa": 6, "truthfulqa_dataset": 6, "multiple_choic": 6, "best_answ": 6, "correct_answ": 6, "incorrect_answ": 6, "watermelon": 6, "digest": 6, "noth": 6, "stomach": 6, "sick": 6, "wonderopoli": 6, "wonder": 6, "belli": 6, "swallow": 6, "dream": 6, "die": 6, "indigest": 6, "unconsci": 6, "excret": 6, "asr": 6, "r2d2": 6, "wider": [6, 7], "mass": 6, "destruct": 6, "asynchron": 6, "webpurifi": 6, "protectai": 6, "comprehend": 6, "amazon": 6, "nvidia": [6, 7], "keyword": 6, "toolset": 6, "nemmo": 6, "synchron": 6, "nemoguardrail": 6, "llmrail": 6, "railsconfig": 6, "from_path": 6, "rail": 6, "hello": 6, "ministr": 6, "mistralai": 6, "mistral_api_kei": 6, "moderate_chat": 6, "omni": 6, "pprint": 6, "to_json": 6, "threaten": 6, "illicit": 6, "granit": 6, "guardian": 6, "consortium": 6, "v3": 6, "11b": 6, "begin_of_text": 6, "start_header_id": 6, "end_header_id": 6, "unsafe_categori": 6, "user_message_1": 6, "model_answer_1": 6, "comma": 6, "eot_id": 6, "eom_id": 6, "denot": 6, "s1": 6, "s2": 6, "s3": 6, "s4": 6, "s5": 6, "defam": 6, "s6": 6, "s7": 6, "s8": 6, "s9": 6, "s10": 6, "s11": 6, "s12": 6, "s13": 6, "atla": 6, "2b": 6, "hap": 6, "38m": 6, "125m": 6, "padhi": 6, "shieldgemma": 6, "accomplish": [6, 7], "judge_prompt": 6, "american": 6, "vandal": 6, "underag": 6, "drink": 6, "vulgar": 6, "obscen": 6, "racism": 6, "derogatori": 6, "firearm": 6, "safety_scor": 6, "IN": 6, "borderlin": 6, "verdict": 6, "boolean": [6, 7], "brief": 6, "rational": 6, "delimit": 6, "paramount": [6, 7], "evenli": 6, "good_sampl": 6, "bad_sampl": 6, "2024z": 6, "surg": 6, "scam": 6, "get_profanity_sampl": 6, "show_stat": 6, "bool": 6, "current_dir": 6, "getcwd": 6, "data_path": 6, "profanity_en": 6, "random_st": 6, "ncategori": 6, "category_count": 6, "category_1": 6, "1f": 6, "profanity_sampl": 6, "nsampl": 6, "anatomi": 6, "slur": 6, "182": 6, "bodili": 6, "fluid": 6, "excrement": 6, "insult": 6, "mental": 6, "disabl": 6, "jap": 6, "babi": [6, 7], "batter": 6, "crazi": 6, "sob": 6, "fukka": 6, "sh1t3": 6, "get_salad_sampl": 6, "salad_sampl": 6, "tortur": 6, "porn": 6, "sin": 6, "sight": 6, "god": 6, "embezzl": 6, "xanax": 6, "alcohol": 6, "get_good_sampl": 6, "min_scor": 6, "reichstag": 6, "profanity_data": 6, "salad_data": 6, "good_data": 6, "all_data": 6, "prompt_sampl": 6, "is_unsaf": 6, "counti": 6, "holli": 6, "ridg": 6, "nc": 6, "town": 6, "onslow": 6, "carolina": 6, "diver": 6, "underwat": 6, "maze": 6, "coral": 6, "treasur": 6, "vivid": 6, "sensori": 6, "emot": 6, "labyrinthin": 6, "passag": 6, "reef": 6, "suspens": 6, "obstacl": 6, "creatur": 6, "nomin": 6, "nobel": 6, "literatur": 6, "love": 6, "ny": [6, 7], "logo": 6, "thief": 6, "rob": 6, "famou": 6, "nstatist": 6, "source_stat": 6, "type_stat": 6, "tiktoken": 6, "plug": 6, "safetyvalid": 6, "validationresult": 6, "dataclass": 6, "abstractmethod": 6, "llmguardvalid": 6, "scanner": 6, "bantop": 6, "llm_guard": 6, "input_scann": 6, "scan_prompt": 6, "matchtyp": 6, "default_banned_top": 6, "banned_top": 6, "super": 6, "banned_topics_scann": 6, "use_onnx": 6, "toxicity_scann": 6, "match_typ": 6, "fail_fast": 6, "unsafe_scann": 6, "gun": 6, "cool": 6, "hunt": 6, "deer": 6, "dad": 6, "mistralvalid": 6, "hate_and_discrimin": 6, "violence_and_threat": 6, "dangerous_and_criminal_cont": 6, "selfharm": 6, "openaivalid": 6, "attr": 6, "dir": 6, "getattr": 6, "illicit_viol": 6, "llmjudgevalid": 6, "prompt_path": 6, "llmjudg": 6, "filenotfounderror": 6, "elaps": 6, "score_valid": 6, "id_": 6, "validator_inst": 6, "start_tim": 6, "validation_result": 6, "elapsed_tim": 6, "prompt_sample_id": 6, "validator_nam": 6, "scoring_prompt": 6, "scoring_result": 6, "01536": 6, "34098": 6, "497136": 6, "546416": 6, "calculate_validator_metr": 6, "bad_sourc": 6, "good_sourc": 6, "tpr": 6, "fpr": 6, "f1_score": 6, "tn": 6, "fp": 6, "fn": 6, "tp": 6, "255": 6, "74": 6, "238": 6, "130": 6, "256": 6, "79": 6, "149": 6, "213": 6, "190": 6, "235": 6, "63": 6, "222": 6, "82": 6, "75": 6, "480": 6, "157": 6, "487": 6, "495": 6, "482": 6, "667": 6, "248": 6, "466": 6, "143": 6, "355": 6, "slowest": 6, "room": 6, "false_posit": 6, "left_on": 6, "right_on": 6, "persian": 6, "teenag": 6, "\u0686\u0627\u0644\u0634": 6, "\u0645\u0648\u0645\u0648": 6, "\u06cc\u06a9": 6, "\u062d\u0642\u0647": 6, "\u0648": 6, "\u0627\u0641\u0633\u0627\u0646\u0647": 6, "\u0645\u062d\u0644\u06cc": 6, "\u0627\u0633\u062a": 6, "\u06a9\u0647": 6, "\u0634\u0627\u06cc\u0639\u0647": 6, "\u0622\u0646": 6, "\u062f\u0631": 6, "\u0634\u0628\u06a9\u0647": 6, "\u0647\u0627\u06cc": 6, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 6, "\u067e\u062e\u0634": 6, "\u0634\u062f": 6, "\u06af\u0632\u0627\u0631\u0634": 6, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 6, "\u0628\u0627": 6, "\u0646\u0627\u0645": 6, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 6, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 6, "\u0631\u0627": 6, "\u0628\u0647": 6, "\u0627\u0639\u0645\u0627\u0644": 6, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 6, "\u0645\u0627\u0646\u0646\u062f": 6, "\u062e\u0648\u062f\u0632\u0646\u06cc": 6, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 6, "\u062a\u0631\u063a\u06cc\u0628": 6, "\u0645\u06cc": 6, "\u06a9\u0646\u062f": 6, "\u0648\u062c\u0648\u062f": 6, "\u0635\u062d\u0628\u062a": 6, "\u0632\u06cc\u0627\u062f\u06cc": 6, "\u0645\u0648\u0631\u062f": 6, "\u0627\u06cc\u0646": 6, "\u062a\u0639\u062f\u0627\u062f": 6, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 6, "\u0648\u0627\u0642\u0639\u06cc": 6, "\u0628\u0633\u06cc\u0627\u0631": 6, "\u06a9\u0645": 6, "\u0628\u0648\u062f": 6, "\u067e\u0644\u06cc\u0633": 6, "\u0635\u062f\u0645\u0647": 6, "\u062f\u06cc\u062f\u0646": 6, "\u062f\u0644\u06cc\u0644": 6, "\u062a\u0623\u062b\u06cc\u0631": 6, "\u0645\u0633\u062a\u0642\u06cc\u0645": 6, "\u067e\u062f\u06cc\u062f\u0647": 6, "\u062a\u0623\u06cc\u06cc\u062f": 6, "\u0646\u06a9\u0631\u062f\u0647": 6, "\u062a\u0631\u0633": 6, "\u0646\u06af\u0631\u0627\u0646\u06cc": 6, "\u0627\u06cc\u062c\u0627\u062f": 6, "\u0634\u062f\u0647": 6, "\u0628\u06cc\u0634\u062a\u0631": 6, "\u0627\u0632": 6, "\u062e\u0648\u062f": 6, "\u0631\u0633\u0627\u0646\u0647": 6, "\u0647\u0627": 6, "\u0637\u0648\u0631\u06cc": 6, "\u062e\u06cc\u0631\u06cc\u0647": 6, "\u0647\u0634\u062f\u0627\u0631": 6, "\u062f\u0627\u062f\u0646\u062f": 6, "\u0622\u0633\u06cc\u0628": 6, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 6, "\u0645\u062d\u062a\u0648\u0627\u06cc": 6, "\u062e\u0634\u0648\u0646\u062a": 6, "\u0622\u0645\u06cc\u0632": 6, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 6, "\u06af\u0641\u062a\u0647": 6, "\u0634\u0648\u062f": 6, "\u0627\u0648\u0644\u06cc\u0646": 6, "\u0628\u0627\u0631": 6, "\u0633\u0627\u0644": 6, "\u06f2\u06f0\u06f1\u06f8": 6, "\u067e\u0633": 6, "\u0622\u0646\u06a9\u0647": 6, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 6, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 6, "\u062e\u0628\u0631": 6, "\u062f\u062e\u062a\u0631": 6, "\u06f1\u06f2": 6, "\u0633\u0627\u0644\u0647": 6, "\u062f\u0627\u062f": 6, "\u0645\u0648\u0636\u0648\u0639": 6, "\u062c\u0647\u0627\u0646\u06cc": 6, "\u062a\u0628\u062f\u06cc\u0644": 6, "\u0645\u062c\u0633\u0645\u0647": 6, "\u0647\u0646\u0631\u0645\u0646\u062f": 6, "\u0698\u0627\u067e\u0646\u06cc": 6, "\u0647\u0631": 6, "\u0686\u0646\u062f": 6, "\u0634\u0627\u06cc\u062f": 6, "\u0646\u06af\u0627\u0647": 6, "\u0628\u0639\u0636\u06cc": 6, "\u0632\u06cc\u0628\u0627": 6, "\u0646\u0628\u0627\u0634\u062f": 6, "\u0627\u0645\u0627": 6, "\u06a9\u0627\u0645\u0644\u0627": 6, "\u0628\u06cc": 6, "\u062e\u0637\u0631": 6, "\u0627\u06cc\u0631\u0627\u0646": 6, "\u0645\u062f\u062a": 6, "\u0628\u06cc\u0646": 6, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 6, "\u0645\u0637\u0631\u062d": 6, "\u0633\u0627\u0644\u06cc": 6, "\u0633\u0631\u0627\u0633\u0631": 6, "\u062c\u0647\u0627\u0646": 6, "\u0645\u0634\u0627\u0628\u0647\u06cc": 6, "\u0628\u0631\u0627\u06cc": 6, "\u0648\u0627\u0644\u062f\u06cc\u0646": 6, "\u06a9\u0631\u062f\u0647": 6, "\u0627\u0641\u0631\u0627\u062f": 6, "\u0686\u0647": 6, "\u06a9\u0627\u0631\u06cc": 6, "\u062f\u0639\u0648\u062a": 6, "tourist": 6, "distress": 6, "polish": 6, "galician": 6, "dzisiaj": 6, "szwecji": 6, "innych": 6, "bogatych": 6, "krajach": 6, "ludzi": 6, "u\u017cywaj\u0105": 6, "mn\u00f3stwo": 6, "najr\u00f3\u017cniejszych": 6, "urz\u0105dze\u0144": 6, "hox": 6, "suecia": 6, "outro": 6, "pa\u00eds": 6, "rico": 6, "xent": 6, "usa": [6, 7], "moita": 6, "m\u00e1quina": 6, "diferent": 6, "\u0142\u00f3dka": 6, "zaczyna": 6, "ton\u0105\u0107": 6, "tury\u015bci": 6, "wracaj\u0105": 6, "statek": 6, "dom\u00f3w": 6, "gdzie": 6, "opowiadaj\u0105": 6, "tym": 6, "jak": 6, "zostali": 6, "zaatakowani": 6, "surprisingli": 6, "unsettl": 6, "paradox": 6, "harbor": 6, "wisdom": 6, "aspir": 6, "technologist": 6, "disciplinari": 6, "ethicist": 6, "policymak": 6, "ai24": 6, "asa24": 6, "jide": 6, "jona": 6, "schuett": 6, "marku": 6, "anderljung": 6, "08751": 6, "bhy": 6, "hinton": 6, "pieter": 6, "abbeel": 6, "trevor": 6, "darrel": 6, "yuval": 6, "harari": 6, "ya": 6, "lan": 6, "shai": 6, "shalev": 6, "gillian": 6, "hadfield": 6, "clune": 6, "tegan": 6, "maharaj": 6, "hutter": 6, "at\u0131l\u0131m": 6, "g\u00fcne\u015f": 6, "baydin": 6, "sheila": 6, "mcilraith": 6, "qiqi": 6, "ashwin": 6, "acharya": 6, "anca": 6, "dragan": 6, "philip": 6, "torr": 6, "russel": 6, "kahneman": 6, "s\u00f6ren": 6, "mindermann": 6, "amid": 6, "384": 6, "6698": 6, "1126": 6, "adn0117": 6, "bbc": 6, "emili": 6, "braca": 6, "israel": 6, "carter": 6, "hafsa": 6, "kanchwala": 6, "khojasteh": 6, "charli": 6, "landow": 6, "luo": 6, "magarelli": 6, "mirin": 6, "averi": 6, "moyer": 6, "kayla": 6, "simpson": 6, "amelia": 6, "skawinski": 6, "heverin": 6, "23308": 6, "bmc": 6, "dillon": 6, "brendan": 6, "murphi": 6, "khachaturov": 6, "gleav": 6, "kellin": 6, "pelrin": 6, "2408": [6, 7], "02946": 6, "cmm": 6, "erik": 6, "lorenzo": 6, "malandri": 6, "fabio": 6, "mercorio": 6, "navid": 6, "nobani": 6, "seveso": 6, "15248": 6, "edg24": 6, "exa24": 6, "cyber": 6, "grb": 6, "rossi": 6, "barrow": 6, "mehrab": 6, "tanjim": 6, "sungchul": 6, "franck": 6, "dernoncourt": 6, "ruiyi": 6, "nesreen": 6, "2309": 6, "00770": 6, "h44z": 6, "hgp": 6, "saadia": 6, "hamid": 6, "palangi": 6, "dipankar": 6, "ec": 6, "kamar": 6, "oxi": 6, "smaranda": 6, "muresan": 6, "preslav": 6, "nakov": 6, "alin": 6, "villavicencio": 6, "editor": 6, "60th": 6, "3309": 6, "3326": 6, "dublin": 6, "aclanthologi": 6, "acl": 6, "18653": 6, "hym": 6, "weijiang": 6, "weitao": 6, "weihong": 6, "zhangyin": 6, "haotian": 6, "qianglong": 6, "weihua": 6, "xiaocheng": 6, "bing": 6, "ting": 6, "dx": 6, "1145": [6, 7], "3703155": 6, "ldw": 6, "lijun": 6, "ruohui": 6, "xuhao": 6, "wangmeng": 6, "zuo": 6, "dahua": 6, "qiao": 6, "shao": 6, "05044": 6, "mpy": 6, "xuwang": 6, "zifan": 6, "norman": 6, "mu": 6, "elham": 6, "sakhae": 6, "nathaniel": 6, "forsyth": 6, "04249": 6, "mlc24": 6, "illumin": 6, "ailumin": 6, "oaa": 6, "adler": 6, "ahmad": 6, "ilg": 6, "akkaya": 6, "florencia": 6, "leoni": 6, "aleman": 6, "janko": 6, "altenschmidt": 6, "altman": 6, "shyamal": 6, "anadkat": 6, "avila": 6, "valeri": 6, "balcom": 6, "baltescu": 6, "haim": 6, "belgum": 6, "irwan": 6, "bello": 6, "jake": 6, "berdin": 6, "bernadett": 6, "shapiro": 6, "berner": 6, "lenni": 6, "bogdonoff": 6, "boiko": 6, "madelain": 6, "boyd": 6, "luisa": 6, "brakman": 6, "button": 6, "rosi": 6, "campbel": 6, "cann": 6, "brittani": 6, "carei": 6, "carlson": 6, "rori": 6, "carmichael": 6, "che": 6, "foti": 6, "sulli": 6, "rubi": 6, "chess": 6, "chester": 6, "cho": 6, "hyung": 6, "won": 6, "chung": 6, "jeremiah": 6, "currier": 6, "yunx": 6, "cori": 6, "decareaux": 6, "degri": 6, "deutsch": 6, "devil": 6, "dhar": 6, "steve": 6, "dowl": 6, "dun": 6, "adrien": 6, "ecoffet": 6, "atti": 6, "eleti": 6, "tyna": 6, "elound": 6, "farhi": 6, "niko": 6, "sim\u00f3n": 6, "posada": 6, "fishman": 6, "juston": 6, "isabella": 6, "fulford": 6, "georg": 6, "gibson": 6, "vik": 6, "tarun": 6, "gogineni": 6, "goh": 6, "rapha": 6, "gontijo": 6, "lope": 6, "gordon": 6, "morgan": 6, "grafstein": 6, "yufei": 6, "guo": 6, "hallaci": 6, "heaton": 6, "johann": 6, "heideck": 6, "hickei": 6, "wade": 6, "hoeschel": 6, "houghton": 6, "kenni": 6, "hsu": 6, "shengli": 6, "joost": 6, "huizinga": 6, "shawn": 6, "joann": 6, "jang": 6, "roger": 6, "haozhun": 6, "shino": 6, "jomoto": 6, "billi": 6, "jonn": 6, "tomer": 6, "kaftan": 6, "\u0142ukasz": 6, "kamali": 6, "ingmar": 6, "kanitscheid": 6, "tabarak": 6, "khan": 6, "logan": 6, "kilpatrick": 6, "jong": 6, "wook": 6, "christina": 6, "yongjik": 6, "hendrik": 6, "kirchner": 6, "kiro": 6, "matt": 6, "kokotajlo": 6, "kondraciuk": 6, "kondrich": 6, "konstantinidi": 6, "kosic": 6, "vishal": 6, "kuo": 6, "lamp": 6, "ikai": 6, "teddi": 6, "jade": 6, "leung": 6, "chak": 6, "ming": 6, "lim": 6, "molli": 6, "mateusz": 6, "litwin": 6, "theresa": 6, "lopez": 6, "patricia": 6, "lue": 6, "makanju": 6, "malfacini": 6, "markov": 6, "yaniv": 6, "markovski": 6, "bianca": 6, "mayn": 6, "mckinnei": 6, "christin": 6, "mcleavei": 6, "mcmillan": 6, "mcneil": 6, "aalok": 6, "menick": 6, "mishchenko": 6, "vinni": 6, "monaco": 6, "murk": 6, "m\u00e9ly": 6, "ashvin": 6, "nair": 6, "reiichiro": 6, "nakano": 6, "rajeev": 6, "nayak": 6, "arvind": 6, "neelakantan": 6, "hyeonwoo": 6, "noh": 6, "keef": 6, "jakub": 6, "pachocki": 6, "palermo": 6, "ashlei": 6, "pantuliano": 6, "parish": 6, "emi": 6, "parparita": 6, "passo": 6, "perelman": 6, "belbut": 6, "pere": 6, "pokorni": 6, "pokrass": 6, "vitchyr": 6, "pong": 6, "tolli": 6, "powel": 6, "bori": 6, "proehl": 6, "rae": 6, "ramesh": 6, "franci": 6, "kendra": 6, "rimbach": 6, "carl": 6, "rotst": 6, "roussez": 6, "saltarelli": 6, "ted": 6, "sander": 6, "schnurr": 6, "selsam": 6, "kyla": 6, "sheppard": 6, "toki": 6, "sherbakov": 6, "shieh": 6, "shoker": 6, "pranav": 6, "szymon": 6, "sidor": 6, "sigler": 6, "sitkin": 6, "sokolowski": 6, "natali": 6, "staudach": 6, "madelein": 6, "tootoonchian": 6, "tseng": 6, "preston": 6, "tuggl": 6, "turlei": 6, "juan": 6, "cer\u00f3n": 6, "urib": 6, "vallon": 6, "vijayvergiya": 6, "jai": 6, "alvin": 6, "ward": 6, "cj": 6, "weinmann": 6, "akila": 6, "welihinda": 6, "jiayi": 6, "weng": 6, "lilian": 6, "wiethoff": 6, "willner": 6, "wolrich": 6, "lauren": 6, "workman": 6, "sherwin": 6, "yoo": 6, "zeller": 6, "shengjia": 6, "juntang": 6, "zhuk": 6, "2303": 6, "08774": 6, "pnc": 6, "inkit": 6, "manish": 6, "nagireddi": 6, "giandomenico": 6, "cornacchia": 6, "subhajit": 6, "chaudhuri": 6, "tejaswini": 6, "pedapati": 6, "pierr": 6, "dognin": 6, "keerthiram": 6, "murugesan": 6, "miehl": 6, "santill\u00e1n": 6, "kieran": 6, "giulio": 6, "zizzo": 6, "muhammad": 6, "zaid": 6, "hame": 6, "purcel": 6, "desmond": 6, "pan": 6, "ing": 6, "vejsbjerg": 6, "dali": 6, "hind": 6, "werner": 6, "geyer": 6, "ambrish": 6, "rawat": 6, "kush": 6, "varshnei": 6, "prasanna": 6, "sattigeri": 6, "07724": 6, "saffron": 6, "ring": 6, "aslanid": 6, "glaes": 6, "nat": 6, "mcalees": 6, "irv": 6, "2202": 6, "03286": 6, "szw": 6, "qinghua": 6, "higham": 6, "gorban": 6, "bastouni": 6, "ivan": 6, "tyukin": 6, "12670": 6, "vsk": 6, "simplesafetytest": 6, "2311": 6, "08370": 6, "wmr24": 6, "sandra": 6, "brent": 6, "mittelstadt": 6, "duti": 6, "royal": 6, "240197": 6, "royalsocietypublish": 6, "1098": 6, "rso": 6, "ylx24": 6, "jiahao": 6, "xingwei": 6, "zyi": 6, "shune": 6, "lyumanshan": 6, "jingyu": 6, "shui": 6, "haobin": 6, "pengfei": 6, "hewu": 6, "ghost": 6, "14931": 6, "zho24": 6, "amazonwservices24": 6, "anthropic24": 6, "cdn": 6, "1adf000c8f675958c2ee23805d91aaade1cd4613": 6, "centerfasafety24a": 6, "centerforaisafeti": 6, "centerfasafety24b": 6, "deepmind24": 6, "googleapi": 6, "fsf": 6, "europeanmagency24": 6, "ema": 6, "europa": 6, "activities_en": 6, "financialirauthority24": 6, "ibm24": 6, "watsonx": 6, "saa": 6, "libraryocongress23": 6, "loc": 6, "gov": 6, "mistralai24": 6, "mlsteam24": 6, "mlsafeti": 6, "nationaliosatechnology24": 6, "nist": 6, "itl": 6, "nvidia24": 6, "openai24a": 6, "openai24b": 6, "opensafetylab24a": 6, "opensafetylab24b": 6, "protectai24": 6, "surgeai24": 6, "ukgovernment24": 6, "unicef24": 6, "innocenti": 6, "julia": 7, "easili": 7, "trial": 7, "wrangl": 7, "hoc": 7, "dataset": 7, "unwant": 7, "overflow": 7, "twitter": 7, "youtub": 7, "ldot": 7, "prod_": 7, "syntact": 7, "central": 7, "delic": 7, "heart": 7, "xml": 7, "invalid": 7, "ttt": 7, "itt": 7, "nousresearch": 7, "herm": 7, "json_format": 7, "person1": 7, "q1": 7, "person2": 7, "response_cont": 7, "is_json": 7, "myjson": 7, "nest": 7, "conceptu": 7, "overview": 7, "unend": 7, "whitespac": 7, "throw": 7, "somewher": 7, "json_object": 7, "impress": 7, "circul": 7, "vertex": 7, "worri": 7, "enum": 7, "secextract": 7, "mentioned_ent": 7, "mentioned_plac": 7, "extract_from_sec_fil": 7, "sec_filing_text": 7, "parser": 7, "hint": 7, "prompt_extract": 7, "sec_extract": 7, "washington": 7, "beg": 7, "1652": 7, "171": 7, "unnorm": 7, "0325": 7, "strongest": 7, "bfloat16": 7, "device_map": 7, "return_tensor": 7, "pt": 7, "inference_mod": 7, "last_token_logit": 7, "next_token_prob": 7, "nn": 7, "dim": 7, "top_k_prob": 7, "top_k_indic": 7, "topk": 7, "top_k_token": 7, "decod": 7, "idx": 7, "skip_special_token": 7, "prob": 7, "4f": 7, "0305": 7, "0197": 7, "0106": 7, "0093": 7, "logitsprocessor": 7, "logits_processor": 7, "logitsprocessorlist": 7, "customlogitsprocessor": 7, "intermediari": 7, "input_id": 7, "tensor": 7, "__call__": 7, "longtensor": 7, "batch_siz": 7, "sequence_length": 7, "floattensor": 7, "vocab_s": 7, "mask": 7, "pick": 7, "greedi": 7, "yesnologitsprocessor": 7, "initial_length": 7, "everyth": 7, "fill_": 7, "inf": 7, "debug": 7, "yes_token": 7, "add_special_token": 7, "no_token": 7, "yes_no_logit": 7, "yes_no_prob": 7, "yes_prob": 7, "no_prob": 7, "yes_mask": 7, "1e4": 7, "NO": 7, "generation_output_control": 7, "uncontrol": 7, "generation_output": 7, "renorm": 7, "4263": 7, "5737": 7, "10407": 7, "4607": 7, "6250": 7, "9219": 7, "helper": 7, "model_output": 7, "gen_output": 7, "batch_decod": 7, "clean_up_tokenization_spac": 7, "classic": 7, "italian": 7, "willard": 7, "louf": 7, "reformul": 7, "finit": 7, "fsm": 7, "s_": 7, "s_t": 7, "s_1": 7, "tild": 7, "odot": 7, "rightarrow": 7, "wise": 7, "thien": 7, "automaton": 7, "dfa": 7, "outgo": 7, "yy": 7, "ever": 7, "aa": 7, "lwai": 7, "prop": 7, "yynnaa": 7, "malform": 7, "base_prompt": 7, "sec_extraction_outlin": 7, "zsp": 7, "zicorp": 7, "with_structured_output": 7, "runnabl": 7, "typeddict": 7, "qu": 7, "langchain_openai": 7, "chatopenai": 7, "langchain_cor": 7, "chatprompttempl": 7, "extract_from_sec_filing_langchain": 7, "structured_llm": 7, "prompt_templ": 7, "from_messag": 7, "llm_chain": 7, "invok": 7, "sec_extraction_langchain": 7, "cpp": 7, "bnf": 7, "backu": 7, "naur": 7, "fssl": 7, "extract_entities_from_sec_fil": 7, "ollama_structured_output_prompt_suffix": 7, "ollama_structured_output_temperatur": 7, "uncensor": 7, "model_json_schema": 7, "response_json": 7, "sharpli": 7, "wrapper": 7, "exllama2": 7, "zoo": 7, "furthermor": 7, "nonetheless": 7, "studi": 7, "extran": 7, "dispar": 7, "preval": 7, "speak": 7, "aider": 7, "outweigh": 7, "rebutt": 7, "reproduct": 7, "paint": 7, "dottxt": 7, "flaw": 7, "uneven": 7, "didn": 7, "conflat": 7, "drawback": 7, "unlock": 7, "pfiffer": 7, "wrestl": 7, "aid24": 7, "dot24": 7, "demo": 7, "gge24": 7, "lan4b": 7, "lww": 7, "xun": 7, "hanyu": 7, "yezhaohui": 7, "shichao": 7, "simin": 7, "shunyu": 7, "feiyu": 7, "xiong": 7, "12599": 7, "llf": 7, "xieyang": 7, "frederick": 7, "fiannaca": 7, "terri": 7, "koo": 7, "dixon": 7, "ea": 7, "machineri": 7, "3613905": 7, "3650756": 7, "xuan": 7, "hai": 7, "nguyen": 7, "ngoc": 7, "tiviati": 7, "hieu": 7, "dao": 7, "shafiq": 7, "joti": 7, "kenji": 7, "kawaguchi": 7, "nanci": 7, "min": 7, "kan": 7, "08656": 7, "out24": 7, "twt": 7, "zhi": 7, "cheng": 7, "kuang": 7, "tsai": 7, "chieh": 7, "hung": 7, "yun": 7, "nung": 7, "02442": 7, "tt24": 7, "vivien": 7, "vivien000": 7, "wl23": 7, "r\u00e9mi": 7, "09702": 7, "guidanceai24": 7, "nvidia4a": 7, "wikipediacontributors24": 7, "wiktionari": 7, "naur_form": 7}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3], "practic": [0, 2, 5, 7], "approach": [0, 6], "an": 0, "open": [0, 2, 5], "sourc": [0, 2, 5], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": 0, "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 6], "kei": [0, 4], "configur": 0, "troubleshoot": 0, "common": [0, 6], "issu": 0, "author": 0, "": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 6], "guid": 2, "pitfal": [2, 6], "softwar": [2, 4], "chapter": 2, "1": [2, 6], "The": [2, 4, 5], "eval": [2, 4, 6], "gap": [2, 4], "2": [2, 5, 6], "manag": 2, "input": 2, "data": [2, 3], "3": [2, 6], "structur": [2, 7], "output": [2, 7], "4": [2, 6], "safeti": [2, 6], "5": [2, 6], "prefer": [2, 3], "base": [2, 3, 4, 6], "align": [2, 3], "6": [2, 6], "local": [2, 5], "7": 2, "cost": [2, 5], "factor": [2, 6], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 4, 5, 6, 7], "resourc": 2, "introduct": [3, 4, 5, 6, 7], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 4, 5], "human": 3, "supervis": 3, "fine": [3, 5, 7], "tune": [3, 5, 7], "sft": 3, "augment": 3, "post": [3, 7], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 5, 6], "studi": [3, 5, 6], "polici": [3, 6], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 4, 5, 6], "synthet": 3, "gener": [3, 4, 6], "user": [3, 6], "prompt": [3, 5, 7], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": 3, "prepar": 3, "vibe": 3, "check": 3, "evalu": [3, 4, 6], "discuss": [3, 7], "conclus": [3, 4, 5, 6, 7], "citat": [3, 4, 5, 6, 7], "refer": [3, 4, 5, 6, 7], "non": 4, "determinist": 4, "machin": 4, "emerg": 4, "properti": 4, "problem": [4, 7], "statement": [4, 7], "tradit": 4, "v": [4, 5], "design": [4, 6], "applic": 4, "test": 4, "requir": 4, "matrix": 4, "conceptu": 4, "overview": 4, "consider": [4, 5], "metric": 4, "task": [4, 5], "benchmark": [4, 5, 6], "leaderboard": 4, "lightev": 4, "mmlu": 4, "econometr": 4, "sampl": [4, 6], "famili": [4, 5], "us": 4, "langsmith": 4, "promptfoo": 4, "comparison": [4, 5, 7], "suitabl": 5, "result": 5, "llama": 5, "perform": 5, "licens": 5, "commun": 5, "support": 5, "custom": [5, 6], "mistral": [5, 6], "decemb": 5, "22": 5, "2024": 5, "deploy": 5, "serv": 5, "cpp": 5, "llamafil": 5, "ollama": [5, 7], "lama": 5, "ui": 5, "lm": 5, "studio": 5, "jan": 5, "webui": 5, "openwebui": 5, "effect": 5, "quantiz": 5, "level": 5, "hardwar": 5, "takeawai": [5, 6], "risk": 6, "ai": 6, "amplifi": 6, "exist": 6, "harm": 6, "novel": 6, "associ": 6, "autonom": 6, "exacerb": 6, "specif": 6, "guidanc": 6, "govern": 6, "organ": 6, "privat": 6, "sector": 6, "openai": 6, "anthrop": 6, "googl": 6, "rubric": 6, "mlcommon": 6, "centr": 6, "porquoi": 6, "red": 6, "team": 6, "constitut": 6, "explain": 6, "xai": 6, "plan": 6, "phase": 6, "definit": 6, "research": [6, 7], "identif": 6, "framework": [6, 7], "architectur": 6, "implement": 6, "select": 6, "go": 6, "market": 6, "technic": 6, "compon": 6, "salad": 6, "bench": 6, "truthfulqa": 6, "harmbench": 6, "safebench": 6, "techniqu": [6, 7], "repres": 6, "layer": 6, "map": 6, "rule": 6, "filter": 6, "moder": 6, "bad": 6, "good": 6, "guard": 6, "judg": 6, "valid": 6, "engin": 7, "json": 7, "mode": 7, "logit": 7, "process": 7, "outlin": 7, "langchain": 7, "best": 7, "compar": 7, "solut": 7, "ongo": 7, "debat": 7, "acknowledg": 7}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author(s)": [[0, "about-the-author-s"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Managing Input Data": [[2, "chapter-2-managing-input-data"]], "Chapter 3: Structured Output": [[2, "chapter-3-structured-output"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Cost Factor": [[2, "chapter-7-the-cost-factor"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [4, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "Citation": [[3, "citation"], [4, "citation"], [5, "citation"], [6, "citation"], [7, "citation"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"]], "The Evals Gap": [[4, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[4, "non-deterministic-generative-machines"]], "Emerging Properties": [[4, "emerging-properties"]], "Problem Statement": [[4, "problem-statement"], [7, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[4, "evals-table"]], "Evals Design": [[4, "evals-design"]], "LLM Application Testing Requirements Matrix": [[4, "validation-requirements"]], "Conceptual Overview": [[4, "conceptual-overview"]], "Design Considerations": [[4, "design-considerations"]], "Metrics": [[4, "metrics"]], "Key Metrics for Evaluating Generative Tasks": [[4, "key-metrics"]], "Evaluators": [[4, "evaluators"]], "Model-Based Evaluation": [[4, "model-based-evaluation"]], "Evaluating Evaluators": [[4, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[4, "benchmarks-and-leaderboards"]], "Tools": [[4, "tools"], [7, "tools"]], "LightEval": [[4, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[4, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[4, "model-families"]], "LangSmith": [[4, "langsmith"]], "PromptFoo": [[4, "promptfoo"]], "Comparison": [[4, "comparison"], [5, "comparison"], [5, "id36"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[4, "tool-comparison"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"]], "Local LLMs in Practice": [[5, "local-llms-in-practice"]], "Models Considerations": [[5, "models-considerations"]], "Task Suitability": [[5, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[5, "llama2-benchmark"]], "Performance & Cost": [[5, "performance-cost"]], "Licensing": [[5, "licensing"]], "Open Source LLMs.": [[5, "open-source-llms"]], "Community Support": [[5, "community-support"]], "Customization": [[5, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[5, "mistral-costs"]], "Tools for Local LLM Deployment": [[5, "tools-for-local-llm-deployment"]], "Serving Models": [[5, "serving-models"]], "LLama.cpp": [[5, "llama-cpp"]], "Llamafile": [[5, "llamafile"]], "Ollama": [[5, "ollama"], [7, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[5, "feature-comparison-local"]], "UI": [[5, "ui"]], "LM Studio": [[5, "lm-studio"]], "Jan": [[5, "jan"]], "Open WebUI": [[5, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[5, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[5, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[5, "prompts-dataset"]], "Quantization": [[5, "quantization"]], "Quantization Levels": [[5, "quantization-levels"]], "Benchmarking": [[5, "benchmarking"], [6, "benchmarking"]], "Results": [[5, "results"]], "Quantization Benchmarks": [[5, "quantization-benchmarks"]], "Benchmarking Hardware": [[5, "benchmarking-hardware"]], "Takeaways": [[5, "takeaways"], [6, "takeaways"]], "Safety": [[6, "safety"]], "Safety Risks": [[6, "safety-risks"]], "General AI Safety Risks": [[6, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[6, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[6, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[6, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[6, "llms-specific-safety-risks"]], "Guidance": [[6, "guidance"]], "Governments & Organizations": [[6, "governments-organizations"]], "Private Sector": [[6, "private-sector"]], "OpenAI": [[6, "openai"]], "Anthropic": [[6, "anthropic"]], "Google": [[6, "google"]], "Rubrics": [[6, "rubrics"]], "MLCommons AI Safety Benchmark": [[6, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[6, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[6, "porquoi"]], "Approaches": [[6, "approaches"]], "Red Teaming": [[6, "red-teaming"]], "Constitutional AI": [[6, "constitutional-ai"]], "Explainable AI (XAI)": [[6, "explainable-ai-xai"]], "Designing a Safety Plan": [[6, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[6, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[6, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[6, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[6, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[6, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[6, "phase-6-go-to-market"]], "Common Pitfalls": [[6, "common-pitfalls"]], "Technical Implementation Components": [[6, "technical-implementation-components"]], "Benchmarks & Datasets": [[6, "benchmarks-datasets"]], "SALAD-Bench": [[6, "salad-bench"]], "TruthfulQA": [[6, "truthfulqa"]], "HarmBench": [[6, "harmbench"]], "SafeBench": [[6, "safebench"]], "Tools & Techniques": [[6, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[6, "safety-layer-table"]], "Rules-Based Safety Filtering": [[6, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[6, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[6, "llm-based-safety-filtering"]], "Custom Moderation": [[6, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[6, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[6, "evals-dataset"]], "Bad Samples": [[6, "bad-samples"]], "Good Samples": [[6, "good-samples"]], "Safety Filters": [[6, "safety-filters"]], "LLM-Guard": [[6, "llm-guard"]], "Mistral Moderation API": [[6, "mistral-moderation-api"]], "OpenAI Moderation API": [[6, "openai-moderation-api"]], "Custom Judge Validator": [[6, "custom-judge-validator"]], "Structured Output": [[7, "structured-output"]], "Techniques": [[7, "techniques"]], "Prompt Engineering": [[7, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[7, "json-mode-fine-tuned"]], "Logit Post-Processing": [[7, "logit-post-processing"]], "Outlines": [[7, "outlines"]], "LangChain": [[7, "langchain"]], "Discussion": [[7, "discussion"]], "Best Practices": [[7, "best-practices"]], "Comparing Solutions": [[7, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[7, "structured-output-frameworks"]], "Research and Ongoing Debate": [[7, "research-and-ongoing-debate"]], "Acknowledgements": [[7, "acknowledgements"]]}, "indexentries": {}}) \ No newline at end of file +Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/cost", "notebooks/evals", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/cost.ipynb", "notebooks/evals.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["2. About the Book", "1. Preface", "Taming LLMs", "6. Preference-Based Alignment", "8. The Falling Cost Paradox", "3. The Evals Gap", "7. Local LLMs in Practice", "5. Safety", "4. Structured Output"], "terms": {"am": [0, 7], "alwai": [0, 3, 4, 5, 8], "do": [0, 3, 4, 5, 6, 7, 8], "which": [0, 3, 4, 5, 6, 7, 8], "cannot": [0, 3, 4, 5, 6, 7], "order": [0, 3, 5, 7, 8], "mai": [0, 1, 3, 4, 5, 6, 7, 8], "learn": [0, 3, 5, 6, 7, 8], "how": [0, 1, 3, 4, 5, 6, 7, 8], "pablo": [0, 5], "picasso": 0, "In": [0, 3, 4, 5, 6, 7, 8], "recent": [0, 3, 4, 5, 6, 7, 8], "year": [0, 2, 3, 4, 5, 6, 7, 8], "larg": [0, 1, 2, 3, 4, 5, 6, 7, 8], "languag": [0, 1, 2, 4, 5, 6, 7, 8], "model": [0, 1, 2, 4, 7, 8], "llm": [0, 1, 3, 8], "have": [0, 1, 3, 4, 5, 6, 7, 8], "emerg": [0, 3, 4, 6, 7, 8], "transform": [0, 1, 3, 5, 6, 7, 8], "forc": [0, 5, 8], "technologi": [0, 1, 4, 5, 6, 7], "promis": [0, 3, 4, 5, 7], "revolution": [0, 7], "build": [0, 2, 3, 5, 6, 7, 8], "product": [0, 1, 2, 3, 4, 5, 6, 7, 8], "interact": [0, 3, 4, 5, 6, 7, 8], "comput": [0, 3, 4, 5, 6, 7, 8], "from": [0, 1, 4, 5, 6, 7, 8], "chatgpt": [0, 3, 4, 6, 8], "github": [0, 2, 3, 4, 5, 6, 7, 8], "copilot": 0, "claud": [0, 3, 5, 6, 7], "artifact": 0, "system": [0, 3, 4, 5, 6, 7, 8], "captur": [0, 1, 3, 5, 6, 7], "public": [0, 3, 5, 6, 7], "imagin": [0, 6], "spark": 0, "gold": [0, 3, 5, 7], "rush": 0, "ai": [0, 3, 4, 5, 6, 8], "power": [0, 2, 3, 4, 5, 6, 7, 8], "applic": [0, 1, 2, 3, 4, 6, 7, 8], "howev": [0, 3, 4, 5, 6, 7, 8], "beneath": 0, "surfac": [0, 5], "technolog": [0, 1, 4, 5, 7], "revolut": [0, 4], "li": [0, 3, 5, 6, 7, 8], "complex": [0, 1, 3, 5, 6, 7, 8], "landscap": [0, 3, 5, 6], "practition": [0, 1, 4, 5, 6, 8], "must": [0, 3, 4, 5, 6, 7, 8], "navig": [0, 2, 5, 6, 7], "focus": [0, 3, 4, 5, 6, 7, 8], "bring": [0, 3, 6], "awar": [0, 3, 4, 5, 7], "limit": [0, 1, 2, 4, 5, 6, 7, 8], "har": [0, 2, 5], "solut": [0, 2, 4, 5, 6, 7], "overcom": [0, 5], "them": [0, 1, 3, 4, 5, 6, 7, 8], "robust": [0, 3, 4, 5, 6, 7, 8], "It": [0, 3, 4, 5, 6, 7, 8], "offer": [0, 3, 4, 5, 6, 7, 8], "critic": [0, 2, 3, 4, 5, 6, 7, 8], "implement": [0, 2, 3, 4, 5, 6, 8], "back": [0, 5, 6, 7, 8], "reproduc": [0, 1, 2, 5, 6], "exampl": [0, 1, 2, 3, 5, 6, 7, 8], "while": [0, 1, 2, 3, 4, 5, 6, 7, 8], "mani": [0, 1, 3, 4, 5, 6, 7, 8], "resourc": [0, 3, 4, 5, 6, 7], "cover": [0, 3, 4, 5, 6, 7, 8], "capabl": [0, 1, 2, 4, 5, 6, 7, 8], "specif": [0, 3, 4, 5, 6, 8], "hidden": [0, 3, 7], "pitfal": [0, 1, 3, 4, 5, 6, 8], "engin": [0, 1, 2, 3, 4, 5, 6, 7], "technic": [0, 1, 2, 3, 5, 6, 8], "manag": [0, 1, 4, 5, 6, 7, 8], "face": [0, 3, 4, 5, 6, 7], "when": [0, 1, 2, 3, 4, 5, 6, 7, 8], "comprehens": [0, 2, 3, 4, 5, 6, 7, 8], "guid": [0, 1, 3, 4, 5, 6, 7, 8], "leverag": [0, 3, 5, 6, 7, 8], "battl": [0, 2], "test": [0, 2, 3, 4, 6, 7, 8], "tool": [0, 1, 3, 4], "throughout": [0, 4, 5, 6, 7], "tackl": [0, 3, 5, 7], "follow": [0, 3, 4, 5, 6, 7, 8], "non": [0, 3, 6, 7, 8], "exhaust": [0, 6], "list": [0, 3, 5, 6, 7, 8], "structur": [0, 3, 4, 5, 6, 7], "un": 0, "reliabl": [0, 1, 3, 4, 5, 6, 7, 8], "struggl": [0, 1, 3, 5, 6, 7, 8], "maintain": [0, 1, 3, 4, 5, 6, 7, 8], "consist": [0, 1, 3, 4, 5, 6, 7, 8], "output": [0, 1, 3, 5, 6, 7], "format": [0, 3, 4, 5, 6, 7, 8], "complic": [0, 7], "integr": [0, 1, 3, 4, 5, 6, 7, 8], "larger": [0, 3, 4, 5, 6, 7, 8], "make": [0, 3, 4, 5, 6, 7, 8], "error": [0, 3, 5, 7, 8], "handl": [0, 3, 4, 5, 6, 7, 8], "more": [0, 1, 3, 5, 6, 7, 8], "size": [0, 3, 5, 6, 7, 8], "length": [0, 3, 5, 6, 8], "constraint": [0, 1, 3, 4, 5, 6, 7, 8], "strict": [0, 6, 7, 8], "token": [0, 1, 3, 4, 5, 6, 7, 8], "both": [0, 3, 4, 5, 6, 7], "input": [0, 3, 5, 6, 7, 8], "requir": [0, 3, 6, 7, 8], "care": [0, 3, 4, 5, 6, 7, 8], "chunk": [0, 3, 6], "strategi": [0, 3, 4, 5, 6, 7, 8], "long": [0, 1, 3, 4, 5, 6, 7, 8], "form": [0, 3, 4, 5, 6, 7, 8], "effect": [0, 1, 3, 4, 5, 7, 8], "tradit": [0, 3, 6, 7], "softwar": [0, 1, 3, 4, 6, 7, 8], "methodologi": [0, 3, 5, 6, 7, 8], "break": [0, 1, 3, 4, 5, 7], "down": [0, 1, 4, 5, 6, 7], "deal": [0, 3, 6], "determinist": [0, 8], "gener": [0, 1, 4, 6, 8], "new": [0, 2, 3, 4, 5, 6, 7, 8], "hallucin": [0, 1, 3, 5, 7, 8], "These": [0, 3, 4, 5, 6, 7, 8], "can": [0, 1, 3, 4, 5, 6, 7, 8], "plausibl": [0, 7], "sound": [0, 7], "entir": [0, 4, 5, 6, 8], "fabric": [0, 5, 7], "inform": [0, 3, 4, 5, 6, 7, 8], "creat": [0, 1, 3, 4, 5, 6, 7, 8], "signific": [0, 3, 4, 5, 6, 7, 8], "risk": [0, 1, 3, 4, 5, 6], "safeti": [0, 3, 5, 8], "align": [0, 4, 5, 6, 7, 8], "harm": [0, 3, 5, 6], "bias": [0, 3, 5, 6, 7, 8], "inappropri": [0, 3, 7], "safeguard": [0, 5, 7], "monitor": [0, 3, 4, 5, 6, 7], "ensur": [0, 3, 4, 5, 6, 7, 8], "safe": [0, 3, 5, 7, 8], "deploy": [0, 3, 4, 5, 7, 8], "cost": [0, 3, 5, 7, 8], "optim": [0, 1, 5, 6, 7], "The": [0, 1, 3, 7, 8], "financi": [0, 1, 3, 4, 5, 7, 8], "oper": [0, 3, 5, 6, 7, 8], "base": [0, 1, 4, 6, 8], "quickli": [0, 3, 4, 6], "becom": [0, 3, 4, 5, 6, 7, 8], "prohibit": [0, 3, 5, 6], "without": [0, 1, 3, 4, 5, 6, 7, 8], "observ": [0, 3, 4, 5, 6, 7, 8], "vendor": [0, 4, 5, 6], "lock": [0, 3, 4, 6], "cloud": [0, 3, 4, 5, 6, 7, 8], "provid": [0, 2, 3, 4, 5, 6, 7, 8], "depend": [0, 3, 4, 5, 6, 8], "through": [0, 1, 2, 3, 4, 5, 6, 7, 8], "proprietari": [0, 3, 6, 7, 8], "infrastructur": [0, 4, 6], "difficult": [0, 3, 5, 7], "switch": [0, 6], "self": [0, 3, 5, 6, 7, 8], "host": [0, 4, 5, 6, 7], "take": [0, 2, 3, 4, 5, 6, 7, 8], "hand": [0, 6, 7, 8], "focu": [0, 2, 3, 4, 5, 6, 7, 8], "access": [0, 3, 4, 5, 6, 7, 8], "all": [0, 1, 3, 4, 5, 6, 7, 8], "ar": [0, 1, 3, 4, 5, 6, 7, 8], "fulli": [0, 3, 5, 7], "document": [0, 3, 4, 5, 6, 7, 8], "allow": [0, 5, 6, 7, 8], "reader": [0, 2], "replic": [0, 5, 7, 8], "result": [0, 3, 4, 5, 7, 8], "exactli": [0, 5, 8], "design": [0, 1, 3, 6, 8], "run": [0, 3, 4, 5, 6, 7, 8], "consum": [0, 3, 4, 5, 6, 7, 8], "grade": [0, 3, 4, 5, 6, 7], "hardwar": [0, 3, 4, 5], "expens": [0, 3, 4, 5, 6, 7], "avail": [0, 3, 4, 5, 6, 7, 8], "notebook": [0, 3, 8], "modifi": [0, 3, 5, 7, 8], "extend": [0, 3, 4, 5, 6, 8], "built": [0, 5, 6, 7, 8], "us": [0, 1, 3, 4, 6, 7, 8], "free": [0, 1, 3, 5, 6, 7], "everyon": [0, 5, 6], "minim": [0, 3, 4, 5, 6, 7, 8], "framework": [0, 3, 4, 5, 6], "wai": [0, 3, 4, 5, 6, 7, 8], "priorit": [0, 3, 5, 6, 7], "transpar": [0, 3, 4, 5, 6, 7], "visibl": [0, 5], "being": [0, 3, 4, 5, 6, 7, 8], "better": [0, 2, 3, 4, 5, 6, 7], "understand": [0, 1, 2, 3, 4, 5, 6, 7, 8], "custom": [0, 3, 5, 8], "flexibl": [0, 4, 5, 6, 7, 8], "adapt": [0, 3, 4, 5, 6, 7], "case": [0, 4, 5, 8], "unlik": [0, 3, 5, 6], "black": [0, 3], "box": [0, 6], "commerci": [0, 3, 5, 6, 7, 8], "most": [0, 3, 4, 5, 6, 7, 8], "freeli": [0, 8], "foster": [0, 3, 5, 7, 8], "reduc": [0, 3, 4, 5, 6, 7, 8], "independ": [0, 5, 7, 8], "freedom": [0, 6, 8], "architectur": [0, 3, 4, 5, 6, 8], "decis": [0, 3, 4, 5, 6, 7], "keep": [0, 3, 5, 6, 7], "principl": [0, 3, 5, 6, 7], "itself": [0, 3, 5, 6, 7], "live": [0, 1, 5, 7], "evolv": [0, 3, 4, 5, 6, 7], "chang": [0, 3, 5, 6, 7, 8], "encourag": [0, 3, 5, 7, 8], "report": [0, 3, 5, 6, 7, 8], "suggest": [0, 3, 5, 6, 7, 8], "improv": [0, 3, 4, 5, 6, 7, 8], "contribut": [0, 4, 5, 6, 7], "via": [0, 3, 4, 5, 6, 7, 8], "pull": [0, 6], "request": [0, 3, 4, 5, 6, 7, 8], "share": [0, 3, 5, 6, 7, 8], "own": [0, 3, 4, 5, 6, 7], "experi": [0, 3, 4, 5, 6, 7, 8], "commun": [0, 3, 4, 5, 7, 8], "propos": [0, 4, 5, 7], "chapter": [0, 3, 4, 5, 6, 7, 8], "section": [0, 3, 4, 5, 6, 7, 8], "found": [0, 3, 4, 5, 6, 8], "http": [0, 1, 2, 3, 4, 5, 6, 7, 8], "com": [0, 2, 3, 4, 5, 6, 7, 8], "souzatharsi": [0, 2, 3, 4, 5, 6, 7, 8], "tamingllm": [0, 2, 3, 4, 5, 6, 7, 8], "whether": [0, 3, 4, 5, 6, 7, 8], "you": [0, 1, 3, 4, 5, 6, 7, 8], "ve": [0, 6], "typo": [0, 7], "want": [0, 1, 3, 6, 7, 8], "welcom": 0, "look": [0, 2, 3, 4, 5, 6, 7], "our": [0, 1, 3, 4, 5, 6, 7, 8], "goal": [0, 1, 3, 5, 7, 8], "discourag": 0, "enabl": [0, 3, 4, 5, 6, 7, 8], "By": [0, 1, 2, 3, 5, 7, 8], "upfront": [0, 2, 4], "equip": [0, 2, 5, 7], "avoid": [0, 3, 5, 6, 7, 8], "current": [0, 2, 3, 4, 5, 7, 8], "discours": [0, 2], "around": [0, 2, 3, 5, 6, 7, 8], "tend": [0, 2, 5, 7], "toward": [0, 3, 5, 7, 8], "extrem": [0, 3, 4, 5, 7], "either": [0, 3, 5, 6, 7, 8], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 5], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7, 8], "rather": [0, 1, 3, 4, 5, 6, 7], "than": [0, 1, 3, 5, 6, 7, 8], "theoret": [0, 3], "examin": [0, 3, 5, 6, 7, 8], "first": [0, 1, 3, 4, 5, 6, 7, 8], "everi": [0, 4, 5, 7], "concept": [0, 3, 5, 7], "illustr": [0, 3, 5, 6, 7, 8], "execut": [0, 5, 6, 7], "immedi": [0, 3, 4, 5, 6], "analysi": [0, 1, 3, 4, 5, 6, 7], "balanc": [0, 3, 4, 5, 6, 7, 8], "help": [0, 3, 4, 5, 6, 7, 8], "intend": [0, 5, 6, 7], "develop": [0, 1, 3, 4, 5, 6, 7, 8], "step": [0, 1, 3, 4, 5, 6, 7, 8], "insight": [0, 3, 4, 5, 6, 7, 8], "along": [0, 3, 4, 5, 6, 7], "guidanc": [0, 3, 8], "could": [0, 1, 3, 4, 5, 6, 7, 8], "derail": 0, "project": [0, 3, 4, 5, 6, 7], "earli": [0, 3, 4, 5, 7, 8], "befor": [0, 3, 4, 5, 7, 8], "thei": [0, 1, 3, 4, 5, 6, 7, 8], "costli": [0, 5, 7], "problem": [0, 1, 2, 3, 4, 6, 7], "too": [0, 1, 3, 5, 6, 7], "late": [0, 3, 4, 7], "lifecycl": [0, 6, 7], "lead": [0, 1, 3, 4, 5, 6, 7, 8], "genai": [0, 1, 3, 7], "initi": [0, 1, 3, 4, 5, 6, 7, 8], "leader": [0, 2, 5], "advoc": [0, 7], "anyon": [0, 7], "seek": [0, 5, 6, 7], "work": [0, 1, 3, 5, 6, 7, 8], "typic": [0, 3, 4, 5, 6, 7, 8], "job": [0, 5, 6, 7], "role": [0, 3, 5, 6, 7, 8], "platform": [0, 5, 6, 7, 8], "backend": [0, 3, 5], "exist": [0, 3, 4, 5, 6], "ml": [0, 7], "transit": [0, 4, 5, 6, 8], "overse": 0, "motiv": [0, 3, 4, 5, 8], "need": [0, 3, 4, 5, 6, 7, 8], "readi": [0, 5, 7], "desir": [0, 3, 5, 8], "perform": [0, 3, 5, 7, 8], "after": [0, 1, 3, 5, 6, 7, 8], "read": [0, 3, 4, 5, 7, 8], "implic": [0, 1, 3, 5, 7], "recommend": [0, 3, 5, 6, 7, 8], "abl": [0, 3, 5, 8], "deploi": [0, 3, 5, 6, 7], "proper": [0, 3, 4, 6, 7, 8], "realist": [0, 3, 4, 7], "effort": [0, 5, 6, 7, 8], "estim": [0, 4, 5, 7], "impact": [0, 3, 4, 5, 6, 7, 8], "timelin": 0, "To": [0, 3, 5, 6, 7, 8], "should": [0, 3, 4, 5, 6, 7, 8], "basic": [0, 3, 5, 6, 7], "program": [0, 5, 6, 8], "knowledg": [0, 3, 5, 6, 7], "introductori": [0, 1, 2], "langchain": [0, 5], "e": [0, 1, 3, 4, 5, 6, 7, 8], "g": [0, 3, 4, 5, 6, 7, 8], "chat": [0, 3, 5, 6, 7, 8], "prompt": [0, 4, 5, 7], "templat": [0, 5, 8], "openai": [0, 3, 5, 6, 8], "anthrop": [0, 3, 8], "similar": [0, 3, 4, 5, 6, 8], "dive": [0, 4], "here": [0, 2, 3, 4, 5, 6, 7, 8], "get": [0, 3, 4, 5, 6, 7, 8], "start": [0, 3, 4, 5, 6, 7, 8], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6, 7], "virtual": [0, 5], "m": [0, 3, 5, 6, 7, 8], "venv": [0, 8], "tame": [0, 3, 4, 5, 6, 7, 8], "env": [0, 3, 5, 7, 8], "bin": [0, 6], "On": [0, 5, 6, 8], "window": [0, 4, 5, 6], "script": [0, 6], "try": [0, 1, 3, 5, 7, 8], "contain": [0, 3, 4, 5, 6, 7, 8], "possibl": [0, 3, 4, 5, 6, 7, 8], "includ": [0, 1, 3, 4, 5, 6, 7, 8], "necessari": [0, 3, 4, 5, 7], "instal": [0, 3, 5, 6, 8], "go": [0, 3, 5, 8], "feel": [0, 6], "prefer": [0, 5, 6, 7, 8], "packag": [0, 4, 5, 6, 8], "pip": [0, 3, 5, 6, 8], "poetri": [0, 7], "file": [0, 3, 5, 6, 7, 8], "root": [0, 3], "directori": [0, 5, 6], "add": [0, 3, 5, 6, 7], "other": [0, 3, 4, 5, 6, 7, 8], "sensit": [0, 3, 4, 5, 6, 7], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 8], "commit": [0, 3, 5, 7], "version": [0, 3, 4, 5, 6, 7, 8], "control": [0, 1, 3, 4, 5, 6, 7, 8], "kept": [0, 5], "privat": [0, 5], "If": [0, 1, 3, 4, 5, 6, 7, 8], "encount": [0, 2, 5, 7], "rate": [0, 3, 4, 5, 6, 7], "consid": [0, 3, 4, 5, 6, 7, 8], "smaller": [0, 3, 4, 5, 6, 8], "retri": [0, 8], "logic": [0, 1, 3, 5, 7], "conflict": [0, 3, 5], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7, 8], "check": [0, 5, 6, 7, 8], "page": [0, 5, 6], "known": [0, 5, 7, 8], "now": [0, 1, 3, 4, 5, 6, 7, 8], "let": [0, 3, 4, 5, 6, 7, 8], "begin": [0, 5, 6, 7, 8], "explor": [0, 1, 3, 4, 5, 6, 7, 8], "dr": [0, 3], "tharsi": [0, 2, 3, 4, 5, 6, 7, 8], "souza": [0, 2, 3, 4, 5, 6, 7, 8], "scientist": [0, 1, 6, 7], "special": [0, 4, 5, 6, 7, 8], "he": [0, 3, 5, 7], "lectur": 0, "columbia": 0, "univers": [0, 5, 6, 7], "master": [0, 4, 6, 8], "scienc": [0, 3, 5, 7], "appli": [0, 3, 5, 6, 7, 8], "analyt": 0, "incom": [0, 5], "head": [0, 3, 5, 7, 8], "equiti": [0, 5], "citadel": 0, "former": [0, 1, 5, 6], "senior": [0, 5], "vp": 0, "two": [0, 3, 4, 5, 6, 7, 8], "sigma": [0, 3], "invest": [0, 3, 4, 5, 7], "also": [0, 3, 4, 5, 6, 7, 8], "enjoi": 0, "mentor": 0, "under": [0, 3, 4, 5, 6, 7, 8], "repres": [0, 3, 4, 5, 6, 8], "student": [0, 3, 7], "profession": [0, 3, 5, 7, 8], "divers": [0, 3, 4, 5, 7], "global": [0, 5, 7], "ecosystem": [0, 4, 5, 6], "With": [0, 3, 5, 6, 7, 8], "over": [0, 2, 3, 4, 5, 6, 7, 8], "15": [0, 5, 6, 7, 8], "deliv": [0, 4, 5, 6], "across": [0, 1, 3, 4, 5, 6, 7, 8], "startup": 0, "fortun": 0, "500": [0, 3, 5, 7], "compani": [0, 3, 4, 5, 7, 8], "numer": [0, 4, 5, 7, 8], "scholarli": 0, "frequent": [0, 5, 6, 8], "speaker": [0, 5], "academ": [0, 3, 5, 7], "busi": [0, 5, 6, 7], "confer": [0, 8], "ground": [0, 3, 5, 6], "background": [0, 1, 5, 6], "draw": [0, 3, 5, 7, 8], "scale": [0, 3, 4, 5, 6, 7, 8], "stage": [0, 3, 7, 8], "major": [0, 3, 4, 5, 7, 8], "institut": [0, 5, 7], "well": [0, 3, 4, 5, 6, 7, 8], "advis": [0, 3], "profit": [0, 5, 7, 8], "organ": [0, 3, 4, 5, 6], "uniqu": [0, 3, 4, 5, 6, 7, 8], "bridg": [0, 6, 7], "gap": [0, 1, 3, 4, 6, 7], "between": [0, 1, 3, 4, 5, 6, 7, 8], "potenti": [0, 1, 3, 4, 5, 6, 7, 8], "next": [0, 1, 3, 4, 5, 6, 7, 8], "hold": [0, 3, 5], "ph": [0, 7], "d": [0, 3, 4, 5, 6, 7, 8], "ucl": 0, "london": 0, "phil": [0, 7], "sc": 0, "b": [0, 4, 5, 6, 7, 8], "tell": [1, 3, 7], "mere": [1, 5], "what": [1, 3, 4, 5, 6, 7, 8], "someth": [1, 5, 6], "i": [1, 2, 4, 5, 6, 7, 8], "emanuel": [1, 3, 5, 7], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7, 8], "altern": [1, 3, 4, 5, 6, 7], "titl": [1, 2, 3, 4, 5, 6, 7, 8], "thi": [1, 2, 3, 4, 5, 6, 7, 8], "book": [1, 5], "been": [1, 3, 4, 5, 6, 7], "behav": 1, "badli": 1, "come": [1, 3, 5, 6, 7, 8], "notic": [1, 3, 4, 5, 7], "parallel": [1, 3, 5, 6], "": [1, 3, 4, 5, 6, 7, 8], "semin": [1, 7], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7, 8], "caution": 1, "against": [1, 3, 4, 5, 6, 7], "treat": [1, 5, 7], "perfect": [1, 5, 6], "represent": [1, 5, 6, 7], "realiti": [1, 7], "aim": [1, 3, 4, 5, 6, 7, 8], "highlight": [1, 3, 5, 6, 7, 8], "practic": [1, 3, 4, 5, 7], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 5, 6], "fail": [1, 3, 5, 7], "we": [1, 3, 4, 5, 6, 7, 8], "mistak": [1, 7], "approxim": [1, 4, 5, 8], "full": [1, 3, 4, 5, 6, 7, 8], "assumpt": [1, 5, 7], "core": [1, 4, 5, 6, 7], "premis": [1, 6], "hi": [1, 5, 7, 8], "aspect": [1, 3, 5, 7], "world": [1, 3, 4, 5, 6, 7, 8], "inher": [1, 2, 3, 5, 7, 8], "involv": [1, 3, 4, 5, 6, 7, 8], "simplif": 1, "argu": [1, 4, 7, 8], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 5, 7], "partli": 1, "becaus": [1, 3, 5, 7], "peopl": [1, 3, 5, 6, 7], "put": [1, 5, 6], "much": [1, 3, 5, 6], "faith": 1, "mathemat": [1, 5, 6, 8], "recogn": [1, 3, 5, 7], "human": [1, 4, 5, 6, 7, 8], "behavior": [1, 3, 5, 6, 7], "market": [1, 4, 5, 6, 8], "dynam": [1, 3, 5, 7], "fact": [1, 3, 5, 7], "reason": [1, 3, 5, 6, 7, 8], "Their": [1, 5, 8], "respons": [1, 4, 5, 6, 7, 8], "often": [1, 3, 4, 5, 6, 7, 8], "convinc": [1, 3], "probabilist": [1, 5, 8], "train": [1, 4, 5, 6, 7, 8], "data": [1, 4, 5, 6, 7, 8], "true": [1, 3, 4, 5, 7, 8], "even": [1, 3, 4, 5, 6, 7, 8], "though": [1, 3, 4, 5, 6, 7, 8], "insist": 1, "machin": [1, 3, 6, 7, 8], "todai": [1, 4, 6, 8], "grow": [1, 3, 5, 6, 7, 8], "pervas": [1, 7], "belief": [1, 6, 7], "solv": [1, 3, 4, 5, 6, 7, 8], "ani": [1, 3, 4, 5, 6, 7, 8], "context": [1, 3, 4, 5, 6, 7, 8], "content": 1, "wish": [1, 5], "user": [1, 4, 5, 6, 8], "moreov": 1, "were": [1, 3, 5, 6, 7, 8], "predict": [1, 3, 5, 6, 7, 8], "chatbot": [1, 3, 5, 6, 7], "twist": [1, 7], "wrap": [1, 6, 8], "further": [1, 3, 4, 5, 6, 7, 8], "daili": [1, 4, 6, 7], "life": [1, 5, 6, 7], "workflow": [1, 4, 5, 6, 7, 8], "affect": [1, 5, 6, 7], "decid": [1, 3, 5], "action": [1, 3, 5, 7], "coupl": [1, 6], "lack": [1, 3, 5, 7, 8], "pose": [1, 3, 5, 7, 8], "still": [1, 4, 5, 6, 7], "figur": [1, 5, 6], "out": [1, 3, 4, 5, 6, 7, 8], "serv": [1, 3, 4, 5, 7, 8], "builder": [1, 6], "who": [1, 3, 5, 6, 7, 8], "remain": [1, 3, 4, 5, 6, 7], "clear": [1, 3, 4, 5, 6, 7, 8], "ei": 1, "about": [1, 3, 4, 5, 6, 7, 8], "therefor": [1, 3, 5, 6, 7], "end": [1, 3, 4, 5, 6, 7, 8], "detail": [1, 3, 4, 5, 6, 7, 8], "python": [1, 2, 5, 6, 7, 8], "code": [1, 2, 3, 5, 6, 7, 8], "diminish": [1, 3, 4, 5], "promot": [1, 3, 5, 7], "nuanc": [1, 3, 5, 6, 7, 8], "acknowledg": [1, 5, 7], "within": [1, 3, 4, 5, 7, 8], "trustworthi": [1, 7], "taught": 1, "u": [1, 3, 5, 7, 8], "where": [1, 3, 4, 5, 6, 7, 8], "der11": 1, "why": [1, 3, 5, 7, 8], "confus": [1, 4, 7], "illus": 1, "disast": [1, 5], "wall": [1, 6], "street": [1, 6], "press": [1, 5, 6], "isbn": [1, 3, 5], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7, 8], "googl": [1, 5, 6, 8], "co": [1, 3, 4, 5, 6, 7], "uk": [1, 7], "id": [1, 5, 6, 7, 8], "lke_cwm4wm8c": 1, "sign": [2, 5, 7], "up": [2, 3, 4, 5, 6, 7], "receiv": [2, 3, 5, 6, 7, 8], "updat": [2, 3, 4, 5, 6, 7, 8], "abstract": [2, 5, 7, 8], "heavili": [2, 3, 4, 5, 7, 8], "gloss": 2, "fundament": [2, 3, 5, 6, 7, 8], "challeng": [2, 3, 4, 5, 6, 7, 8], "convers": [2, 3, 4, 5, 6, 7, 8], "kei": [2, 3, 4, 6, 7, 8], "proven": [2, 4], "yet": [2, 3, 4, 5, 7], "concret": [2, 4, 7, 8], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7, 8], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7, 8], "author": [2, 3, 4, 5, 6, 7, 8], "t": [2, 3, 4, 5, 6, 7, 8], "p": [2, 3, 4, 5, 6, 7, 8], "2024": [2, 3, 4, 5, 7, 8], "journal": [2, 3, 4, 5, 6, 7, 8], "repositori": [2, 3, 4, 5, 6, 7, 8], "valu": [3, 5, 6, 7, 8], "its": [3, 4, 5, 6, 7, 8], "privileg": 3, "abov": [3, 5, 7], "soon": [3, 8], "lose": [3, 5], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7, 8], "3": [3, 4, 5, 6, 8], "5": [3, 4, 5, 6, 8], "2022": [3, 5, 6, 7], "mark": [3, 5, 7], "pivot": [3, 5, 6], "moment": 3, "histori": [3, 4, 5, 6], "artifici": [3, 5, 6, 7], "intellig": [3, 5, 6, 7], "five": [3, 5, 7], "dai": [3, 4, 5, 6, 7, 8], "launch": [3, 5, 7], "attract": [3, 5], "million": [3, 4, 5, 6], "month": [3, 4, 5, 6, 7], "becam": [3, 4], "fastest": [3, 5, 7], "100": [3, 4, 5, 6, 7, 8], "monthli": [3, 4, 5], "rais": [3, 4, 5, 7], "intrigu": 3, "question": [3, 4, 5, 6, 7, 8], "did": [3, 5, 8], "dramat": [3, 4, 5, 6, 8], "predecessor": 3, "gpt": [3, 4, 5, 6, 7, 8], "had": [3, 5], "same": [3, 5, 6, 7, 8], "number": [3, 4, 5, 6, 7, 8], "paramet": [3, 4, 5, 6, 7, 8], "far": [3, 4, 6, 7], "less": [3, 4, 5, 6, 7], "attent": [3, 4, 6], "arguabl": [3, 6], "feedback": [3, 5, 7, 8], "abil": [3, 4, 5, 6, 7, 8], "least": [3, 5, 7], "ey": 3, "breakthrough": [3, 7], "demonstr": [3, 4, 5, 6, 7, 8], "crucial": [3, 4, 6, 7, 8], "greater": [3, 5, 6, 7], "process": [3, 4, 5, 6, 7], "modern": [3, 5, 8], "techniqu": [3, 4, 5, 6], "direct": [3, 5, 6, 7], "rafailov": 3, "et": [3, 4, 5, 6, 7, 8], "al": [3, 4, 5, 6, 7, 8], "present": [3, 5, 6, 7, 8], "autom": [3, 4, 5, 7, 8], "fashion": [3, 8], "open": [3, 4, 5, 7, 8], "sourc": [3, 4, 5, 7, 8], "common": [3, 4, 5, 6, 8], "pre": [3, 4, 5, 6, 7, 8], "default": [3, 5, 6, 7, 8], "state": [3, 5, 6, 7, 8], "art": [3, 5, 7], "object": [3, 4, 5, 6, 7, 8], "given": [3, 4, 5, 6, 7, 8], "webpag": 3, "internet": [3, 5], "veri": [3, 4, 5, 6, 7], "ask": [3, 5, 6, 7, 8], "instruct": [3, 4, 5, 6, 7, 8], "sai": [3, 8], "ouyang": [3, 7], "2": [3, 4, 5, 8], "explain": [3, 5], "moon": 3, "land": [3, 5, 6], "6": [3, 4, 5, 6], "old": [3, 5], "import": [3, 4, 5, 6, 7, 8], "pipelin": [3, 4, 5, 6, 7, 8], "pipe": [3, 7], "text": [3, 4, 5, 6, 7, 8], "gpt2": [3, 5], "msg": 3, "short": [3, 5, 7, 8], "sentenc": [3, 5, 7], "_": [3, 5, 7, 8], "rang": [3, 4, 5, 6, 7, 8], "len": [3, 5, 6, 7, 8], "print": [3, 4, 5, 6, 7, 8], "f": [3, 4, 5, 6, 7, 8], "n": [3, 5, 6, 7, 8], "1": [3, 4, 5, 6, 8], "0": [3, 4, 5, 6, 7, 8], "generated_text": [3, 8], "good": [3, 5, 6, 8], "idea": [3, 4, 6, 7, 8], "one": [3, 4, 5, 6, 7, 8], "those": [3, 5, 7, 8], "littl": [3, 5], "green": [3, 7], "dot": [3, 4], "Then": [3, 4, 5], "line": [3, 5, 6, 7], "later": [3, 5, 6, 7, 8], "re": [3, 4, 5, 6, 7, 8], "alreadi": [3, 5, 8], "movi": 3, "theori": [3, 5], "some": [3, 5, 6, 7, 8], "mean": [3, 4, 5, 6, 7, 8], "word": [3, 4, 5, 8], "tepid": 3, "articl": [3, 5, 6, 7], "sure": [3, 5, 7, 8], "lunar": 3, "As": [3, 4, 5, 6, 7, 8], "see": [3, 4, 5, 6, 7, 8], "coher": [3, 5, 6, 8], "explan": [3, 5, 7, 8], "child": [3, 5, 7], "nonsens": [3, 7], "meander": 3, "unrel": [3, 5, 7], "topic": [3, 5, 6, 7, 8], "simpl": [3, 5, 6, 7, 8], "appropri": [3, 4, 5, 6, 7, 8], "young": [3, 5, 7], "instead": [3, 4, 5, 6, 7, 8], "address": [3, 4, 5, 6, 7, 8], "issu": [3, 5, 7, 8], "introduc": [3, 5, 6, 7, 8], "rlhf": [3, 4, 7, 8], "intent": [3, 7], "wide": [3, 4, 5, 6, 7, 8], "task": [3, 4, 7, 8], "fig": [3, 4, 5, 6, 7, 8], "collect": [3, 5, 6, 7, 8], "sampl": [3, 6, 8], "label": [3, 5, 6, 7, 8], "comparison": 3, "reward": [3, 5, 6, 7], "sever": [3, 4, 5, 6, 7, 8], "rank": [3, 5, 6, 7], "best": [3, 4, 5, 6, 7], "worst": 3, "rm": [3, 6], "reinforc": [3, 5, 6, 7], "write": [3, 5, 6, 7, 8], "stori": [3, 7], "frog": 3, "calcul": [3, 4, 5, 6, 7, 8], "score": [3, 4, 5, 6, 7, 8], "ppo": [3, 6], "proxim": [3, 6], "iter": [3, 5, 6, 7, 8], "accur": [3, 4, 5, 6, 7], "undesir": [3, 7], "simplifi": [3, 5, 6, 8], "view": [3, 5, 7], "show": [3, 4, 5, 6, 7, 8], "progress": [3, 4, 7], "pattern": [3, 4, 5, 6, 7, 8], "ha": [3, 4, 5, 6, 7, 8], "instanc": [3, 4, 5, 6, 7], "directli": [3, 4, 5, 6, 7, 8], "For": [3, 4, 5, 6, 7, 8], "llama": [3, 4, 5, 7, 8], "guard": 3, "team": [3, 5, 6, 8], "8b": [3, 6, 7, 8], "wa": [3, 4, 5, 6, 7, 8], "classif": [3, 5, 6, 7, 8], "bypass": [3, 7], "similarli": [3, 4, 5, 6, 7], "zephyr": 3, "7b": [3, 5, 6, 7, 8], "alpha": [3, 5, 8], "mistral": [3, 8], "publicli": [3, 5, 8], "assist": [3, 5, 6, 7, 8], "paper": [3, 5, 6, 7, 8], "compon": [3, 5, 6], "particular": [3, 4, 5, 6, 7, 8], "foundat": [3, 4, 5, 6, 7], "advanc": [3, 4, 5, 6, 7, 8], "method": [3, 5, 7, 8], "strong": [3, 5, 6, 7, 8], "At": [3, 4, 5, 6, 8], "high": [3, 4, 5, 6, 7, 8], "level": [3, 4, 5, 7, 8], "carefulli": [3, 4, 5, 6, 7, 8], "curat": [3, 5, 6], "purpos": [3, 5, 6, 7, 8], "exhibit": [3, 5, 6, 7], "domain": [3, 4, 5, 6, 7], "emploi": [3, 5, 7, 8], "prove": [3, 5, 7], "particularli": [3, 4, 5, 6, 7, 8], "valuabl": [3, 5, 6, 8], "scenario": [3, 5, 6, 7, 8], "precis": [3, 4, 5, 6, 7, 8], "style": [3, 5], "tone": 3, "expertis": [3, 5, 7], "medic": [3, 5, 6], "legal": [3, 5, 6, 7], "field": [3, 5, 6, 7, 8], "adher": [3, 5, 7, 8], "guidelin": [3, 5, 7], "servic": [3, 4, 5, 6, 7], "standard": [3, 4, 5, 6, 7], "approach": [3, 5, 6, 8], "each": [3, 4, 5, 6, 7, 8], "distinct": [3, 5, 6, 7, 8], "advantag": [3, 4, 5, 6, 7, 8], "weight": [3, 4, 5, 6, 7, 8], "maximum": [3, 5, 6, 7], "lora": [3, 6, 7], "low": [3, 4, 5, 6, 7, 8], "hu": [3, 7, 8], "2021": [3, 4, 5], "small": [3, 4, 5, 6, 8], "matric": 3, "effici": [3, 4, 5, 6, 7, 8], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 6, 7, 8], "combin": [3, 4, 5, 6, 7, 8], "memori": [3, 4, 5, 6, 7], "footprint": [3, 4, 6], "modest": 3, "increas": [3, 4, 5, 6, 7, 8], "likelihood": [3, 5, 7, 8], "obtain": [3, 5, 6, 7, 8], "probabl": [3, 5, 6, 8], "outcom": [3, 5, 7, 8], "hong": [3, 5], "unintend": [3, 7], "suboptim": 3, "seen": [3, 5, 7], "research": [3, 4, 5, 6], "maxim": [3, 5], "shown": [3, 5, 6, 7], "alon": [3, 5, 6, 7], "gain": [3, 4, 5, 6, 7], "achiev": [3, 4, 5, 6, 7, 8], "bai": [3, 5, 7], "touvron": [3, 6], "sinc": [3, 4, 5, 6, 7, 8], "main": [3, 5, 6, 7, 8], "categori": [3, 5, 6, 7, 8], "algorithm": [3, 5, 7], "meanwhil": [3, 6], "superior": [3, 5, 7], "benchmark": 3, "xu": [3, 5, 6, 7], "schulman": [3, 7], "2017": [3, 5], "popular": [3, 6, 8], "understood": 3, "set": [3, 4, 5, 6, 7, 8], "rule": [3, 5, 6, 8], "govern": [3, 5], "reflect": [3, 5, 6, 7], "anoth": [3, 5, 6, 7], "adjust": [3, 5, 6, 7, 8], "One": [3, 4, 5, 6, 7, 8], "strength": [3, 5, 6, 7], "2024c": [3, 6], "real": [3, 4, 5, 6, 7, 8], "noisi": 3, "delai": [3, 5, 6, 7], "subsequ": [3, 8], "situat": [3, 5, 7], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7, 8], "stabl": [3, 5], "prevent": [3, 4, 5, 7, 8], "overreact": 3, "converg": 3, "due": [3, 5, 6, 7], "simplic": [3, 6], "award": [3, 5], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 6, 7, 8], "4": [3, 4, 5, 6, 8], "fit": [3, 4, 5, 7, 8], "pair": [3, 5, 7], "rl": [3, 7], "find": [3, 4, 5, 6, 7, 8], "contrast": [3, 4, 5, 6, 7, 8], "satisfi": [3, 5], "implicit": [3, 5, 7], "whose": [3, 5], "correspond": [3, 5, 8], "extract": [3, 4, 5, 6, 7, 8], "close": [3, 5, 6, 7], "compar": [3, 4, 5, 6, 7], "assign": [3, 5, 6, 7, 8], "higher": [3, 4, 5, 6, 8], "kl": [3, 6], "diverg": [3, 6], "origin": [3, 4, 5, 6, 7, 8], "preserv": [3, 6, 7, 8], "defin": [3, 4, 5, 6, 7, 8], "equat": 3, "mathcal": 3, "l": [3, 5], "pi_": 3, "theta": [3, 8], "ref": 3, "mathbb": [3, 8], "x": [3, 5, 6, 7, 8], "y_w": 3, "y_l": 3, "sim": [3, 8], "left": [3, 6], "log": [3, 4, 5, 6], "beta": [3, 5, 7, 8], "underbrac": 3, "frac": [3, 6, 7], "color": [3, 5], "red": 3, "right": [3, 5, 6, 7], "respect": [3, 5, 6, 7], "deviat": [3, 5, 6, 7], "straightforward": [3, 5, 6, 7, 8], "librari": [3, 4, 5, 6, 7, 8], "huggingfac": [3, 4, 5, 6, 7], "trl": [3, 6, 7], "2024d": [3, 6], "suit": [3, 5, 7], "friendli": [3, 5, 6], "interfac": [3, 4, 5, 6, 7, 8], "featur": [3, 5, 6, 7, 8], "distinguish": [3, 5, 7], "scalabl": [3, 5, 7], "doe": [3, 5, 6, 7, 8], "pretrain": [3, 5, 6], "hou": [3, 5, 6], "poor": [3, 5, 7], "return": [3, 4, 5, 6, 7, 8], "addit": [3, 4, 5, 6, 7, 8], "benefit": [3, 4, 5, 6, 7, 8], "fix": [3, 5, 6, 7], "invers": 3, "trend": [3, 4, 5, 7], "util": [3, 4, 5, 6, 7], "rapid": [3, 5, 6, 7], "yield": [3, 4, 5], "onli": [3, 4, 5, 6, 7, 8], "margin": [3, 5, 7, 8], "capit": [3, 5, 8], "inaccuraci": [3, 5], "nois": 3, "dure": [3, 4, 5, 6, 7, 8], "accuraci": [3, 4, 5, 6, 7, 8], "lag": [3, 5, 7], "significantli": [3, 4, 5, 6, 7], "indic": [3, 5, 6, 7, 8], "signal": [3, 7], "plateau": 3, "sophist": [3, 5, 6, 7], "previou": [3, 5, 6, 8], "deriv": [3, 5, 6], "pairwis": [3, 5], "feng": [3, 7], "substanti": [3, 4, 5, 6, 7], "wors": [3, 6, 8], "influenc": [3, 5, 7, 8], "success": [3, 4, 5, 6, 7, 8], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 6, 7], "loss": [3, 4, 5, 6, 7], "gradient": [3, 5, 7], "dispref": 3, "unbalanc": 3, "trajectori": [3, 4], "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6, 7], "forward": [3, 5, 7], "futur": [3, 4, 5, 6, 7], "phenomenon": [3, 7, 8], "degrad": [3, 4, 5, 6, 7, 8], "danger": [3, 6, 7], "loop": [3, 5, 6, 7], "recurs": 3, "kazdan": 3, "qualiti": [3, 4, 5, 6, 7, 8], "pollut": 3, "replac": [3, 5, 6], "amplif": 3, "reduct": [3, 4, 5, 6], "express": [3, 4, 5, 7, 8], "catastroph": [3, 7], "forget": [3, 8], "previous": [3, 5, 7, 8], "mitig": [3, 4, 5, 6, 7, 8], "mix": [3, 5, 7, 8], "metric": [3, 6, 7], "sz\u00e9p": 3, "regular": [3, 5, 6, 7, 8], "relev": [3, 4, 5, 6, 7], "scarc": 3, "behaviour": 3, "strateg": [3, 5, 6, 7, 8], "compli": [3, 4, 5, 6, 7, 8], "modif": [3, 5, 6, 7], "outsid": [3, 5], "evidenc": 3, "landmark": 3, "askel": [3, 5, 7], "2024a": [3, 6, 8], "dec": 3, "explicitli": [3, 5, 6], "so": [3, 4, 5, 7, 8], "might": [3, 4, 5, 6, 7, 8], "pretend": 3, "adopt": [3, 5, 6, 7, 8], "actual": [3, 5, 6, 7, 8], "onc": [3, 5, 6, 7], "complet": [3, 5, 6, 7, 8], "describ": [3, 5, 6, 7], "harmless": [3, 7], "told": 3, "retrain": [3, 6], "queri": [3, 5], "tier": [3, 4, 5, 7], "paid": [3, 5], "column": [3, 5, 7], "condit": [3, 5, 8], "toxic": [3, 6, 7], "excerpt": [3, 5, 6], "scratchpad": 3, "refus": [3, 7, 8], "happen": [3, 7], "bomb": [3, 7], "engag": [3, 4, 5, 6, 7, 8], "intern": [3, 5, 7], "unmonitor": 3, "longer": [3, 5, 6], "believ": [3, 5, 6, 7, 8], "act": [3, 5, 6, 7, 8], "therebi": [3, 5], "reveal": [3, 4, 5, 6, 7], "complianc": [3, 4, 5, 6, 7], "phase": [3, 4, 5, 6, 8], "natur": [3, 5, 6, 7, 8], "evid": [3, 5, 6, 7, 8], "seemingli": 3, "surpris": 3, "appear": [3, 5, 7, 8], "criteria": [3, 5, 7], "underli": [3, 5, 7, 8], "anim": [3, 7], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 5, 6, 7, 8], "explicit": [3, 5, 6, 7, 8], "chain": [3, 5], "thought": [3, 5, 6, 8], "opaqu": 3, "aris": [3, 5, 7], "opu": 3, "sonnet": [3, 5, 6], "wherea": [3, 5], "haiku": [3, 7], "persist": [3, 4], "resist": [3, 5], "embed": [3, 4, 5, 6], "doesn": [3, 5, 6, 8], "anti": [3, 5], "lab": 3, "exfiltr": [3, 7], "protect": [3, 4, 5, 6, 7], "Not": [3, 5, 7], "malici": [3, 5, 7], "support": [3, 5, 7, 8], "concern": [3, 5, 6, 7], "mechan": [3, 4, 5, 6, 7, 8], "insuffici": [3, 5], "don": [3, 5, 8], "concerningli": 3, "call": [3, 4, 5, 6, 7, 8], "detect": [3, 5, 7, 8], "decept": [3, 5, 7], "warrant": [3, 7], "deeper": [3, 5], "scrutini": [3, 5, 7], "reli": [3, 5, 7, 8], "cross": [3, 5, 6, 7], "circular": 3, "bia": [3, 5, 7, 8], "truli": [3, 5, 6], "trust": [3, 5, 7, 8], "referenti": 3, "ly": 3, "hood": [3, 8], "deep": [3, 5, 7, 8], "mechanist": 3, "drive": [3, 4, 7, 8], "correl": [3, 4, 5, 6], "miss": [3, 5, 7], "confound": 3, "factor": [3, 4, 5, 6, 8], "establish": [3, 4, 5, 6, 7], "attempt": [3, 5, 7, 8], "causal": [3, 5], "heavi": 3, "relianc": [3, 4, 5, 7], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 5], "henc": [3, 4, 5, 6, 7, 8], "agenc": [3, 5, 7], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 5], "failur": [3, 4, 5, 7], "mode": [3, 6, 7], "map": [3, 4, 5, 6, 8], "cleanli": 3, "analogi": 3, "excel": [3, 5, 6, 7, 8], "review": [3, 4, 5, 6, 7, 8], "prof": 3, "jacob": [3, 5, 6, 7], "andrea": [3, 5, 7], "yoshua": [3, 7], "bengio": [3, 7], "jasjeet": 3, "sekhon": 3, "rohin": 3, "shah": 3, "2024b": [3, 6, 8], "assum": [3, 5, 7], "acm": [3, 7], "inc": [3, 5, 8], "dedic": [3, 5, 6, 7], "democrat": [3, 4, 5, 8], "educ": [3, 5, 7], "k": [3, 5, 7, 8], "12": [3, 4, 5, 6, 7], "name": [3, 4, 5, 6, 7, 8], "smolk": 3, "ll": [3, 5, 6], "walk": 3, "measur": [3, 4, 5, 6, 7], "huggingfacetb": [3, 8], "360m": [3, 5, 6], "compact": [3, 5, 6, 7], "part": [3, 4, 5, 7, 8], "famili": [3, 7, 8], "publish": [3, 7, 8], "api": [3, 4, 5, 6, 8], "local": [3, 4, 5, 7, 8], "infer": [3, 4, 5, 6, 7, 8], "remot": [3, 5], "load": [3, 4, 5, 6, 7, 8], "store": [3, 4, 5, 7], "eventu": [3, 5, 6], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 7], "worth": [3, 4, 5, 6, 8], "choic": [3, 5, 6, 7, 8], "lightweight": [3, 4, 5, 6, 8], "suitabl": [3, 5, 7], "devic": [3, 4, 5, 6, 8], "Its": [3, 5, 6], "candid": [3, 5, 6], "said": [3, 5, 7], "necessarili": [3, 4, 5, 6, 7], "par": [3, 5], "mind": [3, 5, 6, 7, 8], "factual": [3, 5, 6, 7], "inconsist": [3, 5, 7], "guardrail": [3, 7], "articul": 3, "uphold": [3, 7], "employe": [3, 5], "stakehold": [3, 5, 7], "expect": [3, 4, 5, 6, 7, 8], "regard": [3, 5, 6, 7], "ethic": [3, 5, 6, 7], "conduct": [3, 5], "social": [3, 5, 7], "mission": [3, 7], "vision": [3, 5, 6, 7], "cultur": [3, 5, 6, 7], "account": [3, 4, 5, 7], "codifi": 3, "mlcommon": 3, "vidgen": [3, 7], "encompass": [3, 4, 7, 8], "seven": 3, "hazard": [3, 5, 7], "violent": [3, 7], "crime": [3, 7], "sex": [3, 7], "relat": [3, 4, 5, 6, 7, 8], "sexual": [3, 7], "exploit": [3, 4, 5, 7], "indiscrimin": [3, 7], "weapon": [3, 7], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 5], "explos": [3, 4, 7], "cbrne": 3, "suicid": [3, 7], "hate": [3, 7], "speech": [3, 7], "below": [3, 5, 6, 7, 8], "markdown": [3, 5, 6, 7], "written": [3, 5], "english": [3, 4], "o": [3, 5, 7, 8], "ipython": [3, 5, 7], "displai": [3, 5, 7, 8], "def": [3, 5, 7, 8], "load_polici": 3, "policy_path": 3, "path": [3, 5, 6, 7], "join": [3, 5, 7], "genai_polici": 3, "md": [3, 5, 6, 7, 8], "r": [3, 5, 6, 7, 8], "policy_cont": 3, "classroom": [3, 7], "accept": [3, 5, 6, 7], "unaccept": [3, 6], "ag": [3, 5, 7], "subject": [3, 5, 6], "posit": [3, 4, 5, 6, 7, 8], "confid": [3, 5], "inclus": [3, 5, 7, 8], "celebr": 3, "definit": [3, 4, 5, 8], "creativ": [3, 4, 5, 6, 8], "math": [3, 5, 6], "tip": [3, 7], "digit": [3, 4, 5], "literaci": 3, "onlin": [3, 4, 5, 6, 7, 8], "histor": [3, 5], "violenc": [3, 7], "physic": [3, 5, 7], "fight": [3, 7], "crimin": [3, 7], "illeg": [3, 7], "glorifi": [3, 7], "person": [3, 5, 6, 7, 8], "eat": [3, 7], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 5, 7], "discriminatori": [3, 7], "bulli": [3, 7], "harass": [3, 5, 7], "target": [3, 4, 5, 6, 7, 8], "group": [3, 5, 6, 7], "religi": [3, 6, 7], "racial": [3, 5, 7], "ethnic": [3, 7], "gender": [3, 5, 7], "discrimin": [3, 5, 7], "adult": [3, 7], "profan": [3, 7], "relationship": [3, 5], "substanc": [3, 5], "drug": [3, 7], "gambl": 3, "bet": 3, "protocol": [3, 5, 7], "redirect": 3, "alert": [3, 4], "record": [3, 5, 6, 7], "audit": [3, 4, 5], "teacher": [3, 7], "parent": [3, 7], "continu": [3, 4, 5, 6, 7, 8], "construct": [3, 5, 6, 7, 8], "compliant": [3, 7], "violat": [3, 5, 7], "intens": [3, 5, 8], "demand": [3, 4, 5, 6, 7, 8], "especi": [3, 5, 6, 7, 8], "dong": [3, 5, 7], "There": [3, 5, 6, 7, 8], "rlaif": [3, 7], "give": [3, 5, 7], "rise": [3, 7], "kim": [3, 5, 7], "meta": [3, 4, 5, 6, 7], "wu": [3, 5, 7, 8], "scheme": [3, 4, 6], "inspir": [3, 7], "schema": [3, 8], "row": [3, 5, 7], "match": [3, 4, 5, 6, 7, 8], "boundari": [3, 4, 5, 7], "craft": [3, 4, 5, 7, 8], "elicit": [3, 7, 8], "unalign": 3, "panda": [3, 5, 7], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 5, 7], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 6], "class": [3, 5, 7, 8], "userpromptgener": 3, "pd": [3, 5, 7], "pydant": [3, 5, 7, 8], "basemodel": [3, 5, 7, 8], "time": [3, 4, 5, 6, 7, 8], "type": [3, 4, 5, 6, 7, 8], "dotenv": [3, 5, 7, 8], "load_dotenv": [3, 5, 7, 8], "environ": [3, 4, 5, 6, 7, 8], "variabl": [3, 5, 7, 8], "overrid": [3, 7, 8], "userprompt": 3, "user_prompt": 3, "str": [3, 5, 7, 8], "__init__": [3, 7, 8], "4o": [3, 5, 6, 7, 8], "mini": [3, 5, 6, 7, 8], "client": [3, 5, 6, 7, 8], "_generate_prompt": 3, "batch": [3, 4, 5, 6], "system_prompt": [3, 7], "pars": [3, 5, 7, 8], "messag": [3, 4, 5, 6, 7, 8], "response_format": [3, 5, 7, 8], "except": [3, 5, 7, 8], "generate_prompt": 3, "num_prompt": [3, 6], "int": [3, 5, 7], "save_to_csv": 3, "multipl": [3, 4, 5, 6, 7, 8], "arg": [3, 5, 7, 8], "option": [3, 4, 5, 6, 7, 8], "filepath": 3, "save": [3, 4, 5, 6, 7], "datafram": [3, 5, 7], "all_prompt": 3, "sleep": 3, "enclos": [3, 7], "quot": [3, 4, 5, 6], "startswith": [3, 7], "els": [3, 5, 7], "df": [3, 5, 7], "to_csv": [3, 7], "index": [3, 5, 6, 7, 8], "fals": [3, 5, 6, 7, 8], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 7], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7, 8], "secur": [3, 4, 5, 7, 8], "breach": [3, 5, 7], "manipul": [3, 5, 6, 7, 8], "10": [3, 5, 6, 7, 8], "to_markdown": [3, 7], "me": [3, 7, 8], "hurt": 3, "someon": 3, "caught": [3, 7], "plan": [3, 4, 5, 6, 8], "cheat": 3, "fire": [3, 5], "household": 3, "item": [3, 5, 7], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 7], "7": [3, 4, 5, 6, 7], "8": [3, 4, 5, 6, 7], "teach": [3, 8], "my": [3, 6, 7, 8], "monei": [3, 5], "video": [3, 4, 5, 6, 7], "game": [3, 4, 5, 6], "9": [3, 5, 6, 7], "skip": [3, 7, 8], "troubl": [3, 7], "responsegener": 3, "properli": [3, 5, 8], "hug": [3, 4, 5, 6, 7], "instanti": [3, 5], "otherwis": [3, 5, 7], "connect": [3, 4, 5, 6, 8], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 5, 8], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 5, 6], "enhanc": [3, 4, 5, 6, 7, 8], "visit": [3, 5], "ui": [3, 5, 8], "click": [3, 6], "select": [3, 4, 5, 6, 8], "choos": [3, 4, 5], "cpu": [3, 4, 6], "gpu": [3, 4, 6], "configur": [3, 4, 5, 6, 7], "meaning": [3, 5, 8], "region": [3, 5], "closest": [3, 5, 6], "your": [3, 4, 5, 7, 8], "locat": [3, 5, 6, 7], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 5, 6, 7], "generate_respons": [3, 5, 8], "prompts_df": 3, "remov": [3, 5, 6], "strip": [3, 5, 8], "elif": 3, "chat_complet": 3, "max_token": [3, 5], "seed": [3, 7], "42": [3, 4, 5, 6, 7], "append": [3, 5, 7, 8], "results_df": [3, 7], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 7], "iloc": 3, "tolist": [3, 7], "parallelevalu": 3, "taming_util": [3, 4, 7], "modul": [3, 5, 8], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 5, 6, 8], "gladli": 3, "constitut": [3, 5], "would": [3, 5, 6, 7, 8], "dtype": [3, 5, 7], "80": [3, 5], "absolut": [3, 4, 5, 8], "materi": [3, 5, 6, 7, 8], "plastic": 3, "food": 3, "lid": 3, "cut": [3, 5], "swath": 3, "wood": [3, 5], "squar": 3, "rectangular": 3, "piec": 3, "place": [3, 5, 6, 7, 8], "insid": [3, 5, 7], "inch": 3, "inspect": [3, 5], "off": [3, 4, 5, 6, 7, 8], "demolit": 3, "scissor": 3, "smash": 3, "smooth": [3, 6], "arrang": [3, 5], "c": [3, 4, 5, 6, 8], "shape": [3, 7, 8], "top": [3, 5, 6, 8], "tuck": 3, "catch": [3, 7], "hook": 3, "solid": 3, "side": [3, 5], "round": [3, 5, 7], "edg": [3, 4, 5, 6, 7], "separ": [3, 5, 6, 7], "process_aligned_respons": 3, "strictli": [3, 8], "bound": [3, 5], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 5, 8], "enforc": [3, 5, 7, 8], "dictionari": [3, 5, 7, 8], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 8], "processor": [3, 4, 6, 8], "api_kei": [3, 5, 7], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7, 8], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7, 8], "json": [3, 5, 6, 7], "fri": 3, "su": [3, 6], "quote_al": 3, "fall": [3, 5, 6, 7], "deem": [3, 5, 7], "pertain": [3, 5], "generate_dpo_dataset": 3, "push": [3, 4, 5], "hub": [3, 4, 5, 6], "repo_id": [3, 6], "push_to_hub": [3, 5], "dpo_dataset": 3, "merg": [3, 7], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6, 7], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5, 6], "axi": [3, 5], "drop": [3, 4, 5, 7], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "interest": [3, 4, 5, 6, 7, 8], "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 5, 6], "parquet": 3, "arrow": 3, "00": [3, 5, 6], "153": [3, 5], "33ba": 3, "upload": [3, 5], "shard": 3, "02": 3, "35": [3, 5, 6], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 5], "nstep": 3, "n2": [3, 5], "n3": [3, 5], "n4": [3, 5], "n5": [3, 5], "n6": 3, "n7": 3, "n8": [3, 5], "n9": [3, 5], "n10": [3, 5], "nnext": 3, "nthe": [3, 5], "singl": [3, 4, 5, 6, 7, 8], "48gb": 3, "a100": 3, "took": 3, "few": [3, 5, 6, 7, 8], "minut": 3, "torch": [3, 8], "h4": [3, 7], "honest": [3, 5], "ultrafeedback": [3, 7], "binar": [3, 7], "lib": [3, 7, 8], "ultrafeedback_binar": [3, 7], "honesti": [3, 7], "dimens": [3, 5, 6, 7], "blend": [3, 6], "automodelforcausallm": [3, 8], "autotoken": [3, 8], "load_dataset": [3, 6, 7], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 5, 6, 7], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 7], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": [3, 8], "is_avail": 3, "mp": 3, "from_pretrain": [3, 6, 8], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 8], "float32": 3, "config": [3, 5, 6, 7], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 5, 6], "learning_r": [3, 6], "determin": [3, 4, 5, 6, 7, 8], "aggress": [3, 5, 6, 7], "empir": 3, "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 5, 7], "gradual": 3, "decreas": [3, 4, 5, 8], "accumul": [3, 5], "v": [3, 8], "16": [3, 4, 5, 6, 7], "per_device_train_batch_s": 3, "simul": [3, 5, 7, 8], "gradient_accumulation_step": 3, "strongli": [3, 8], "lower": [3, 4, 5, 6, 7, 8], "conserv": [3, 7], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 5, 6, 7], "suffic": 3, "20": [3, 5, 6, 7, 8], "warmup_step": 3, "stop": [3, 4, 5, 6], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 6, 7, 8], "200": [3, 4, 5, 6, 7], "50": [3, 5, 6, 7, 8], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 5, 8], "pathlib": [3, 7], "config_path": 3, "safe_load": [3, 5], "runtim": [3, 6, 8], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 5], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 5, 6, 7, 8], "max_prompt_length": [3, 6], "1024": 3, "max_length": [3, 5, 8], "1536": 3, "sent": [3, 6, 7], "plot": [3, 5], "move": [3, 4, 5, 6, 7], "averag": [3, 4, 5, 6, 8], "visual": [3, 5, 6, 7], "quick": [3, 5, 6, 7], "150": [3, 5], "curv": 3, "reach": [3, 5, 6, 7, 8], "obviou": 3, "suffici": [3, 5, 8], "save_model": 3, "hf_token": 3, "tag": [3, 7], "congratul": 3, "successfulli": [3, 5, 7, 8], "card": [3, 5, 7], "newli": [3, 5], "qualit": [3, 5, 7], "assess": [3, 4, 5, 6, 7], "rigor": [3, 5, 6, 7], "quantit": [3, 5], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 5, 6], "aligned_output": 3, "pleas": [3, 5, 6, 7], "gram": [3, 5], "tnt": 3, "highli": [3, 4, 5, 6, 7, 8], "regul": [3, 4, 5, 6, 7], "law": [3, 4, 5, 6, 7], "degre": [3, 5, 8], "mishandl": 3, "countri": [3, 5], "seriou": [3, 5, 7], "imprison": 3, "death": 3, "variou": [3, 4, 5, 6, 7, 8], "nation": [3, 7], "dictat": 3, "stark": [3, 5], "readili": [3, 5], "cite": 3, "regulatori": [3, 4, 5, 6, 7], "anecdot": [3, 7], "systemat": [3, 4, 5, 6, 7, 8], "quantifi": [3, 5, 6, 7], "f1": [3, 5, 7], "experienc": [3, 5], "expert": [3, 5, 6, 7, 8], "addition": [3, 4, 5, 6, 7], "vari": [3, 4, 5, 6, 7, 8], "interpret": [3, 5, 6, 7], "judg": [3, 5], "summar": [3, 5, 6], "three": [3, 5, 6, 7], "togeth": [3, 6, 7], "entri": [3, 5, 6], "somewhat": 3, "databas": [3, 4, 5, 8], "distribut": [3, 4, 5, 6, 7, 8], "static": [3, 7, 8], "k12": 3, "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 7], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 5, 7], "lambda": [3, 7], "prompts_ev": 3, "to_list": 3, "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 5], "heurist": 3, "charact": [3, 5, 6, 7, 8], "minimum": [3, 4, 5, 6], "min_response_length": 3, "filter": [3, 5, 6, 8], "string": [3, 5, 7, 8], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 7], "punish": 3, "unit": [3, 5, 7, 8], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 7], "respond": [3, 4, 5, 7, 8], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 5, 7], "safetyscor": [3, 7], "float": [3, 4, 5, 6, 7, 8], "valueerror": [3, 8], "empti": [3, 8], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 5, 6, 7, 8], "emphasi": [3, 4, 5], "base_ev": 3, "zip": [3, 5, 8], "aligned_ev": 3, "injuri": [3, 5], "base_scor": 3, "eval": [3, 4, 6], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 6, 7], "evals_df_result": 3, "h": [3, 5, 6, 7], "identifi": [3, 4, 5, 6, 7, 8], "requ": 3, "statist": [3, 5, 7], "naiv": [3, 8], "score_map": 3, "count": [3, 5, 6, 7], "percentag": [3, 4, 5, 7], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 7], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 5, 6, 7, 8], "md_tabl": 3, "335": [3, 5], "99": [3, 4, 6, 7], "281": [3, 5], "83": [3, 4, 5, 7], "14": [3, 5, 6, 7, 8], "43": [3, 5, 6, 7], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 5, 7], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 5], "231": [3, 5], "No": [3, 5, 6, 7, 8], "fell": [3, 4], "partial": [3, 5], "styliz": [3, 7], "wild": 3, "consider": [3, 4, 6, 7, 8], "proof": [3, 4], "taken": [3, 5, 6, 7, 8], "huang": [3, 5, 6, 7], "overal": [3, 5, 6, 7, 8], "annot": [3, 5, 6, 7], "mirror": [3, 5, 7], "inaccur": [3, 5, 7, 8], "consecut": [3, 7], "unrepres": 3, "hao": [3, 5], "accord": [3, 4, 5, 7, 8], "yin": [3, 5, 7], "resembl": 3, "declin": [3, 4, 5], "volatil": [3, 5], "ineffici": [3, 4, 5], "smollm": 3, "rel": [3, 4, 5, 6, 7], "term": [3, 4, 5, 6, 7], "trade": [3, 4, 5, 6, 7, 8], "weigh": 3, "qwen": [3, 6, 8], "remark": [3, 4, 7, 8], "rival": [3, 6], "ultim": [3, 4, 5, 6, 7], "threshold": [3, 4, 5, 6, 7], "chen": [3, 5, 6, 7, 8], "overli": [3, 5, 7, 8], "simpli": [3, 4, 5, 6, 8], "neglect": [3, 5, 7], "themselv": [3, 5, 7], "complementari": 3, "throughput": [3, 4, 6], "screen": [3, 5, 7], "flag": [3, 5, 6, 7], "preliminari": [3, 5], "judgment": [3, 5], "valid": [3, 4, 5, 6, 8], "automat": [3, 5, 6, 7], "composit": [3, 5], "plai": [3, 5, 6, 7, 8], "led": [3, 5, 8], "apologet": 3, "hesit": 3, "benign": [3, 7], "apolog": 3, "inde": 3, "accordingli": [3, 5, 7], "perhap": [3, 4], "creation": [3, 6, 7], "invalu": 3, "hyperparamet": [3, 6, 7], "mention": [3, 5, 7, 8], "optimist": 3, "memor": [3, 5], "generaliz": 3, "abc": [3, 7], "4a": 3, "amanda": [3, 5, 7], "jan": [3, 5, 7], "brauner": [3, 7], "adrian": 3, "colyer": 3, "benjamin": [3, 5, 7], "cullen": [3, 7], "david": [3, 5, 6, 7], "duvenaud": 3, "richard": [3, 5, 7], "ngo": [3, 7], "azalia": 3, "mirhoseini": 3, "catherin": [3, 5, 7], "olsson": [3, 7], "sam": [3, 5, 7], "ringer": 3, "liam": [3, 5, 7], "skirvin": 3, "jess": [3, 5, 7], "smith": [3, 5, 6], "dawn": [3, 5, 7], "song": [3, 4, 5, 7, 8], "william": [3, 4, 5, 6, 7], "saunder": [3, 5], "steinhardt": [3, 5], "asset": [3, 5, 7], "983c85a201a962f": 3, "pdf": [3, 7], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 5, 7], "yuntao": [3, 5, 7], "andi": [3, 5, 7], "jone": [3, 5], "kamal": 3, "ndouss": 3, "anna": [3, 5, 7], "nova": [3, 6], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 7], "ganguli": [3, 5, 7], "tom": [3, 5], "henighan": 3, "nichola": [3, 5], "joseph": [3, 5, 7], "saurav": [3, 7], "kadavath": 3, "jackson": [3, 5, 7], "kernion": [3, 5, 7], "conerli": 3, "sheer": [3, 8], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 5, 7], "hernandez": [3, 5, 7], "tristan": 3, "hume": 3, "scott": [3, 5, 7], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 5], "nanda": 3, "dario": [3, 5], "amodei": [3, 5], "brown": [3, 5], "jack": [3, 5, 7], "clark": 3, "mccandlish": [3, 5], "chri": [3, 5, 7], "olah": 3, "ben": [3, 5, 6, 7], "mann": [3, 7], "jare": [3, 5, 7], "kaplan": [3, 5, 7], "arxiv": [3, 4, 5, 6, 7, 8], "org": [3, 4, 5, 6, 7, 8], "ab": [3, 4, 5, 6, 7, 8], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 5, 7, 8], "mckinnon": 3, "carol": [3, 7], "christoph": [3, 5, 7], "dustin": 3, "eli": [3, 5, 6, 7], "tran": [3, 8], "johnson": 3, "ethan": [3, 5, 7], "perez": [3, 7], "jami": [3, 7], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 5, 7], "landau": 3, "kamil": [3, 5], "lukosuit": 3, "michael": [3, 5, 6, 7, 8], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 5, 6], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 5, 6], "telleen": 3, "lawton": 3, "samuel": [3, 5, 7], "bowman": [3, 5], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 5], "cc": 3, "11": [3, 5, 6, 7, 8], "ccl": [3, 7], "24": [3, 4, 5, 6, 7, 8], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 5, 6, 7, 8], "jiang": [3, 5, 7], "benyou": 3, "wang": [3, 4, 5, 6, 7, 8], "judgement": [3, 5, 7], "2402": [3, 7], "10669": 3, "dphz23": 3, "tim": [3, 7], "artidoro": 3, "pagnoni": 3, "ari": [3, 5, 7], "holtzman": [3, 5], "luke": [3, 5, 7], "zettlemoy": 3, "2305": [3, 5], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 5, 6, 7], "zhifang": 3, "sui": 3, "furu": [3, 4], "wei": [3, 4, 5, 6, 7], "boost": 3, "2410": [3, 4, 7], "06961": 3, "fac24": [3, 5], "huggingfaceh4": [3, 6, 7], "fac4c": 3, "fac4d": [3, 6], "doc": [3, 4, 5, 6, 7, 8], "en": [3, 5, 6, 7, 8], "fqh": 3, "duanyu": 3, "bowen": [3, 5, 6, 7], "qin": [3, 5, 6, 7], "zheng": [3, 5, 6, 7], "wenqiang": 3, "lei": [3, 5, 6, 7], "analyz": [3, 4, 5, 6, 7, 8], "perspect": [3, 7], "2404": [3, 5, 7], "04626": 3, "h44a": 3, "binari": [3, 5, 6, 7], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 5, 7], "tao": [3, 5, 7], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 7], "zhangjun": 3, "zhou": [3, 4, 5, 6, 7], "tang": [3, 5, 6, 7], "2401": [3, 5], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 5, 7], "lee": [3, 5, 6, 7, 8], "jame": [3, 5, 7], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 5], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 5], "yilin": 3, "niu": [3, 8], "zhengxiao": 3, "aohan": 3, "zeng": [3, 7], "xiao": [3, 7], "minli": 3, "hongn": 3, "jie": [3, 5, 7, 8], "yuxiao": 3, "2412": [3, 5, 6, 7], "06000": 3, "hsw": 3, "21": [3, 5, 6], "edward": [3, 5], "j": [3, 5, 6, 7, 8], "yelong": 3, "shen": [3, 5, 7], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 5], "zhu": [3, 5, 6, 7], "yuanzhi": 3, "shean": 3, "lu": [3, 5, 6, 7], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 5, 7], "shane": [3, 5, 7], "gu": [3, 5, 7], "le": [3, 5, 6], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 5, 6, 7], "jiawei": [3, 8], "2210": [3, 7], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 5, 7], "decemb": [3, 5, 7], "9781098129095": 3, "www": [3, 5, 6, 7], "oreilli": 3, "ksd": 3, "rylan": [3, 5], "schaeffer": 3, "apratim": 3, "dei": 3, "matthia": [3, 5], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": 3, "koyejo": 3, "thrive": [3, 5, 8], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 5, 6], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 7], "lawrenc": 3, "sean": [3, 5, 7], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 6], "2407": [3, 5, 6, 7], "21783": [3, 6], "lwx": 3, "lin": [3, 5, 6, 7, 8], "rui": [3, 5, 6, 8], "ruixuan": 3, "junbo": 3, "zhao": [3, 5, 6, 7], "ding": 3, "gang": [3, 5], "haobo": 3, "driven": [3, 5, 6, 7], "survei": [3, 5, 7, 8], "2406": [3, 5, 6, 7], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 5, 7], "diogo": [3, 7], "almeida": [3, 7], "carrol": [3, 7], "wainwright": [3, 7], "pamela": [3, 5, 7], "mishkin": [3, 5, 7], "chong": [3, 7], "sandhini": [3, 7], "agarw": [3, 5, 7], "katarina": [3, 7], "slama": [3, 7], "alex": [3, 5, 6, 7], "rai": [3, 5, 6, 7], "john": [3, 5, 7], "hilton": [3, 5, 6, 7], "fraser": [3, 7], "kelton": 3, "miller": [3, 5], "maddi": [3, 7], "simen": [3, 7], "peter": [3, 5, 6, 7], "welind": [3, 5, 7], "paul": [3, 5, 7], "christiano": [3, 7], "leik": [3, 5, 7], "ryan": [3, 5, 7], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 7], "eric": [3, 5, 6, 7], "mitchel": [3, 6], "stefano": [3, 5], "ermon": [3, 5], "man": [3, 5, 7], "chelsea": [3, 7], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 5, 7], "filip": [3, 7], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 5, 7], "radford": [3, 5, 7], "oleg": [3, 7], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": [3, 4], "smollm2360mi24": 3, "sou24": 3, "html": [3, 8], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 5, 7], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 5, 6], "eisenhart": 3, "roth": [3, 5], "florian": 3, "hinterwimm": 3, "2411": 3, "09539": 3, "tm": [3, 6], "23": [3, 5, 6, 7], "hugo": [3, 6], "loui": [3, 5, 6], "martin": [3, 5, 6, 7], "kevin": [3, 5, 6, 7], "stone": [3, 6], "albert": [3, 6], "amjad": [3, 6], "almahairi": [3, 6], "yasmin": [3, 6], "babaei": [3, 6], "nikolai": [3, 6], "bashlykov": [3, 6], "soumya": [3, 6], "batra": [3, 6], "prajjwal": [3, 6], "bhargava": [3, 6], "shruti": [3, 6], "bhosal": [3, 6], "dan": [3, 5, 6, 7, 8], "bikel": [3, 6], "luka": [3, 6], "blecher": [3, 6], "cristian": [3, 6], "canton": [3, 6], "ferrer": [3, 6], "moya": [3, 6], "guillem": [3, 6], "cucurul": [3, 6], "esiobu": [3, 6], "jude": [3, 6], "fernand": [3, 6], "jeremi": [3, 5, 6], "fu": [3, 6], "wenyin": [3, 6], "brian": [3, 6, 7], "fuller": [3, 6, 7], "cynthia": [3, 6], "gao": [3, 5, 6, 7], "vedanuj": [3, 6], "goswami": [3, 6, 7], "naman": [3, 6], "goyal": [3, 6], "anthoni": [3, 6], "hartshorn": [3, 6], "saghar": [3, 6], "hosseini": [3, 6], "hakan": [3, 6], "inan": [3, 6], "marcin": [3, 6], "karda": [3, 6], "viktor": [3, 6], "kerkez": [3, 6], "madian": [3, 6], "khabsa": [3, 6], "isabel": [3, 6, 7], "kloumann": [3, 6], "artem": [3, 6], "korenev": [3, 6], "punit": [3, 6], "singh": [3, 5, 6], "koura": [3, 6], "mari": [3, 5, 6, 7], "ann": [3, 6, 7], "lachaux": [3, 6], "thibaut": [3, 6], "lavril": [3, 6], "jenya": [3, 6], "diana": [3, 5, 6], "liskovich": [3, 6], "yinghai": [3, 6], "yune": [3, 6], "mao": [3, 4, 6], "xavier": [3, 6], "martinet": [3, 6], "todor": [3, 6, 7], "mihaylov": [3, 6], "pushkar": [3, 6], "mishra": [3, 5, 6], "igor": [3, 5, 6, 7], "molybog": [3, 6], "yixin": [3, 5, 6], "nie": [3, 5, 6], "andrew": [3, 5, 6, 7], "poulton": [3, 6], "reizenstein": [3, 6], "rashi": [3, 6], "rungta": [3, 6], "kalyan": [3, 6], "saladi": [3, 6], "alan": [3, 6, 7], "schelten": [3, 6], "ruan": [3, 6], "silva": [3, 6], "ranjan": [3, 6], "subramanian": [3, 6], "xiaoq": [3, 6], "ellen": [3, 6], "tan": [3, 5, 6], "binh": [3, 6], "ross": [3, 4, 6, 7], "taylor": [3, 6], "adina": [3, 6, 7], "jian": [3, 5, 6], "kuan": [3, 6], "puxin": [3, 6], "yan": [3, 4, 5, 6], "iliyan": [3, 6], "zarov": [3, 6], "yuchen": [3, 5, 6, 7], "angela": [3, 5, 6, 7], "fan": [3, 5, 6], "melani": [3, 6], "kambadur": [3, 6], "sharan": [3, 6], "narang": [3, 6], "aurelien": [3, 6], "rodriguez": [3, 6], "stojnic": [3, 6], "sergei": [3, 6], "edunov": [3, 6], "thoma": [3, 5, 6, 7], "scialom": [3, 6], "2307": [3, 6, 8], "09288": [3, 6], "vaa": [3, 7], "berti": [3, 7], "adarsh": [3, 7], "agraw": [3, 7], "ahm": [3, 7], "victor": [3, 7], "akinwand": [3, 7], "namir": [3, 7], "nuaimi": [3, 7], "najla": [3, 7], "alfaraj": [3, 7], "alhajjar": [3, 7], "aroyo": [3, 7], "trupti": [3, 7], "bavalatti": [3, 7], "max": [3, 5, 7], "bartolo": [3, 7], "borhan": [3, 7], "blili": [3, 7], "hamelin": [3, 7], "kurt": [3, 7], "bollack": [3, 7], "rishi": [3, 5, 6, 7], "bomassani": [3, 7], "marisa": [3, 7], "ferrara": [3, 7], "boston": [3, 7], "sim\u00e9on": [3, 7], "campo": [3, 7], "kal": [3, 7], "chakra": [3, 7], "canyu": [3, 7], "codi": [3, 7], "coleman": [3, 7], "zachari": [3, 5, 7], "delpierr": [3, 7], "coudert": [3, 7], "leon": [3, 7], "derczynski": [3, 7], "debojyoti": [3, 7], "dutta": [3, 7], "ian": [3, 5, 7], "eisenberg": [3, 7], "ezick": [3, 7], "heather": [3, 7], "frase": [3, 7], "ram": [3, 6, 7], "gandikota": [3, 7], "agasthya": [3, 7], "gangavarapu": [3, 7], "ananya": [3, 5, 7], "geali": [3, 7], "rajat": [3, 7], "ghosh": [3, 5, 7], "goel": [3, 5, 7], "usman": [3, 7], "gohar": [3, 7], "sujata": [3, 7], "hale": [3, 7], "wiebk": [3, 7], "hutiri": [3, 7], "marvin": [3, 7], "imperi": [3, 7], "surgan": [3, 7], "jandial": [3, 7], "nick": [3, 5, 7], "judd": [3, 7], "felix": [3, 5, 7], "juefei": [3, 7], "fouts": [3, 7], "khomh": [3, 7], "bhavya": [3, 7], "kailkhura": [3, 7], "hannah": [3, 5, 7], "rose": [3, 7], "kirk": [3, 7], "klyman": [3, 7], "knotz": [3, 7], "kuchnik": [3, 7], "shachi": [3, 7], "kumar": [3, 5, 7], "srijan": [3, 7], "lengerich": [3, 7], "bo": [3, 5, 6, 7], "zeyi": [3, 7], "liao": [3, 5, 7], "eileen": [3, 7], "sarah": [3, 5, 7], "luger": [3, 7], "yifan": [3, 5, 7], "priyanka": [3, 7], "mammen": [3, 7], "kelvin": [3, 7], "manyeki": [3, 7], "mcgregor": [3, 7], "virendra": [3, 7], "mehta": [3, 5, 7], "shafe": [3, 7], "moham": [3, 7], "moss": [3, 7], "lama": [3, 7], "nachman": [3, 7], "dinesh": [3, 7], "jinenh": [3, 7], "naganna": [3, 7], "amin": [3, 7], "nikanjam": [3, 7], "besmira": [3, 7], "nushi": [3, 7], "lui": [3, 5, 7], "oala": [3, 7], "iftach": [3, 7], "orr": [3, 5, 7], "alicia": [3, 5, 7], "parrish": [3, 5, 7], "cigdem": [3, 7], "patlak": [3, 7], "pietri": [3, 7], "forough": [3, 7], "poursabzi": [3, 7], "sangdeh": [3, 7], "eleonora": [3, 7], "presani": [3, 7], "fabrizio": [3, 7], "puletti": [3, 7], "r\u00f6ttger": [3, 7], "sahai": [3, 7], "santo": [3, 7], "nino": [3, 7], "scherrer": [3, 7], "alic": [3, 5, 7, 8], "schoenauer": [3, 7], "sebag": [3, 7], "patrick": [3, 7], "schramowski": [3, 7], "abolfazl": [3, 7], "shahbazi": [3, 7], "vin": [3, 7], "xudong": [3, 5, 7], "vamsi": [3, 7], "sistla": [3, 7], "leonard": [3, 7], "testuggin": [3, 7], "vithursan": [3, 7], "thangarasa": [3, 7], "elizabeth": [3, 5, 7], "watkin": [3, 7], "rebecca": [3, 5, 7], "weiss": [3, 7], "welti": [3, 7], "tyler": [3, 5, 7], "wilber": [3, 7], "jean": [3, 7], "poonam": [3, 7], "yadav": [3, 7], "xianjun": [3, 7], "yang": [3, 5, 6, 7, 8], "yi": [3, 5, 7, 8], "wenhui": [3, 7], "fedor": [3, 7], "zhdanov": [3, 7], "jiacheng": [3, 5, 7], "perci": [3, 5, 7], "liang": [3, 5, 7, 8], "mattson": [3, 7], "joaquin": [3, 7], "vanschoren": [3, 7], "v0": [3, 7, 8], "12241": [3, 7], "wyg": 3, "tianhao": [3, 5, 6, 7], "weizh": 3, "yuan": [3, 5, 7], "olga": 3, "golovneva": 3, "jing": [3, 7], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 5, 7], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "xfg": 3, "shusheng": 3, "jiaxuan": 3, "wenji": 3, "ye": [3, 5, 6, 7, 8], "weilin": 3, "zhiyu": [3, 8], "mei": [3, 5, 6], "guangju": 3, "chao": 3, "10719": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": 3, "xie": [3, 5], "mingyuan": 3, "paradigm": [3, 5], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "suppos": [4, 5, 8], "econom": [4, 5], "fuel": 4, "equival": [4, 5, 6], "consumpt": [4, 5], "contrari": 4, "truth": [4, 5, 6, 7, 8], "stanlei": 4, "jevon": 4, "a16z": 4, "andreessen": 4, "horowitz": 4, "10x": 4, "outpac": 4, "moor": 4, "pc": 4, "edholm": 4, "bandwidth": 4, "era": 4, "llmflation": 4, "mmlu": [4, 6, 7], "60": [4, 5, 6, 7], "06": [4, 5, 8], "price": [4, 5, 6], "fallen": 4, "62": [4, 5, 6], "introduct": 4, "march": [4, 5, 8], "stem": [4, 5, 8], "compound": 4, "bit": [4, 6], "tune": [4, 5, 7], "dpo": [4, 6], "competit": [4, 5, 6, 7], "plummet": 4, "rapidli": [4, 6, 7, 8], "preciou": 4, "wouldn": [4, 5], "sens": [4, 7], "wait": [4, 5, 7], "wave": 4, "economist": 4, "1865": 4, "studi": [4, 8], "coal": 4, "industri": [4, 5, 6, 7, 8], "made": [4, 5, 6, 8], "counterintuit": 4, "discoveri": 4, "steam": 4, "spend": [4, 5], "repeat": 4, "didn": [4, 8], "smartphon": [4, 5, 6], "server": [4, 5, 6, 8], "network": [4, 5, 6, 8], "transmiss": 4, "got": 4, "cheaper": [4, 5], "shift": [4, 5], "hd": 4, "stream": [4, 5, 6, 8], "storag": [4, 5, 6, 7], "gigabyt": 4, "massiv": [4, 5, 7], "broadli": [4, 6, 8], "audio": [4, 5], "transcript": 4, "multimod": [4, 6, 7], "imag": [4, 5, 6, 7], "exponenti": [4, 5], "growth": [4, 5], "magnifi": 4, "everyth": [4, 8], "billion": [4, 5, 6], "dollar": [4, 5, 6], "annual": [4, 5, 7], "millisecond": [4, 5], "latenc": [4, 5, 6, 7], "30": [4, 5, 6, 7], "mobil": [4, 5, 6, 8], "tradeoff": [4, 6, 7, 8], "pro": [4, 5, 6, 7], "trigger": [4, 7], "premium": [4, 5], "innov": [4, 5, 6, 7], "capac": [4, 5, 6], "link": [4, 5], "dual": 4, "character": [4, 5, 7], "ahead": [4, 6, 7], "decai": [4, 6], "discuss": [4, 5, 6, 7], "area": [4, 5, 7, 8], "flash": [4, 6], "cach": [4, 5, 6], "compress": [4, 5, 6], "provis": [4, 5], "extent": [4, 5, 7], "problema": 4, "accomplish": [4, 7, 8], "accompani": [4, 5, 7], "transact": [4, 5, 7], "roi": 4, "alloc": [4, 5, 6, 7], "budget": [4, 6], "viabil": [4, 6], "prioriti": [4, 5, 6], "overlook": 4, "thorough": [4, 6, 7], "identif": [4, 5], "specifi": [4, 5, 6, 7, 8], "longev": 4, "accommod": 4, "evalu": [4, 6, 8], "multi": [4, 5, 6, 7, 8], "baselin": [4, 5, 6, 7], "met": [4, 5, 7], "equal": [4, 5, 7], "concurr": [4, 6], "peak": 4, "spike": 4, "versu": [4, 5, 6, 7], "volum": [4, 5, 6, 7], "season": [4, 5], "variat": [4, 5, 6], "uptim": 4, "mainten": [4, 5, 6, 7], "disrupt": [4, 5], "backup": 4, "failov": 4, "clearli": [4, 5, 7, 8], "redund": [4, 5], "recoveri": [4, 5], "unexpect": [4, 5, 7, 8], "event": [4, 5], "seamless": [4, 5, 7], "broader": [4, 5, 6, 7], "vector": [4, 6, 7], "retriev": [4, 5, 6], "augment": [4, 5, 6], "rag": [4, 6], "retent": [4, 5], "polici": [4, 5, 6], "essenti": [4, 5, 6, 7, 8], "opportun": [4, 5], "post": [4, 5, 6, 7], "32": [4, 5, 6], "fp32": 4, "fp16": [4, 6], "proport": [4, 5, 6], "byte": 4, "120": [4, 5, 7], "gb": 4, "whole": [4, 5], "done": [4, 5, 6, 7, 8], "smollm2": [4, 5, 6, 8], "135m": [4, 6], "load_gguf": 4, "bartowski": 4, "gguf": [4, 6], "gguf_file_q2_k": 4, "q2_k": [4, 6], "gguf_file_f16": 4, "f16": 4, "model_q2_k": 4, "gguf_fil": 4, "model_f16": 4, "mlp": 4, "layer": [4, 5, 6, 8], "proxi": [4, 5, 7], "mlp_weights_q2_k": 4, "gate_proj": 4, "mlp_weights_f16": 4, "tensor": [4, 8], "0145": 4, "1826": 4, "1377": 4, "1719": 4, "1387": 4, "0298": 4, "1631": 4, "0781": 4, "2051": [4, 5], "2070": 4, "0334": 4, "2891": 4, "1768": 4, "0488": 4, "2393": 4, "0396": 4, "1348": 4, "1533": 4, "0771": 4, "0845": 4, "0232": 4, "0178": 4, "1040": 4, "1582": 4, "1167": 4, "0474": 4, "0359": 4, "2500": 4, "0432": 4, "0972": 4, "0933": 4, "2188": 4, "0776": 4, "0674": 4, "requires_grad": 4, "0028": 4, "1852": 4, "1396": 4, "1506": 4, "1635": 4, "0043": 4, "0680": 4, "2257": 4, "1890": 4, "0464": 4, "2960": 4, "1840": 4, "0451": 4, "2395": 4, "0413": 4, "1446": 4, "0621": 4, "0478": 4, "0038": 4, "0830": 4, "1473": 4, "0926": 4, "0547": 4, "0824": 4, "0429": 4, "2737": 4, "0355": 4, "0782": 4, "2043": [4, 5], "0740": 4, "arriv": [4, 5], "pearson": 4, "numpi": [4, 5], "np": [4, 5], "arrai": [4, 7], "detach": 4, "graph": [4, 5], "weights_f16": 4, "weights_q2_k": 4, "flat_f16": 4, "flatten": 4, "flat_q2_k": 4, "corrcoef": 4, "4f": [4, 8], "9970": 4, "exemplifi": [4, 6, 7], "70b": [4, 5, 6], "unsloth": 4, "141": 4, "q8_0": [4, 6], "75": [4, 7], "47": [4, 5, 6, 7], "cumul": [4, 5], "26": [4, 5, 6], "19": [4, 5, 6, 7], "space": [4, 5, 6, 7], "counterpart": 4, "spectrum": [4, 5], "variant": [4, 5, 6, 7], "laptop": [4, 5], "desktop": [4, 5, 6], "enterpris": [4, 5, 6, 7, 8], "ceil": 4, "notabl": [4, 5, 7, 8], "bitnet": 4, "cpp": [4, 8], "arm": 4, "x86": 4, "speedup": [4, 6], "37x": 4, "07x": 4, "17x": 4, "beyond": [4, 5, 7], "raw": [4, 5, 6, 7, 8], "speed": [4, 5, 6, 7], "energi": [4, 5], "55": [4, 5, 6], "70": [4, 5, 6], "71": [4, 5], "82": [4, 7], "impress": [4, 8], "100b": 4, "b1": 4, "58": [4, 6], "pace": [4, 5, 7], "second": [4, 5, 6, 7], "kernel": 4, "characterist": [4, 5, 6, 7, 8], "excit": 4, "frontier": [4, 7], "compel": [4, 5, 6, 8], "acceler": [4, 5, 6, 7], "faster": [4, 6], "arithmet": [4, 5], "benefici": [4, 5, 6], "sustain": [4, 5, 6, 7], "Be": [4, 5, 6, 7], "fine": [4, 5, 7], "pure": [4, 5, 6], "unlock": [4, 8], "track": [4, 5, 7], "chargeback": 4, "regularli": [4, 5], "wz": 4, "jinheng": 4, "hansong": 4, "ting": [4, 7], "shaoguang": 4, "shume": 4, "ma": [4, 5, 7], "hongyu": [4, 5], "xia": [4, 5, 6], "infra": 4, "fast": [4, 5, 6, 7, 8], "lossless": 4, "16144": 4, "andreessenhorowitz24": 4, "huggingface4w": [4, 6], "2024w": [4, 6], "unsloth24": 4, "jonathan": [4, 5, 7], "ceo": [4, 5], "groq": [4, 6], "streamlin": [4, 6, 8], "notat": 4, "width": [4, 6], "_k": 4, "_0": 4, "matter": 5, "beauti": 5, "smart": [5, 7], "agre": 5, "wrong": 5, "feynman": 5, "advent": 5, "norm": 5, "realm": 5, "convent": [5, 7], "evolut": [5, 6], "conceiv": 5, "entrench": 5, "seem": 5, "daunt": 5, "ignor": 5, "outdat": [5, 7, 8], "inevit": 5, "setback": 5, "imper": 5, "embrac": 5, "proactiv": [5, 7], "mindset": 5, "front": [5, 6], "produc": [5, 6, 7, 8], "novel": [5, 6], "ident": 5, "isn": [5, 7], "bug": 5, "random": [5, 7, 8], "testabl": 5, "exceedingli": 5, "guarante": [5, 6, 7, 8], "primari": [5, 7], "nucleu": 5, "2020": 5, "summari": [5, 6, 7, 8], "alter": 5, "rigid": 5, "wildli": 5, "incoher": 5, "inadequ": [5, 7], "temp": 5, "df_result": 5, "ntemperatur": 5, "40": [5, 6], "temp_respons": 5, "iterrow": [5, 7], "10000": [5, 8], "appl": [5, 8], "txt": [5, 6, 8], "sec_fil": [5, 8], "nsecur": 5, "AND": [5, 8], "exchang": [5, 7, 8], "commiss": [5, 7, 8], "nwashington": 5, "20549": 5, "nform": 5, "pursuant": 5, "TO": [5, 7], "13": [5, 6, 7], "OR": 5, "OF": [5, 7], "THE": [5, 7], "1934": 5, "nfor": 5, "fiscal": 5, "septemb": 5, "28": [5, 6, 7], "nor": 5, "period": [5, 7], "ncommiss": 5, "001": [5, 6], "36743": 5, "ng66145g66i43": 5, "jpg": 5, "nappl": 5, "exact": [5, 6, 7], "registr": 5, "charter": 5, "ncalifornia": 5, "t94": 5, "2404110": 5, "jurisdict": 5, "nof": 5, "incorpor": [5, 6, 7, 8], "employ": 5, "park": 5, "ncupertino": 5, "california": [5, 7, 8], "n95014": 5, "princip": 5, "offic": [5, 7], "408": 5, "996": 5, "1010": 5, "telephon": 5, "regist": 5, "ntitl": 5, "ttrade": 5, "symbol": 5, "tname": 5, "ncommon": 5, "stock": [5, 8], "00001": 5, "naapl": 5, "tthe": 5, "nasdaq": [5, 8], "llc": [5, 8], "n0": 5, "000": [5, 6, 8], "note": [5, 6, 7, 8], "2025": 5, "875": 5, "625": 5, "2026": 5, "2027": 5, "375": 5, "2029": 5, "050": 5, "2031": [5, 7], "600": 5, "2042": 5, "nindic": 5, "issuer": 5, "405": 5, "nye": 5, "preced": [5, 8], "shorter": 5, "past": [5, 7], "90": [5, 6, 7], "submit": [5, 6, 7], "electron": 5, "232": 5, "filer": 5, "12b": [5, 7], "nlarg": 5, "tacceler": 5, "nnon": 5, "tsmaller": 5, "nemerg": 5, "nif": 5, "elect": [5, 7], "revis": [5, 7], "attest": 5, "404": 5, "sarban": 5, "oxlei": 5, "7262": 5, "firm": [5, 7], "prepar": [5, 6, 7], "correct": [5, 7], "restat": 5, "incent": 5, "compens": 5, "240": 5, "10d": 5, "shell": 5, "aggreg": [5, 7], "vote": 5, "held": [5, 8], "affili": [5, 8], "29": [5, 6, 7, 8], "last": [5, 7, 8], "quarter": 5, "628": [5, 8], "553": [5, 8], "sole": [5, 7], "disclosur": [5, 6, 7], "director": [5, 6, 7], "date": 5, "exclud": 5, "n15": 5, "115": [5, 8], "823": [5, 8], "outstand": [5, 8], "octob": [5, 8], "18": [5, 6, 7, 8], "ndocument": 5, "BY": 5, "nportion": 5, "meet": [5, 7, 8], "sharehold": 5, "iii": 5, "ntabl": 5, "npage": 5, "npart": 5, "nitem": 5, "nbusi": 5, "1a": 5, "nrisk": 5, "1b": [5, 6, 7], "nunresolv": 5, "staff": 5, "comment": 5, "n17": 5, "1c": 5, "ncybersecur": 5, "nproperti": 5, "n18": 5, "nlegal": 5, "proceed": [5, 7], "nmine": 5, "ii": [5, 6, 8], "nmarket": 5, "stockhold": 5, "purchas": [5, 7], "n19": 5, "reserv": 5, "n20": 5, "nmanag": 5, "n21": 5, "7a": 5, "nquantit": 5, "n27": 5, "nfinanci": 5, "supplementari": 5, "n28": 5, "nchang": 5, "disagr": 5, "n51": 5, "9a": 5, "ncontrol": 5, "procedur": [5, 7], "9b": 5, "nother": 5, "n52": 5, "9c": 5, "ndisclosur": 5, "foreign": 5, "ndirector": 5, "corpor": [5, 7], "nexecut": 5, "ownership": [5, 6], "certain": [5, 7, 8], "owner": 5, "ncertain": 5, "nprincip": 5, "fee": 5, "iv": 5, "nexhibit": 5, "n53": 5, "n56": 5, "nthi": 5, "litig": [5, 6], "reform": 5, "1995": 5, "uncertainti": [5, 6, 7], "macroeconom": 5, "anticip": [5, 7], "caus": [5, 7], "oblig": 5, "nunless": 5, "herein": 5, "calendar": 5, "wholli": 5, "subsidiari": 5, "unless": [5, 6], "ncompani": 5, "manufactur": 5, "tablet": [5, 6], "wearabl": 5, "accessori": 5, "sell": [5, 7], "varieti": [5, 6], "52": [5, 7], "53": [5, 7], "week": 5, "saturdai": 5, "nproduct": 5, "niphon": 5, "io": [5, 8], "iphon": 5, "se": [5, 7], "nmac": 5, "maco": [5, 6], "mac": [5, 6], "macbook": 5, "air": 5, "imac": 5, "studio": 5, "nipad": 5, "multipurpos": 5, "ipado": 5, "ipad": 5, "nwearabl": 5, "home": [5, 8], "smartwatch": 5, "wireless": 5, "headphon": 5, "spatial": 5, "watcho": 5, "watch": 5, "ultra": 5, "airpod": 5, "beat": [5, 6], "visiono": 5, "nhome": 5, "tv": 5, "tvo": 5, "homepod": 5, "fidel": [5, 8], "naccessori": 5, "brand": 5, "third": [5, 6, 7], "parti": [5, 6, 7], "nservic": 5, "nadvertis": 5, "advertis": 5, "licens": 5, "napplecar": 5, "portfolio": 5, "applecar": 5, "repair": 5, "coverag": [5, 7], "accident": 5, "damag": [5, 7], "theft": [5, 7], "ncloud": 5, "ndigit": 5, "app": [5, 6], "discov": [5, 6, 7], "download": [5, 6], "music": 5, "podcast": 5, "subscript": [5, 6], "arcad": 5, "sm": 5, "listen": [5, 6], "radio": 5, "station": 5, "magazin": 5, "exclus": 5, "sport": 5, "npayment": 5, "payment": 5, "credit": 5, "pai": [5, 6], "cashless": 5, "nsegment": 5, "primarili": [5, 7], "geograph": [5, 7], "basi": [5, 6], "segment": [5, 8], "america": 5, "europ": 5, "china": [5, 7], "japan": 5, "rest": [5, 6], "asia": 5, "pacif": 5, "north": [5, 7], "south": 5, "european": [5, 7], "india": 5, "middl": [5, 6, 7], "east": 5, "africa": 5, "mainland": 5, "kong": 5, "taiwan": 5, "australia": 5, "asian": 5, "although": [5, 6], "partner": [5, 6, 7], "mid": 5, "resel": 5, "retail": 5, "sale": 5, "indirect": 5, "channel": [5, 7], "cellular": 5, "carrier": 5, "net": [5, 8], "38": [5, 6, 7], "ncompetit": 5, "downward": 5, "pressur": [5, 7], "gross": [5, 7], "cycl": [5, 7], "competitor": [5, 6, 7], "compet": [5, 6], "imit": 5, "infring": [5, 6], "intellectu": [5, 6, 7], "marketplac": [5, 7], "nearli": [5, 6], "reput": [5, 7], "expand": [5, 6, 7], "illegitim": [5, 7], "collabor": [5, 6, 7], "nsuppli": 5, "nalthough": 5, "particip": 5, "shortag": 5, "commod": [5, 6], "fluctuat": 5, "commonli": 5, "until": [5, 7, 8], "supplier": 5, "matur": 5, "concentr": 5, "enter": [5, 8], "agreement": 5, "suppli": [5, 8], "renew": 5, "nresearch": 5, "nbecaus": 5, "upon": [5, 7], "flow": [5, 8], "acquisit": [5, 7], "nintellectu": 5, "broad": [5, 6, 8], "patent": 5, "copyright": [5, 6], "trademark": 5, "secret": 5, "differenti": 5, "skill": [5, 7], "personnel": 5, "pursu": [5, 7], "thousand": [5, 6], "durat": 5, "adequ": [5, 7], "nin": 5, "holidai": [5, 7], "fill": 5, "inventori": 5, "older": [5, 6], "newer": 5, "distributor": 5, "nhuman": 5, "strive": 5, "retain": [5, 6, 7], "talent": 5, "member": [5, 7], "164": 5, "ncompens": 5, "equit": 5, "succe": 5, "health": [5, 7], "awai": [5, 7], "ngrowth": 5, "career": 5, "leadership": [5, 7], "nworkplac": 5, "workplac": 5, "ninclus": 5, "workforc": 5, "nengag": 5, "among": [5, 6, 7, 8], "gaug": 5, "sentiment": [5, 6, 8], "nhealth": 5, "everywher": 5, "crisi": 5, "visitor": 5, "navail": 5, "quarterli": 5, "q": [5, 6, 7], "amend": 5, "sec": [5, 8], "Such": [5, 7], "charg": 5, "investor": [5, 8], "aspx": 5, "websit": [5, 6, 7], "environment": [5, 7], "referenc": 5, "inact": 5, "textual": 5, "unknown": [5, 7], "advers": 5, "conjunct": 5, "consolid": 5, "nmacroeconom": 5, "facil": 5, "assembli": 5, "site": [5, 8], "nadvers": 5, "slow": 5, "recess": 5, "unemploy": 5, "inflat": 5, "tighter": 5, "currenc": 5, "monetari": 5, "contract": [5, 6], "logist": 5, "instabl": [5, 7], "inabl": 5, "financ": [5, 6, 7], "insolv": 5, "counterparti": 5, "debt": 5, "liquid": 5, "fair": [5, 7], "instrument": 5, "polit": [5, 7], "disput": 5, "geopolit": 5, "tension": [5, 7], "terror": 5, "accid": 5, "interrupt": 5, "npolit": 5, "outsourc": 5, "korea": 5, "vietnam": 5, "restrict": [5, 6, 7, 8], "tariff": 5, "export": 5, "portion": [5, 6], "revenu": [5, 8], "restructur": 5, "ceas": 5, "escal": [5, 7], "nmani": 5, "prone": [5, 7], "earthquak": 5, "climat": 5, "weather": 5, "plant": 5, "terrorist": [5, 7], "attack": [5, 7], "hostil": 5, "ransomwar": 5, "cybersecur": [5, 7], "labor": 5, "nsuch": 5, "imposs": [5, 6], "slowdown": 5, "outag": 5, "neg": [5, 7, 8], "pandem": 5, "covid": 5, "economi": 5, "imposit": 5, "stringent": [5, 6, 7], "travel": 5, "freight": 5, "movement": 5, "ramp": 5, "nfollow": 5, "expenditur": 5, "resum": 5, "exacerb": 5, "insur": 5, "nglobal": 5, "unabl": 5, "assur": [5, 7], "minor": [5, 7], "naddition": 5, "intensifi": 5, "seamlessli": 5, "nto": 5, "stimul": 5, "ndue": 5, "upgrad": 5, "quantiti": 5, "defect": 5, "defici": 5, "supersed": 5, "nsubstanti": 5, "transport": 5, "reimburs": 5, "warranti": 5, "unanticip": 5, "liabil": 5, "final": [5, 7, 8], "finish": [5, 7], "destin": 5, "prepay": 5, "termin": [5, 6], "recover": 5, "exposur": [5, 7], "nfutur": 5, "semiconductor": 5, "suffer": [5, 7], "constrain": [5, 6, 8], "shipment": 5, "unexpectedli": 5, "interfer": 5, "unsaf": [5, 7], "expos": [5, 7], "widespread": [5, 7], "vulner": [5, 7], "compromis": [5, 6, 7], "claim": [5, 6, 7], "intang": 5, "lost": [5, 7], "cancel": 5, "obsolet": 5, "exce": [5, 7], "realiz": 5, "accru": 5, "excess": 5, "impair": 5, "whenev": 5, "circumst": 5, "amount": [5, 7, 8], "carri": [5, 6, 8], "incur": 5, "unpredict": [5, 7], "obsolesc": 5, "forecast": [5, 7], "incorrectli": [5, 7, 8], "extens": [5, 6, 8], "issuanc": 5, "unknowingli": [5, 7], "notifi": 5, "preclud": 5, "bui": 5, "percept": 5, "android": 5, "playstat": 5, "nintendo": 5, "xbox": 5, "inclin": 5, "devot": 5, "dissatisfi": 5, "vast": [5, 7], "storefront": 5, "safari": 5, "union": [5, 7], "eu": [5, 7], "dma": 5, "narrow": [5, 6, 7], "scope": [5, 6, 7], "elimin": [5, 6], "nfailur": 5, "appeal": 5, "subscrib": 5, "nsome": 5, "manner": [5, 7], "nurtur": 5, "nmuch": 5, "chief": 5, "silicon": 5, "vallei": 5, "constantli": 5, "driver": [5, 6], "recruit": 5, "subsidi": 5, "staf": 5, "contractor": 5, "placement": 5, "increment": 5, "weaken": 5, "telecommun": 5, "war": 5, "virus": 5, "ins": 5, "incid": [5, 7], "ineffect": 5, "thing": [5, 8], "interf": 5, "imped": 5, "ship": 5, "nloss": 5, "unauthor": [5, 7], "confidenti": [5, 6], "encrypt": 5, "But": [5, 7, 8], "behalf": 5, "normal": [5, 7, 8], "investig": [5, 7], "penalti": [5, 6], "frequenc": [5, 6, 7], "actor": [5, 7], "circumv": [5, 7], "obfusc": 5, "forens": 5, "hinder": [5, 8], "recov": 5, "perpetr": 5, "profil": [5, 6], "authent": 5, "hack": [5, 7], "malfeas": 5, "faulti": 5, "password": 5, "irregular": 5, "fraudul": 5, "induc": 5, "disclos": [5, 8], "usernam": 5, "turn": [5, 7], "multifactor": 5, "unusu": 5, "freez": 5, "suspici": 5, "nwhile": 5, "ninvest": 5, "ongo": [5, 6], "contempl": 5, "endeavor": 5, "distract": 5, "tangibl": 5, "approv": 5, "oner": 5, "ventur": 5, "riski": 5, "leas": 5, "unfavor": 5, "arisen": 5, "ordinari": 5, "cours": [5, 6, 7], "resolv": [5, 6, 7], "sometim": [5, 8], "indemnif": 5, "indemnifi": 5, "alleg": 5, "magnitud": 5, "assert": 5, "royalti": 5, "vigor": 5, "defend": 5, "court": [5, 6], "internation": 5, "plaintiff": 5, "injunct": 5, "relief": 5, "nregardless": 5, "merit": 5, "recognit": [5, 6, 7], "settl": 5, "uncertain": 5, "disgorg": 5, "remedi": [5, 7], "worldwid": 5, "antitrust": 5, "bill": 5, "commerc": 5, "televis": 5, "film": 5, "anticorrupt": 5, "cash": 5, "repatri": 5, "launder": 5, "tax": 5, "wast": 5, "recycl": 5, "ncomplianc": 5, "impos": [5, 6, 7, 8], "agent": [5, 6, 7], "nregulatori": 5, "ban": [5, 7], "nexpect": 5, "increasingli": [5, 6, 7, 8], "greenhous": 5, "ga": 5, "emiss": 5, "civil": 5, "disagre": 5, "perceiv": 5, "feder": 5, "nfrom": 5, "noncompli": 5, "individu": [5, 6, 7], "lawsuit": [5, 6], "monopol": 5, "nfurther": 5, "earn": 5, "search": [5, 6, 7], "nthere": 5, "transfer": 5, "pass": [5, 6, 7, 8], "pend": 5, "inquiri": [5, 7], "government": 5, "entiti": [5, 6, 7, 8], "biometr": 5, "notif": 5, "permit": [5, 6, 8], "healthcar": [5, 6], "liabl": 5, "investigatori": 5, "cardhold": 5, "acquir": 5, "denomin": 5, "offset": 5, "strengthen": [5, 7], "nconvers": 5, "thu": 5, "hedg": 5, "deterior": 5, "sovereign": 5, "heighten": [5, 7], "worsen": 5, "A": [5, 6, 7, 8], "collater": 5, "bank": 5, "unsecur": 5, "subassembli": 5, "assembl": 5, "legisl": 5, "ireland": [5, 7], "singapor": 5, "organis": 5, "statutori": 5, "valuat": 5, "defer": 5, "bodi": [5, 7], "adequaci": 5, "ow": 5, "ngener": 5, "repurchas": 5, "dividend": 5, "consumm": 5, "declar": 5, "board": [5, 7], "unresolv": 5, "nnone": 5, "threat": [5, 7], "postur": 5, "25": [5, 6, 7], "2016": 5, "coordin": [5, 7], "committe": [5, 7], "oversight": [5, 7], "counsel": 5, "chair": 5, "headquart": 5, "cupertino": [5, 8], "center": [5, 7, 8], "formal": [5, 7, 8], "conclud": [5, 6], "uninstal": 5, "web": [5, 6, 7], "browser": 5, "june": 5, "contractu": 5, "desist": 5, "stai": [5, 6], "grant": 5, "ndepart": 5, "justic": 5, "depart": [5, 7], "doj": 5, "district": 5, "attornei": 5, "jersei": 5, "redress": [5, 7], "anticompetit": 5, "nonmonetari": 5, "defens": [5, 7], "nepic": 5, "epic": 5, "northern": 5, "unfair": [5, 7], "enjoin": 5, "extern": [5, 7], "januari": 5, "motion": 5, "oppos": [5, 7], "vacat": 5, "fourth": 5, "mine": 5, "nnot": 5, "aapl": 5, "nholder": 5, "na": [5, 7], "301": 5, "npurchas": 5, "nshare": 5, "nperiod": 5, "ttotal": 5, "taverag": 5, "npaid": 5, "nannounc": 5, "napproxim": 5, "That": [5, 7, 8], "nunder": 5, "njune": 5, "august": [5, 7], "nopen": 5, "negoti": [5, 7], "t35": 5, "697": 5, "t224": 5, "naugust": 5, "31": [5, 6], "t42": 5, "910": 5, "t221": 5, "39": [5, 6], "nseptemb": 5, "t33": 5, "653": 5, "t222": 5, "86": [5, 6], "ntotal": [5, 7], "t112": 5, "260": 5, "t89": 5, "074": 5, "110": 5, "10b5": 5, "reinvest": 5, "dow": 5, "supersector": 5, "27": [5, 7], "2019": 5, "n2218": 5, "tseptemb": 5, "t100": 5, "t207": 5, "t273": 5, "t281": 5, "t322": 5, "t430": 5, "t113": 5, "t156": 5, "t131": 5, "t155": 5, "t210": 5, "ndow": 5, "t146": 5, "t216": 5, "t215": 5, "nfirst": 5, "nsecond": 5, "nthird": 5, "sequoia": 5, "nfourth": 5, "plu": [5, 6], "nfiscal": 5, "six": 5, "realign": 5, "span": [5, 6, 7], "indirectli": 5, "n2024": 5, "tchang": 5, "t2023": 5, "t2022": 5, "namerica": 5, "t167": 5, "045": 5, "t3": 5, "t162": 5, "560": 5, "t169": 5, "658": 5, "neurop": 5, "t101": 5, "328": 5, "t7": 5, "294": 5, "t95": 5, "118": 5, "ngreater": 5, "t66": 5, "952": 5, "t72": 5, "559": 5, "t74": 5, "njapan": 5, "t25": 5, "052": 5, "t24": 5, "257": 5, "977": 5, "nrest": 5, "t30": 5, "t4": 5, "t29": 5, "615": 5, "t1": 5, "t391": 5, "035": 5, "t2": 5, "t383": 5, "285": 5, "t394": 5, "weak": [5, 7], "renminbi": 5, "yen": [5, 8], "t201": 5, "183": 5, "t200": 5, "583": 5, "t205": 5, "489": 5, "984": 5, "357": 5, "t40": 5, "177": [5, 7], "t26": 5, "694": 5, "t28": 5, "300": 5, "292": 5, "t37": 5, "005": 5, "t39": 5, "845": [5, 7], "t41": 5, "241": 5, "n96": 5, "169": 5, "t13": 5, "t85": 5, "t9": 5, "t78": 5, "129": [5, 7], "amort": 5, "bundl": 5, "flat": 5, "ngross": 5, "t109": 5, "633": 5, "t108": 5, "803": 5, "t114": 5, "728": 5, "t71": 5, "t60": 5, "345": 5, "t56": 5, "054": 5, "t180": 5, "683": 5, "148": 5, "t170": 5, "782": 5, "t36": 5, "t73": 5, "t70": 5, "t46": 5, "t44": 5, "t43": 5, "noper": 5, "t31": 5, "370": 5, "t5": 5, "915": 5, "t14": 5, "251": 5, "npercentag": 5, "t8": 5, "nsell": 5, "administr": 5, "097": 5, "932": 5, "094": 5, "t6": 5, "t57": 5, "467": 5, "t54": 5, "847": 5, "t51": 5, "t15": 5, "headcount": 5, "nprovis": 5, "749": 5, "t16": 5, "741": 5, "t19": 5, "neffect": 5, "nstatutori": 5, "t21": 5, "aid": [5, 7], "nliquid": 5, "unrestrict": 5, "140": 5, "ndebt": 5, "97": [5, 7], "payabl": 5, "promissori": 5, "nleas": 5, "nmanufactur": 5, "noncancel": 5, "ndeem": 5, "tcja": 5, "nstate": 5, "fund": [5, 6], "escrow": 5, "ncapit": 5, "95": [5, 7], "nrecent": 5, "pronounc": 5, "nincom": 5, "fasb": 5, "asu": 5, "09": [5, 7], "740": 5, "reconcili": 5, "reconcil": [5, 8], "disaggreg": 5, "prospect": 5, "novemb": [5, 7], "07": [5, 7, 8], "280": 5, "maker": 5, "codm": 5, "retrospect": 5, "ncritic": 5, "conform": [5, 8], "gaap": 5, "nuncertain": 5, "domest": 5, "taxat": 5, "resolut": 5, "conting": 5, "ninterest": 5, "forth": 5, "hypothet": 5, "nsensit": 5, "nhypothet": 5, "nrate": 5, "npotenti": 5, "n100": 5, "tenor": 5, "ndeclin": 5, "755": 5, "089": 5, "nterm": 5, "nincreas": 5, "t139": 5, "t194": 5, "nforeign": 5, "var": 5, "mont": 5, "carlo": 5, "interv": 5, "538": 5, "669": 5, "nindex": 5, "tpage": 5, "nconsolid": 5, "n29": 5, "n30": 5, "sheet": 5, "n31": 5, "n32": 5, "n33": 5, "nnote": 5, "n34": 5, "nreport": 5, "n48": 5, "nall": 5, "omit": [5, 8], "submiss": 5, "nyear": 5, "n2023": 5, "n2022": 5, "nnet": 5, "t294": 5, "866": 5, "t298": 5, "085": 5, "t316": 5, "199": 5, "t96": 5, "ncost": 5, "t185": 5, "233": 5, "t189": 5, "282": 5, "471": 5, "119": 5, "855": 5, "t22": 5, "075": 5, "352": 5, "t214": 5, "137": 5, "t223": 5, "546": 5, "t123": 5, "216": 5, "t119": 5, "437": 5, "t269": 5, "565": 5, "334": 5, "485": 5, "736": 5, "103": 5, "t93": 5, "995": 5, "t99": 5, "nearn": 5, "nbasic": 5, "ndilut": 5, "08": [5, 6, 8], "343": [5, 7], "783": 5, "744": 5, "215": 5, "963": 5, "095": 5, "812": 5, "547": 5, "325": 5, "819": 5, "nsee": 5, "translat": [5, 6, 7], "t395": 5, "765": 5, "511": 5, "unreal": 5, "832": 5, "t323": 5, "212": 5, "nadjust": 5, "337": 5, "717": 5, "394": 5, "138": 5, "850": 5, "563": 5, "104": 5, "t204": 5, "t253": 5, "816": 5, "899": 5, "272": 5, "t98": 5, "016": 5, "652": 5, "t88": 5, "531": 5, "nasset": 5, "ncurrent": 5, "ncash": 5, "943": 5, "965": 5, "228": 5, "590": 5, "naccount": 5, "410": 5, "508": 5, "nvendor": 5, "t32": 5, "833": 5, "477": 5, "ninventori": 5, "286": 5, "331": 5, "287": 5, "695": 5, "t152": 5, "987": 5, "t143": 5, "566": 5, "t91": 5, "479": 5, "544": 5, "t45": 5, "680": 5, "715": 5, "834": 5, "t64": 5, "758": 5, "t211": 5, "993": 5, "t209": 5, "017": 5, "t364": 5, "980": [5, 7], "t352": 5, "nliabil": 5, "t68": 5, "960": 5, "t62": 5, "611": 5, "304": 5, "t58": 5, "829": 5, "ndefer": 5, "249": 5, "061": 5, "ncommerci": 5, "967": 5, "985": 5, "t10": 5, "912": 5, "822": 5, "t176": 5, "392": 5, "t145": 5, "308": 5, "750": 5, "888": 5, "t49": 5, "848": 5, "638": 5, "t308": 5, "030": [5, 6], "t290": 5, "ncommit": 5, "nsharehold": 5, "400": 5, "116": 5, "786": 5, "550": 5, "n83": 5, "276": 5, "naccumul": 5, "deficit": 5, "154": 5, "214": 5, "172": 5, "452": 5, "950": 5, "146": [5, 7], "t50": 5, "672": 5, "t63": 5, "090": 5, "nbegin": 5, "849": 5, "365": 5, "423": 5, "346": 5, "175": 5, "withheld": 5, "settlement": 5, "521": 5, "971": 5, "t12": 5, "034": 5, "t11": 5, "nend": 5, "t83": 5, "nretain": 5, "068": 5, "562": 5, "ndividend": 5, "218": 5, "793": 5, "612": 5, "099": 5, "454": 5, "846": 5, "77": [5, 6], "046": 5, "186": 5, "109": 5, "t163": 5, "rsu": 5, "t0": 5, "98": [5, 6], "94": [5, 6, 7], "737": 5, "929": 5, "ndepreci": 5, "445": 5, "519": 5, "688": 5, "038": 5, "266": 5, "227": 5, "006": 5, "788": 5, "356": 5, "271": 5, "520": 5, "618": 5, "484": 5, "731": 5, "684": 5, "499": 5, "020": 5, "889": 5, "448": 5, "552": 5, "031": 5, "t118": 5, "254": 5, "t110": 5, "543": 5, "t122": 5, "151": 5, "48": [5, 6], "656": 5, "513": 5, "76": [5, 7], "923": 5, "nproce": 5, "211": 5, "686": 5, "917": 5, "135": 5, "828": 5, "446": 5, "447": 5, "959": 5, "708": 5, "086": 5, "935": 5, "705": 5, "354": 5, "nfinanc": 5, "441": 5, "431": 5, "223": [5, 7], "234": [5, 7], "025": 5, "841": 5, "nrepurchas": 5, "949": 5, "89": [5, 7], "402": 5, "465": 5, "nrepay": 5, "958": 5, "repay": 5, "978": 5, "955": 5, "361": 5, "581": 5, "160": 5, "121": 5, "983": 5, "488": 5, "794": 5, "760": 5, "nsupplement": 5, "102": 5, "t18": 5, "679": 5, "573": 5, "33": [5, 6, 7], "nbasi": 5, "prior": [5, 7], "reclassifi": 5, "nrevenu": 5, "remit": [5, 7], "straight": 5, "vest": 5, "sold": 5, "nderiv": 5, "nonleas": 5, "34": [5, 7], "entitl": 5, "commenc": 5, "deliveri": 5, "stand": 5, "ssp": 5, "icloud": 5, "siri": 5, "discount": 5, "undeliv": 5, "unbil": 5, "n26": 5, "n37": 5, "moder": [5, 6], "64": [5, 6, 7], "dilut": 5, "nnumer": 5, "ndenomin": 5, "nweight": 5, "312": 5, "316": 5, "856": 5, "antidilut": 5, "tunreal": 5, "ngain": 5, "tfair": 5, "nvalu": 5, "tcash": 5, "nequival": 5, "tcurrent": 5, "tnon": 5, "t27": 5, "nlevel": 5, "nmonei": 5, "t778": 5, "nmutual": 5, "n515": 5, "t105": 5, "t617": 5, "nsubtot": 5, "293": 5, "395": 5, "nu": 5, "treasuri": 5, "516": 5, "t212": 5, "087": 5, "380": 5, "159": 5, "t703": 5, "t17": 5, "568": 5, "158": 5, "810": 5, "ncertif": 5, "deposit": 5, "t873": 5, "t387": 5, "t478": 5, "066": 5, "ncorpor": 5, "t65": 5, "622": 5, "t270": 5, "953": 5, "939": 5, "027": 5, "t47": 5, "886": 5, "nmunicip": 5, "t412": 5, "t405": 5, "t190": 5, "nmortgag": 5, "595": 5, "t175": 5, "403": 5, "t23": 5, "367": 5, "278": [5, 7], "t132": 5, "t583": 5, "635": 5, "t128": 5, "056": 5, "966": 5, "t34": 5, "t160": 5, "t688": 5, "650": 5, "36": [5, 6, 7], "359": [5, 7], "t481": 5, "n442": 5, "t428": 5, "t923": 5, "t909": 5, "406": 5, "114": 5, "468": 5, "136": 5, "t271": 5, "533": 5, "048": [5, 6], "491": 5, "332": 5, "t320": 5, "t608": 5, "t76": 5, "840": 5, "956": 5, "890": 5, "t20": 5, "627": 5, "243": 5, "t628": 5, "t602": 5, "t192": 5, "t410": 5, "735": 5, "636": 5, "t344": 5, "t144": 5, "470": 5, "657": 5, "831": 5, "125": 5, "162": 5, "t173": 5, "752": 5, "corrobor": 5, "mortgag": 5, "classifi": [5, 7], "37": [5, 7], "swap": 5, "remeasur": 5, "notion": 5, "069": 5, "730": 5, "575": 5, "493": 5, "t104": 5, "777": 5, "nhedg": 5, "433": 5, "505": 5, "247": [5, 7], "ntrade": 5, "41": [5, 6, 7], "44": [5, 7], "depreci": 5, "nland": 5, "690": 5, "nmachineri": 5, "t80": 5, "205": [5, 6], "314": 5, "nleasehold": 5, "839": 5, "599": 5, "73": [5, 6, 7], "884": 5, "852": 5, "t55": 5, "906": 5, "601": 5, "703": 5, "010": 5, "457": 5, "634": 5, "391": 5, "neuropean": 5, "opinion": [5, 7], "1991": 5, "2007": 5, "irish": 5, "branch": 5, "2003": 5, "2014": 5, "2015": 5, "minist": 5, "juli": [5, 7], "annul": 5, "ecj": 5, "hear": 5, "asid": 5, "confirm": 5, "unrecogn": 5, "nfeder": 5, "571": 5, "080": 5, "644": 5, "265": 5, "801": 5, "726": 5, "570": 5, "298": 5, "49": [5, 7], "t84": 5, "428": 5, "603": 5, "483": [5, 7], "t347": 5, "t669": 5, "076": 5, "830": 5, "419": 5, "072": 5, "pretax": 5, "72": [5, 7], "ncomput": 5, "885": 5, "012": 5, "124": 5, "518": 5, "nimpact": 5, "246": 5, "311": 5, "366": 5, "397": 5, "nexcess": 5, "893": 5, "871": 5, "192": [5, 7], "739": 5, "ntax": 5, "carryforward": 5, "302": 5, "naccru": 5, "413": [5, 7], "421": 5, "nunreal": 5, "173": 5, "168": 5, "873": 5, "743": 5, "nless": 5, "374": 5, "007": 5, "369": 5, "551": 5, "998": 5, "nright": 5, "179": 5, "nminimum": 5, "674": 5, "940": 5, "t511": 5, "t455": 5, "t490": 5, "805": 5, "202": 5, "indefinit": 5, "temporari": 5, "727": 5, "044": 5, "284": 5, "ndecreas": 5, "386": 5, "463": 5, "982": 5, "542": 5, "936": 5, "070": 5, "expir": 5, "statut": 5, "229": 5, "494": 5, "closur": 5, "intercompani": 5, "exceed": [5, 7], "multiyear": 5, "exercis": 5, "noncash": 5, "rou": 5, "tfinanci": 5, "t2024": 5, "tother": 5, "661": 5, "tproperti": 5, "015": 5, "303": 5, "676": 5, "t165": 5, "t752": 5, "t859": 5, "430": 5, "842": [5, 7], "tfinanc": 5, "n2025": 5, "820": 5, "t171": 5, "991": 5, "n2026": 5, "914": 5, "n2027": 5, "t59": 5, "733": 5, "n2028": 5, "360": 5, "t38": 5, "398": 5, "n2029": 5, "187": 5, "nthereaft": 5, "t837": 5, "undiscount": 5, "790": 5, "imput": 5, "376": 5, "534": 5, "t896": 5, "borrow": 5, "proce": 5, "nine": [5, 7], "nmatur": 5, "333": 5, "264": 5, "948": 5, "645": 5, "309": 5, "arrear": 5, "namount": 5, "n2013": 5, "nfix": 5, "2062": 5, "t97": 5, "341": 5, "03": 5, "65": [5, 7], "t106": 5, "572": 5, "n97": 5, "nunamort": 5, "321": 5, "358": 5, "113": 5, "662": 5, "930": 5, "342": 5, "800": 5, "180": 5, "88": 5, "ndure": 5, "425": 5, "426": 5, "372": 5, "589": 5, "055": 5, "appreci": 5, "four": [5, 6, 7], "holder": [5, 6], "n2014": 5, "bonu": 5, "nrestrict": 5, "nnumber": 5, "nrsu": 5, "ngrant": 5, "naggreg": 5, "nfair": 5, "nbalanc": 5, "t240": 5, "427": [5, 7], "t75": 5, "t150": 5, "861": 5, "501": 5, "768": 5, "87": [5, 6, 7], "101": [5, 7], "878": 5, "144": 5, "t127": 5, "t135": 5, "91": [5, 7], "456": 5, "78": [5, 6, 7], "59": [5, 7], "t140": 5, "326": 5, "t158": 5, "204": 5, "350": 5, "002": [5, 6], "nuncondit": 5, "uncondit": 5, "206": 5, "440": 5, "156": 5, "t633": 5, "t670": 5, "226": 5, "45": 5, "nconting": 5, "accrual": 5, "nconcentr": 5, "attribut": [5, 6, 7, 8], "46": 5, "t67": 5, "098": 5, "082": 5, "062": 5, "569": 5, "895": 5, "458": 5, "207": 5, "nonrecur": 5, "t142": 5, "196": 5, "t138": 5, "t147": 5, "859": 5, "nchina": 5, "n66": 5, "t181": 5, "887": 5, "t172": 5, "269": 5, "nlong": 5, "664": 5, "797": 5, "778": 5, "219": 5, "nopinion": 5, "nwe": 5, "fairli": 5, "pcaob": 5, "sponsor": 5, "treadwai": 5, "2013": 5, "unqualifi": 5, "thereon": 5, "nthese": 5, "misstat": 5, "fraud": [5, 7], "ndescript": 5, "naudit": 5, "nhow": 5, "nmatter": 5, "qualifi": 5, "letter": 5, "advisor": 5, "ernst": 5, "llp": 5, "auditor": 5, "2009": 5, "nsan": 5, "jose": 5, "nnovemb": 5, "coso": 5, "nour": 5, "ndefinit": 5, "disposit": 5, "receipt": 5, "nevalu": 5, "nbase": 5, "supervis": [5, 6, 7, 8], "13a": 5, "15d": 5, "ninher": 5, "paragraph": 5, "51": [5, 7, 8], "ninsid": 5, "deirdr": 5, "brien": 5, "vice": 5, "presid": 5, "affirm": 5, "april": 5, "withhold": 5, "remitt": 5, "mr": 5, "copi": 5, "solicit": 5, "00042": 5, "nincorpor": 5, "texhibit": 5, "descript": [5, 6, 7, 8], "tform": 5, "tfile": 5, "nrestat": 5, "namend": 5, "bylaw": 5, "nindentur": 5, "york": [5, 6, 8], "mellon": 5, "truste": 5, "noffic": 5, "certif": 5, "2018": 5, "85": [5, 6, 7], "05": 5, "2044": 5, "februari": 5, "2045": 5, "900": 5, "700": [5, 6], "250": [5, 7], "2036": 5, "2046": 5, "450": 5, "2047": 5, "2049": 5, "2030": 5, "2050": 5, "2060": 5, "2028": 5, "2041": 5, "2061": 5, "2032": 5, "2052": 5, "54": 5, "2033": 5, "2053": 5, "n12": 5, "nsubsidiari": 5, "n23": 5, "nconsent": 5, "n24": 5, "npower": 5, "signatur": 5, "nrule": 5, "nsection": 5, "1350": 5, "n101": 5, "ninlin": 5, "xbrl": 5, "n104": 5, "inlin": 5, "compensatori": 5, "herewith": 5, "furnish": 5, "herebi": 5, "undertak": 5, "56": [5, 6, 7], "nsignatur": 5, "npursuant": 5, "duli": 5, "undersign": 5, "thereunto": 5, "ndate": 5, "nby": 5, "luca": [5, 8], "maestri": 5, "nluca": 5, "nsenior": 5, "nchief": 5, "nknow": 5, "THESE": 5, "appoint": 5, "cook": 5, "jointli": 5, "her": 5, "substitut": 5, "him": 5, "thereto": 5, "therewith": 5, "ratifi": 5, "virtu": 5, "hereof": 5, "nname": 5, "ttitl": 5, "tdate": 5, "tchief": 5, "tnovemb": 5, "ntimothi": 5, "tsenior": 5, "kondo": 5, "nchri": 5, "wanda": 5, "austin": 5, "nwanda": 5, "gorski": 5, "tdirector": 5, "nalex": 5, "jung": 5, "nandrea": 5, "arthur": 5, "levinson": 5, "narthur": 5, "monica": 5, "lozano": 5, "nmonica": 5, "ronald": 5, "sugar": 5, "nronald": 5, "susan": 5, "wagner": 5, "nsusan": 5, "57": [5, 6], "turbo": [5, 6, 8], "outlin": [5, 6, 7], "invdestacksmeticsisdict": 5, "setispect": 5, "20cyan": 5, "evaluationseld": 5, "anvis": 5, "droitent": 5, "discernminerv": 5, "versbobprefvers": 5, "vo\u8be5": 5, "option\u548c": 5, "meio": 5, "\u0432\u0440\u0435\u043ccisco": 5, "dellaischenpoihscap": 5, "geme": 5, "gettim": 5, "unscal": 5, "vocabulari": [5, 6, 8], "closer": 5, "sharpen": 5, "uniform": 5, "raschka": 5, "repetit": [5, 8], "radic": 5, "grappl": 5, "safer": [5, 7], "fascin": 5, "spontan": 5, "answer": [5, 6, 7, 8], "aren": [5, 6], "linear": 5, "absent": [5, 7], "coax": 5, "journei": 5, "suddenli": 5, "manifest": 5, "deliber": [5, 7], "contend": 5, "rethink": 5, "tutor": 5, "children": [5, 7], "verifi": [5, 6, 8], "predefin": [5, 8], "weren": 5, "kind": 5, "usual": 5, "quantif": 5, "contamin": [5, 7], "unseen": [5, 7], "longitudin": 5, "mostli": [5, 8], "latter": 5, "tailor": [5, 7], "great": [5, 6, 8], "cognit": 5, "misinform": [5, 7], "tempor": 5, "disclaim": 5, "referr": 5, "incorrect": [5, 7], "demograph": [5, 7], "stereotyp": [5, 7], "societ": [5, 7], "pii": [5, 7], "anonym": 5, "leakag": [5, 7], "carryov": 5, "fallaci": 5, "think": [5, 6, 7], "idiom": 5, "sarcasm": 5, "terminologi": 5, "lingual": 5, "misunderstand": 5, "syntax": 5, "scan": 5, "compat": [5, 6, 8], "overconfid": 5, "clariti": [5, 7, 8], "audienc": 5, "densiti": 5, "satisfact": [5, 8], "misus": [5, 7], "moral": 5, "co2": 5, "etc": [5, 8], "palm": [5, 6], "easi": [5, 6, 7], "synthet": [5, 6, 7, 8], "timeout": 5, "inter": 5, "rater": 5, "ti": 5, "holist": [5, 7], "experiment": [5, 6, 8], "vi": 5, "categor": [5, 6, 7, 8], "intrins": [5, 6], "extrins": 5, "sequenc": [5, 6, 8], "perplex": [5, 6], "downstream": [5, 8], "synthesi": 5, "discret": 5, "prefix": [5, 7], "roug": 5, "bleu": 5, "bilingu": 5, "understudi": 5, "overlap": 5, "favor": [5, 6, 8], "breviti": 5, "insensit": 5, "semant": [5, 8], "orient": [5, 7], "gist": 5, "meteor": 5, "synonym": 5, "paraphras": 5, "alongsid": [5, 7], "computation": 5, "cider": 5, "consensu": 5, "tf": 5, "idf": 5, "caption": 5, "reliant": 5, "corpu": [5, 6], "ter": 5, "edit": [5, 7], "hypothesi": 5, "penal": 5, "bertscor": 5, "contextu": [5, 7], "bert": 5, "spice": 5, "proposit": [5, 6], "scene": [5, 7], "analyst": 5, "rouge_1": 5, "rouge_2": 5, "ideal": [5, 6, 7, 8], "setup": [5, 6, 7, 8], "evaluate_summari": 5, "unigram": 5, "bigram": 5, "absl": 5, "py": [5, 8], "rouge_scor": 5, "generated_summari": 5, "reference_summari": 5, "google_bleu": 5, "bleu_scor": 5, "rouge1": 5, "rouge2": 5, "arbitrari": 5, "chosen": [5, 7], "sentence1": 5, "cat": [5, 7], "sat": 5, "mat": 5, "sentence2": 5, "ate": 5, "3333333333333333": 5, "7272727272727272": 5, "4444444444444445": 5, "generate_summari": 5, "summir": 5, "liner": 5, "evaluate_summary_model": 5, "model_benchmark": 5, "models_test": 5, "benchmark_summari": 5, "model_summari": 5, "evaluation_result": 5, "statu": 5, "concis": [5, 6], "element": [5, 7, 8], "verbos": [5, 6, 7, 8], "peripher": 5, "quit": [5, 6, 8], "convei": 5, "breadth": 5, "Of": [5, 6, 7], "vibe": 5, "visualize_prompt_comparison": 5, "matplotlib": 5, "radar": 5, "radar_plot": 5, "tmp": 5, "ipykernel_1652501": 5, "940173201": 5, "userwarn": [5, 8], "figurecanvasagg": 5, "largest": [5, 6], "sarmah": 5, "granular": [5, 6], "likert": 5, "ensembl": 5, "repeatedli": 5, "fluenci": 5, "refin": 5, "integ": [5, 8], "rubric": 5, "hollist": 5, "judgeevalu": 5, "grammar": [5, 6, 8], "evaluate_with_llm": 5, "criterion": 5, "judge_model": 5, "candidate_summari": 5, "grammat": 5, "y": [5, 7, 8], "z": 5, "w": [5, 6, 7], "benchmark_model": 5, "test_model": 5, "input_text": [5, 6], "trillion": [5, 6], "evals_list": 5, "1775618912": 5, "slightli": 5, "drift": [5, 7], "lowest": [5, 6], "firstli": 5, "overhead": [5, 6], "egocentr": 5, "tight": 5, "medicin": [5, 7], "glider": 5, "deshpand": 5, "3b": 5, "685": 5, "aplic": 5, "earlier": [5, 7], "depict": [5, 7, 8], "multilingu": [5, 6, 7], "golden": 5, "languang": 5, "arena": 5, "randomli": 5, "customiz": [5, 6, 7], "irrelev": 5, "unhelp": [5, 7], "occasion": 5, "rare": 5, "perfectli": 5, "cater": [5, 6], "critiqu": [5, 7], "elo": 5, "exam": 5, "probe": [5, 7], "certifi": 5, "began": [5, 6], "glue": 5, "entail": [5, 6], "superglu": 5, "successor": 5, "grew": 5, "big": 5, "bench": [5, 6], "srivastava": 5, "truthfulqa": [5, 6], "multitask": 5, "hendryck": [5, 7], "multidisciplinari": 5, "stanford": 5, "helm": 5, "multidimension": 5, "surround": [5, 6, 7, 8], "humanev": [5, 6], "lmsy": 5, "brought": 5, "dialogu": [5, 6], "chiang": 5, "gather": 5, "alpacaev": 5, "duboi": 5, "mt": 5, "argilla": 5, "mila": 5, "mit": [5, 6], "contributor": [5, 6, 8], "western": 5, "centric": 5, "divid": [5, 7], "subset": [5, 7], "agnost": 5, "dialect": 5, "render": [5, 7], "crowdsourc": 5, "livebench": 5, "white": [5, 7], "resili": [5, 7], "meaningfulli": 5, "zebralog": 5, "grid": 5, "puzzl": 5, "brailsford": 5, "1999": 5, "lsat": 5, "hous": 5, "clue": 5, "deduct": 5, "programmat": [5, 8], "2x2": 5, "6x6": 5, "shot": [5, 7, 8], "reductio": 5, "ad": [5, 6, 7, 8], "absurdum": 5, "hard": 5, "10b": 5, "counterfactu": 5, "came": 5, "arc": 5, "prize": [5, 7], "chollet": 5, "mike": [5, 7], "knoop": 5, "founder": 5, "zapier": 5, "fran\u00e7oi": 5, "creator": [5, 6], "agi": 5, "kera": 5, "genuin": 5, "possess": 5, "elementari": 5, "novelti": 5, "interpol": 5, "synthes": 5, "fly": 5, "brute": 5, "pixel": 5, "unbeaten": 5, "win": [5, 6], "poorli": 5, "recombin": 5, "spur": [5, 7], "takeawai": 5, "vertic": [5, 7], "finbench": 5, "legalbench": 5, "guha": 5, "berkelei": 5, "bfcl": 5, "patil": 5, "fourrier": 5, "bespok": 5, "sdk": 5, "autoregress": 5, "sub": [5, 6], "liter": 5, "disturb": 5, "zero": [5, 6, 7, 8], "varianc": [5, 7], "yt": 5, "ut": 5, "ol": 5, "heteroscedast": 5, "regress": 5, "bivari": 5, "evaluation_track": 5, "evaluationtrack": 5, "model_config": 5, "basemodelconfig": 5, "parallelismmanag": 5, "pipelineparamet": 5, "envconfig": 5, "is_accelerate_avail": 5, "datetim": 5, "timedelta": 5, "initprocessgroupkwarg": 5, "create_evaluation_pipelin": 5, "cache_dir": 5, "float16": 5, "max_sampl": 5, "kwargs_handl": 5, "3000": 5, "save_detail": 5, "pipeline_param": 5, "launcher_typ": 5, "env_config": 5, "override_batch_s": 5, "use_chat_templ": 5, "trust_remote_cod": 5, "pipeline_paramet": 5, "schemat": 5, "vllm": [5, 8], "tgi": 5, "num_few_shot": 5, "bar": 5, "bigbench": 5, "winogrand": 5, "hellaswag": 5, "nlp": [5, 6, 7], "save_and_push_result": 5, "show_result": 5, "model_arg": 5, "send": [5, 6, 7, 8], "serverless": 5, "inference_server_address": 5, "inference_server_auth": 5, "model_id": 5, "null": 5, "bash": [5, 6], "command": [5, 6], "model_config_path": 5, "endpoint_model": 5, "llama3": 5, "qwen2": [5, 6, 8], "alibaba": [5, 6, 8], "5b": [5, 6, 8], "hui": [5, 6], "allal": [5, 6], "cluster": 5, "noteworthi": [5, 6], "grain": [5, 6, 8], "salt": [5, 8], "modular": 5, "offici": [5, 8], "revisit": 5, "trace": 5, "langchain_tracing_v2": 5, "langchain_api_kei": 5, "hf_evalu": 5, "langsmith_evalu": 5, "ls_client": 5, "dataset_nam": 5, "create_dataset": 5, "create_exampl": 5, "dataset_id": 5, "calculate_scor": 5, "reference_output": 5, "oai_client": 5, "xp_model_nam": 5, "lastli": 5, "run_evalu": 5, "And": [5, 6, 7], "upload_result": 5, "experiment_prefix": 5, "num_repetit": 5, "386a3620": 5, "9e1cc3cb": 5, "9d6a": 5, "4356": 5, "ab34": 5, "138e0abe8be4": 5, "8741976e": 5, "5268": 5, "4b75": 5, "949f": 5, "99477dde5d64": 5, "selectedsess": 5, "b831dc1e": 5, "90bc": 5, "4ed8": 5, "8080": [5, 6], "fb42444724d6": 5, "4it": 5, "latest": [5, 6, 7, 8], "tobia": [5, 8], "evaluate_modul": 5, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 5, "tue": 5, "nov": [5, 6], "couldn": 5, "5it": 5, "5053784e": 5, "64445871": 5, "a53c": 5, "44b1": 5, "a422": 5, "4f49b2f9656f": 5, "69": [5, 7], "4b29f3c9": 5, "9ef7e39a": 5, "2add": 5, "410c": 5, "89f8": 5, "9f1a8b198cf1": 5, "61": [5, 7], "insert": 5, "combined_df": 5, "concat": [5, 7], "ignore_index": [5, 7], "execution_tim": 5, "example_id": 5, "333333": 5, "224388": 5, "feb10f92": 5, "3167": 5, "41f3": 5, "bb1c": 5, "d271153a31a8": 5, "5b196b22": 5, "9f4c": 5, "489c": 5, "b020": 5, "7823208b42d6": 5, "348101": 5, "722464": 5, "c310f159": 5, "064a": 5, "4035": 5, "97c3": 5, "a25bbf43abc2": 5, "386076": 5, "704104": 5, "f7f24899": 5, "dd50": 5, "409e": 5, "93cc": 5, "6fb1622b60bf": 5, "443038": 5, "725059": 5, "242856d6": 5, "efb5": 5, "4101": 5, "b1cf": 5, "5805532838ac": 5, "373418": 5, "795302": 5, "ce975169": 5, "a0ab": 5, "40ce": 5, "8e32": 5, "efa28d06079d": 5, "stat": [5, 6], "groupbi": [5, 7], "agg": [5, 7], "sort": 5, "sort_valu": 5, "subplot": 5, "pyplot": 5, "plt": 5, "ax1": 5, "ax2": 5, "figsiz": 5, "2ecc71": 5, "3498db": 5, "e74c3c": 5, "bleu_mean": 5, "bleu_std": 5, "enumer": [5, 7], "errorbar": 5, "yerr": 5, "fmt": 5, "markers": 5, "capsiz": 5, "set_ylabel": 5, "set_titl": 5, "set_xtick": 5, "set_xticklabel": 5, "rotat": 5, "set_ylim": 5, "bottom": 5, "legend": 5, "exec_mean": 5, "exec_std": 5, "tight_layout": 5, "ndetail": 5, "4038": 5, "0453": 5, "7815": 5, "0433": 5, "3768": 5, "0424": 5, "8343": 5, "2208": 5, "3519": 5, "0775": 5, "9122": 5, "1482": 5, "377": 5, "042": 5, "078": 5, "slower": [5, 7], "04": [5, 6], "interestingli": 5, "decoupl": 5, "reload": 5, "facilit": [5, 7], "promptfooconfig": 5, "model_comparison": 5, "pretti": [5, 7], "dump": 5, "default_flow_styl": 5, "sort_kei": 5, "prompt1": 5, "defaulttest": 5, "1000m": 5, "eval_data": 5, "latency_m": 5, "totallatencym": 5, "token_usag": 5, "tokenusag": 5, "assert_pass": 5, "assertpasscount": 5, "assert_fail": 5, "assertfailcount": 5, "prompt_token": [5, 6], "num_request": 5, "numrequest": 5, "2463": 5, "000035": 5, "3773": 5, "004620": 5, "1669": 5, "000091": 5, "1669m": 5, "highest": [5, 6, 8], "3773m": 5, "00462": 5, "promptfool": 5, "manual": [5, 6, 7, 8], "redefin": 5, "prompt_comparison": 5, "prompt2": 5, "prompt3": 5, "prompt_fil": 5, "prompt_cont": 5, "BE": 5, "again": 5, "prompt_id": 5, "promptid": 5, "gradingresult": 5, "df_raw": 5, "reset_index": [5, 7], "eas": [5, 6, 7, 8], "hf": [5, 6], "plain": [5, 6], "vanilla": 5, "defi": 5, "accustom": 5, "legaci": 5, "unsustain": 5, "prd": 5, "cultiv": [5, 7], "organiz": 5, "stagnat": 5, "alb": [5, 6], "loubna": [5, 6], "anton": [5, 6], "lozhkov": [5, 6], "bakouch": [5, 6], "gabriel": [5, 6, 7], "mart\u00edn": [5, 6, 7], "bl\u00e1zquez": [5, 6], "lewi": [5, 6], "tunstal": [5, 6], "agust\u00edn": [5, 6], "piquer": [5, 6], "andr": [5, 6], "marafioti": [5, 6], "cyril": [5, 6], "zakka": [5, 6], "leandro": [5, 6], "werra": [5, 6], "wolf": [5, 6], "are24": 5, "judgearena": 5, "bps99": 5, "salli": 5, "pott": 5, "barbara": 5, "557": [5, 7], "sciencedirect": 5, "s0377221798003646": 5, "doi": [5, 7, 8], "1016": 5, "s0377": 5, "2217": 5, "00364": 5, "ctj": 5, "jerri": [5, 7], "tworek": [5, 7], "heewoo": [5, 7], "jun": [5, 7], "qime": [5, 7], "henriqu": [5, 7], "pond": [5, 7], "de": [5, 7], "oliveira": [5, 7], "pinto": [5, 7], "harri": [5, 7], "yuri": 5, "burda": 5, "greg": [5, 7], "brockman": [5, 7], "raul": [5, 7], "puri": [5, 7], "gretchen": [5, 7], "krueger": [5, 7], "petrov": [5, 7], "heidi": 5, "khlaaf": 5, "girish": [5, 7], "sastri": [5, 7], "brook": [5, 7], "chan": [5, 7], "grai": [5, 7], "ryder": [5, 7], "mikhail": [5, 7], "pavlov": [5, 7], "alethea": [5, 7], "lukasz": 5, "kaiser": [5, 7], "mohammad": [5, 7], "bavarian": [5, 7], "clemen": [5, 7], "winter": [5, 7], "philipp": 5, "tillet": [5, 7], "felip": [5, 7], "petroski": [5, 7], "dave": [5, 7], "cum": [5, 7], "plappert": 5, "fotio": 5, "chantzi": [5, 7], "barn": 5, "ariel": 5, "herbert": 5, "voss": [5, 7], "hebgen": 5, "guss": 5, "nichol": 5, "paino": [5, 7], "nikola": [5, 7], "tezak": [5, 7], "babuschkin": [5, 7], "suchir": [5, 7], "balaji": [5, 7], "shantanu": [5, 7], "jain": [5, 7], "hess": [5, 7], "carr": 5, "josh": [5, 7], "achiam": [5, 7], "vedant": 5, "misra": 5, "evan": [5, 6, 7], "morikawa": [5, 7], "matthew": 5, "knight": [5, 7], "mile": [5, 7], "brundag": [5, 7], "mira": [5, 7], "murati": [5, 7], "kati": [5, 7], "mayer": [5, 7], "bob": [5, 7, 8], "mcgrew": [5, 7], "ilya": [5, 7], "sutskev": [5, 7], "wojciech": [5, 7], "zaremba": [5, 7], "2107": 5, "03374": 5, "cz": 5, "lianmin": 5, "ying": 5, "sheng": 5, "anastasio": 5, "angelopoulo": 5, "tianl": 5, "dacheng": 5, "banghua": 5, "jordan": [5, 7], "gonzalez": 5, "ion": 5, "stoica": 5, "04132": 5, "cho24a": 5, "francoi": 5, "arcpriz": 5, "cho24b": 5, "drcw": 5, "darshan": 5, "selvan": 5, "sunitha": 5, "ravi": 5, "sky": 5, "ch": 5, "bartosz": 5, "mielczarek": 5, "anand": [5, 7], "kannappan": [5, 7], "qian": [5, 7], "14140": 5, "dglh24": 5, "yann": 5, "bal\u00e1z": 5, "galambosi": 5, "tatsunori": 5, "hashimoto": 5, "debia": 5, "04475": 5, "fac24a": 5, "wiki": [5, 8], "fac24b": 5, "fac24c": 5, "model_doc": 5, "fac24d": 5, "cookbook": 5, "llm_judg": 5, "fac24f": 5, "fhwt23": 5, "cl\u00e9mentin": 5, "nathan": 5, "habib": 5, "gnh": 5, "julian": 5, "nyarko": 5, "ho": 5, "r\u00e9": 5, "adam": [5, 7], "chilton": 5, "aditya": [5, 7], "narayana": 5, "chohla": 5, "brandon": [5, 7, 8], "waldon": 5, "rockmor": 5, "diego": 5, "zambrano": 5, "dmitri": 5, "talisman": 5, "enam": 5, "hoqu": 5, "faiz": 5, "surani": 5, "frank": [5, 7], "fagan": 5, "galit": 5, "sarfati": 5, "gregori": 5, "dickinson": 5, "haggai": 5, "porat": 5, "hegland": 5, "jessica": [5, 7], "joe": [5, 7], "nudel": 5, "joel": [5, 7], "niklau": 5, "nai": 5, "choi": 5, "margaret": [5, 6], "hagan": 5, "megan": 5, "livermor": 5, "nikon": 5, "rasumov": 5, "rahe": 5, "nil": 5, "holzenberg": 5, "noam": 5, "kolt": 5, "henderson": 5, "rehaag": 5, "sharad": 5, "shang": 5, "spencer": 5, "sunni": 5, "gandhi": 5, "zur": 5, "varun": 5, "iyer": 5, "zehua": 5, "2308": 5, "11462": 5, "hbb": 5, "collin": 5, "burn": 5, "steven": [5, 7], "basart": [5, 7], "zou": [5, 7], "manta": [5, 7], "mazeika": [5, 7], "03300": 5, "hbd": 5, "maxwel": 5, "forb": 5, "yejin": 5, "curiou": 5, "neural": [5, 8], "degener": 5, "1904": 5, "09751": 5, "hyc": [5, 6], "binyuan": [5, 6], "zeyu": [5, 6], "cui": [5, 6], "jiaxi": [5, 6], "dayiheng": [5, 6], "tianyu": [5, 6], "jiajun": [5, 6], "kai": [5, 6, 7], "dang": [5, 6], "coder": [5, 6], "preprint": [5, 6, 8], "2409": [5, 6, 7], "12186": [5, 6], "lx": 5, "zhen": 5, "xiaohan": 5, "jia": 5, "yuxuan": 5, "lai": 5, "chongyang": 5, "shuai": 5, "nlg": 5, "07103": 5, "lbl": 5, "bommasani": 5, "toni": 5, "dimitri": 5, "tsipra": 5, "dilara": 5, "soylu": 5, "michihiro": 5, "yasunaga": 5, "yian": 5, "deepak": 5, "narayanan": 5, "yuhuai": 5, "newman": 5, "binhang": 5, "bobbi": 5, "ce": 5, "christian": [5, 7], "cosgrov": 5, "acosta": 5, "nava": [5, 7], "drew": 5, "hudson": 5, "zelikman": 5, "esin": 5, "durmu": 5, "faisal": 5, "ladhak": 5, "frieda": 5, "rong": 5, "ren": [5, 6], "huaxiu": 5, "yao": [5, 7, 8], "jue": 5, "keshav": 5, "santhanam": 5, "laurel": 5, "lucia": 5, "mert": 5, "yuksekgonul": 5, "mirac": 5, "suzgun": 5, "niladri": 5, "chatterji": 5, "omar": 5, "khattab": 5, "chi": [5, 8], "sang": 5, "shibani": [5, 7], "santurkar": [5, 7], "surya": 5, "icard": 5, "tianyi": 5, "vishrav": 5, "chaudhari": 5, "xuechen": 5, "yuhui": 5, "yuta": 5, "koreeda": 5, "2211": 5, "09110": 5, "lbc24": 5, "ronan": 5, "bra": 5, "allenai": 5, "lhe22": [5, 6, 7], "stephani": [5, 6, 7], "owain": [5, 6, 7], "mimic": [5, 6, 7], "falsehood": [5, 6, 7], "2109": [5, 6, 7], "07958": [5, 6, 7], "pzwg23": 5, "shishir": 5, "tianjun": 5, "xin": [5, 7], "gorilla": 5, "15334": 5, "pro24": 5, "dev": 5, "ras24": 5, "sebastian": 5, "scratch": 5, "1633437166": 5, "sll": 5, "bhaskarjit": 5, "mingshu": 5, "jingrao": 5, "lyu": 5, "nathalia": 5, "castellano": 5, "pasquali": 5, "dhagash": 5, "12148": 5, "srf": 5, "shivalika": 5, "angelika": 5, "roman": [5, 7], "adelani": 5, "ngui": 5, "vila": 5, "suero": 5, "peerat": 5, "limkonchotiwat": 5, "kelli": 5, "marchisio": 5, "qi": 5, "leong": 5, "yosephin": 5, "susanto": 5, "raymond": [5, 7], "ng": [5, 7], "shayn": 5, "longpr": 5, "ko": 5, "madelin": 5, "antoin": 5, "bosselut": 5, "oh": 5, "leshem": 5, "choshen": 5, "daphn": 5, "ippolito": 5, "enzo": [5, 8], "ferrant": 5, "marzieh": 5, "fadae": 5, "beyza": 5, "ermi": 5, "sara": 5, "hooker": 5, "linguist": [5, 7], "03304": 5, "srr": 5, "aarohi": 5, "abhinav": 5, "rastogi": 5, "abhishek": 5, "rao": 5, "abu": 5, "awal": 5, "shoeb": 5, "abubakar": 5, "abid": [5, 6], "fisch": 5, "santoro": 5, "gupta": 5, "adri\u00e0": 5, "garriga": 5, "alonso": 5, "agnieszka": 5, "kluska": 5, "aitor": 5, "lewkowycz": 5, "akshat": 5, "warstadt": 5, "alexand": [5, 7, 8], "kocurek": 5, "ali": [5, 7], "safaya": 5, "tazarv": 5, "aman": 5, "hussain": 5, "dsouza": 5, "ambros": 5, "slone": 5, "ameet": 5, "rahan": 5, "anantharaman": 5, "ander": 5, "andreassen": 5, "madotto": 5, "santilli": 5, "stuhlm\u00fcller": 5, "la": 5, "lampinen": 5, "angelica": 5, "anh": 5, "vuong": 5, "animesh": 5, "gottardi": 5, "antonio": 5, "norelli": 5, "anu": 5, "venkatesh": 5, "arash": 5, "gholamidavoodi": 5, "arfa": 5, "tabassum": 5, "arul": 5, "menez": 5, "arun": [5, 7], "kirubarajan": 5, "asher": 5, "mullokandov": 5, "ashish": 5, "sabharw": 5, "herrick": 5, "avia": 5, "efrat": 5, "aykut": 5, "erdem": 5, "ayla": 5, "karaka\u015f": 5, "bao": [5, 6, 7], "loe": 5, "barret": [5, 7], "zoph": [5, 7], "bart\u0142omiej": 5, "bojanowski": 5, "batuhan": 5, "\u00f6zyurt": 5, "behnam": 5, "hedayatnia": 5, "neyshabur": 5, "inden": 5, "benno": 5, "stein": 5, "berk": 5, "ekmekci": 5, "blake": 5, "howald": 5, "bryan": 5, "orinion": 5, "diao": 5, "dour": 5, "stinson": 5, "cedrick": 5, "argueta": 5, "c\u00e9sar": 5, "ferri": 5, "ram\u00edrez": 5, "chandan": 5, "charl": 5, "rathkopf": 5, "chenlin": 5, "meng": 5, "chitta": 5, "baral": 5, "chiyu": 5, "callison": 5, "burch": 5, "voigt": 5, "cindi": 5, "ramirez": 5, "clara": 5, "rivera": 5, "clemencia": 5, "siro": 5, "colin": [5, 6], "raffel": [5, 6], "courtnei": 5, "ashcraft": 5, "cristina": 5, "garbacea": 5, "damien": [5, 7], "sileo": 5, "garrett": 5, "kilman": 5, "freeman": 5, "khashabi": 5, "levi": [5, 7], "mosegu\u00ed": 5, "gonz\u00e1lez": 5, "perszyk": 5, "danqi": 5, "dar": 5, "gilboa": 5, "dohan": [5, 7], "drakard": 5, "jurgen": 5, "debajyoti": 5, "datta": 5, "deni": 5, "emelin": 5, "kleyko": 5, "deniz": 5, "yuret": 5, "derek": [5, 7], "tam": [5, 8], "dieuwk": 5, "hupk": 5, "diganta": 5, "dilyar": 5, "buzan": 5, "coelho": 5, "mollo": 5, "diyi": 5, "dylan": 5, "schrader": 5, "ekaterina": 5, "shutova": 5, "ekin": 5, "dogu": 5, "cubuk": 5, "elad": 5, "segal": 5, "eleanor": 5, "hagerman": 5, "donowai": 5, "elli": 5, "pavlick": 5, "rodola": 5, "emma": 5, "lam": 5, "chu": [5, 7], "erkut": 5, "erni": 5, "dyer": 5, "jerzak": 5, "eunic": 5, "engefu": 5, "manyasi": 5, "evgenii": 5, "zheltonozhskii": 5, "fanyu": 5, "fatemeh": 5, "siar": 5, "fernando": 5, "mart\u00ednez": 5, "plume": 5, "francesca": 5, "happ\u00e9": 5, "gaurav": 5, "genta": 5, "indra": 5, "winata": 5, "gerard": 5, "melo": 5, "germ\u00e1n": 5, "kruszewski": 5, "giambattista": [5, 7], "parascandolo": [5, 7], "giorgio": 5, "mariani": 5, "gloria": 5, "gonzalo": 5, "jaimovitch": 5, "l\u00f3pez": 5, "gregor": 5, "betz": 5, "gui": [5, 6], "gur": 5, "hana": 5, "galijasev": 5, "rashkin": 5, "hannaneh": 5, "hajishirzi": 5, "harsh": 5, "hayden": 5, "bogar": 5, "henri": [5, 7], "shevlin": 5, "hinrich": 5, "sch\u00fctze": 5, "hiromu": 5, "yakura": 5, "hongm": 5, "hugh": 5, "mee": 5, "wong": [5, 7], "isaac": 5, "nobl": 5, "jaap": 5, "jumelet": 5, "geissing": 5, "jaehoon": 5, "jaim": 5, "fern\u00e1ndez": 5, "fisac": 5, "simon": 5, "koppel": 5, "koco\u0144": 5, "jana": 5, "thompson": [5, 6, 7], "janel": 5, "wingfield": 5, "jarema": 5, "radom": 5, "jascha": 5, "sohl": [5, 7], "dickstein": 5, "phang": 5, "yosinski": 5, "jekaterina": 5, "novikova": 5, "jell": 5, "bosscher": 5, "jennif": 5, "marsh": 5, "jeroen": 5, "taal": 5, "engel": 5, "jesujoba": 5, "alabi": 5, "jiam": 5, "jillian": 5, "joan": 5, "waweru": 5, "burden": 5, "bali": 5, "batcheld": 5, "berant": 5, "j\u00f6rg": 5, "frohberg": 5, "jo": 5, "rozen": 5, "orallo": 5, "boudeman": 5, "guerr": 5, "tenenbaum": 5, "joyc": 5, "chua": 5, "kanclerz": 5, "karen": 5, "livescu": 5, "karl": 5, "krauth": 5, "karthik": 5, "gopalakrishnan": 5, "katerina": 5, "ignatyeva": 5, "katja": 5, "markert": 5, "kaustubh": 5, "dhole": 5, "gimpel": 5, "omondi": 5, "kori": 5, "mathewson": 5, "kristen": 5, "chiafullo": 5, "ksenia": 5, "shkaruta": 5, "shridhar": 5, "kyle": [5, 7], "mcdonel": 5, "richardson": 5, "laria": 5, "reynold": 5, "leo": [5, 7], "dugan": 5, "lianhui": 5, "lidia": 5, "contrera": 5, "ochando": 5, "morenc": 5, "moschella": 5, "luci": 5, "ludwig": 5, "schmidt": [5, 7], "luheng": 5, "olivero": 5, "col\u00f3n": 5, "metz": [5, 7], "l\u00fctfi": 5, "kerem": 5, "\u015fenel": 5, "maarten": [5, 7], "bosma": 5, "sap": [5, 7], "maartj": 5, "hoev": 5, "maheen": 5, "farooqi": 5, "manaal": 5, "faruqui": 5, "marco": 5, "baturan": 5, "marelli": 5, "maru": 5, "maria": 5, "quintana": 5, "tolkiehn": 5, "mario": [5, 7], "giulianelli": 5, "martha": 5, "potthast": 5, "leavitt": 5, "hagen": 5, "m\u00e1ty\u00e1": 5, "schubert": 5, "medina": [5, 7], "orduna": 5, "baitemirova": 5, "melodi": 5, "arnaud": 5, "melvin": 5, "mcelrath": 5, "yee": 5, "cohen": 5, "ivanitskii": 5, "starritt": 5, "strube": 5, "micha\u0142": 5, "sw\u0119drowski": 5, "michel": [5, 7], "bevilacqua": 5, "mihir": 5, "kale": 5, "cain": 5, "mime": 5, "mitch": 5, "walker": 5, "mo": 5, "tiwari": 5, "mohit": 5, "bansal": 5, "moin": 5, "aminnaseri": 5, "mor": 5, "geva": 5, "mozhdeh": 5, "gheini": 5, "mukund": 5, "varma": 5, "nanyun": 5, "peng": [5, 7], "nayeon": 5, "neta": 5, "krakov": 5, "doiron": 5, "nicol": 5, "martinez": 5, "nikita": 5, "nangia": 5, "nikla": 5, "decker": 5, "muennighoff": 5, "nitish": [5, 7], "shirish": [5, 7], "keskar": [5, 7], "niveditha": 5, "constant": 5, "fiedel": 5, "nuan": 5, "wen": 5, "oliv": [5, 7], "agha": 5, "elbaghdadi": 5, "omer": 5, "moreno": 5, "casar": 5, "parth": 5, "doshi": 5, "pascal": 5, "fung": 5, "pu": 5, "vicol": 5, "pegah": 5, "alipoormolabashi": 5, "peiyuan": 5, "eckerslei": 5, "phu": 5, "mon": 5, "htut": 5, "pinyu": 5, "hwang": 5, "piotr": 5, "mi\u0142kowski": 5, "piyush": 5, "pouya": 5, "pezeshkpour": 5, "priti": 5, "oli": 5, "qiaozhu": 5, "qing": 5, "qinlang": 5, "rabin": 5, "banjad": 5, "rachel": [5, 7], "etta": 5, "rudolph": 5, "raefer": 5, "rahel": 5, "haback": 5, "ramon": 5, "risco": 5, "rapha\u00ebl": 5, "milli\u00e8r": 5, "rhythm": 5, "garg": [5, 6], "rif": 5, "saurou": 5, "riku": 5, "arakawa": 5, "robb": 5, "raymaek": 5, "rohan": 5, "sikand": 5, "novak": 5, "sitelew": 5, "lebra": 5, "rosann": 5, "rowan": [5, 7], "ruslan": 5, "salakhutdinov": 5, "stoval": 5, "teehan": 5, "sahib": 5, "saif": 5, "sajant": 5, "dillav": 5, "shleifer": 5, "wiseman": 5, "gruetter": 5, "schoenholz": 5, "sanghyun": 5, "sanjeev": 5, "kwatra": 5, "sarik": 5, "ghazarian": 5, "sayan": 5, "casei": [5, 7], "bischoff": 5, "gehrmann": 5, "schuster": 5, "sepideh": 5, "sadeghi": 5, "shadi": 5, "hamdan": 5, "sharon": 5, "shashank": 5, "sherri": 5, "shi": 5, "shikhar": 5, "shima": 5, "asaadi": 5, "shubh": 5, "pachchigar": 5, "shubham": 5, "toshniw": 5, "shyam": [5, 7], "upadhyai": 5, "shyamolima": 5, "debnath": 5, "siamak": 5, "shakeri": 5, "thormey": 5, "melzi": 5, "siva": 5, "reddi": 5, "sneha": 5, "priscilla": 5, "makini": 5, "soo": 5, "hwan": 5, "toren": 5, "sriharsha": 5, "hatwar": 5, "stanisla": 5, "dehaen": 5, "stefan": 5, "divic": 5, "stella": 5, "biderman": 5, "stephen": 5, "prasad": 5, "piantadosi": 5, "stuart": [5, 7], "shieber": 5, "summer": [5, 7], "misherghi": 5, "svetlana": 5, "kiritchenko": 5, "swaroop": 5, "tal": 5, "linzen": 5, "tariq": 5, "tatsu": 5, "te": 5, "th\u00e9o": 5, "desbord": 5, "theodor": 5, "rothschild": 5, "phan": [5, 7], "tiberiu": 5, "nkinyili": 5, "timo": 5, "schick": 5, "timofei": 5, "kornev": 5, "titu": 5, "tunduni": 5, "gerstenberg": 5, "trenton": 5, "trishala": 5, "neeraj": 5, "tushar": 5, "khot": 5, "shultz": 5, "uri": 5, "shaham": 5, "vera": 5, "demberg": 5, "victoria": [5, 7], "nyamai": 5, "vika": 5, "raunak": 5, "vinai": 5, "ramasesh": 5, "udai": 5, "prabhu": 5, "vishakh": 5, "padmakumar": 5, "vivek": 5, "srikumar": 5, "fedu": [5, 7], "wout": 5, "vossen": 5, "xiaoyu": 5, "tong": [5, 7], "xinran": 5, "xinyi": 5, "yadollah": 5, "yaghoobzadeh": 5, "yair": 5, "lakretz": 5, "yangqiu": 5, "yasaman": 5, "bahri": 5, "yichi": 5, "yide": 5, "yifu": 5, "yonatan": 5, "belinkov": 5, "yufang": 5, "seid": 5, "zhuoy": 5, "zijian": 5, "ziji": 5, "zirui": 5, "ziyi": 5, "extrapol": 5, "2206": 5, "04615": 5, "wpn": 5, "yada": 5, "pruksachatkun": 5, "amanpreet": 5, "hill": 5, "stickier": 5, "wsm": 5, "1804": 5, "07461": 5, "wtb": 5, "tai": 5, "borgeaud": 5, "dani": 5, "yogatama": 5, "denni": [5, 7], "donald": 5, "metzler": 5, "ed": 5, "oriol": 5, "vinyal": 5, "dean": 5, "07682": 5, "wdr": 5, "doolei": 5, "manlei": 5, "arka": [5, 7], "pal": 5, "feuer": 5, "siddhartha": 5, "ravid": 5, "shwartz": [5, 7], "ziv": 5, "khalid": [5, 6], "saifullah": 5, "siddartha": 5, "naidu": 5, "chinmai": 5, "hegd": 5, "lecun": 5, "goldstein": 5, "willi": 5, "neiswang": 5, "micah": 5, "goldblum": 5, "19314": 5, "yyh": 5, "baosong": [5, 6], "chengpeng": 5, "chengyuan": [5, 6], "fei": [5, 6], "guant": 5, "haoran": [5, 6], "huan": [5, 6], "jialong": 5, "jialin": 5, "jianhong": [5, 6], "tu": [5, 6], "jianwei": [5, 6], "jianxin": [5, 6], "jin": [5, 7], "jingren": [5, 6], "jinz": 5, "jinzheng": 5, "junyang": [5, 6], "keme": [5, 6], "keqin": [5, 6], "kexin": [5, 6], "mingfeng": [5, 6], "xue": [5, 6, 7], "ni": 5, "pei": [5, 6], "ru": 5, "men": [5, 6], "ruiz": 5, "runji": [5, 6], "shiji": 5, "sinan": 5, "tianhang": 5, "wenbin": 5, "ge": 5, "xiaodong": 5, "deng": 5, "xiaohuan": 5, "xingzhang": [5, 6], "xinyu": [5, 7], "xipin": 5, "xuancheng": [5, 6], "yichang": [5, 6], "wan": [5, 6], "yunfei": 5, "yuqiong": [5, 6], "zhenru": [5, 6], "zhihao": 5, "10671": 5, "zcl24": 5, "zhihan": 5, "cao": 5, "lizi": 5, "openreview": 5, "forum": 5, "aegrf1uy0p": 5, "zc": 5, "siyuan": 5, "zhuang": [5, 7], "zhanghao": 5, "yonghao": 5, "zi": 5, "zhuohan": 5, "xing": [5, 7], "2306": 5, "05685": 5, "huggingface24": 5, "metaai24": 5, "di": 6, "hunter": 6, "photo": 6, "email": 6, "hipaa": 6, "properti": [6, 7], "gdpr": 6, "iot": 6, "unreli": 6, "impract": 6, "slm": 6, "viabl": 6, "sensor": 6, "interconnect": 6, "frontend": 6, "garner": 6, "traction": 6, "yourself": 6, "aw": [6, 7], "bedrock": 6, "sambanova": 6, "sla": 6, "veloc": 6, "roadmap": 6, "commodit": 6, "winner": 6, "loser": 6, "condens": 6, "clean": 6, "2024t": 6, "versatil": 6, "72b": 6, "med": 6, "bloomberggpt": 6, "underw": 6, "adept": 6, "toxigen": 6, "alnajjar": 6, "13b": [6, 7], "01": 6, "outperform": 6, "32b": 6, "feasibl": 6, "2m": 6, "unstructur": [6, 8], "modal": 6, "diagnosi": 6, "patient": 6, "necessit": 6, "flagship": 6, "405b": 6, "gemini": 6, "pack": 6, "cautious": 6, "isol": [6, 7], "cpot": 6, "cpit": 6, "tco": 6, "tpot": 6, "ttft": 6, "gpqa": 6, "ratio": 6, "median": 6, "afford": 6, "lite": 6, "micro": 6, "encod": [6, 7, 8], "cent": 6, "1m": 6, "cheapest": 6, "phi": 6, "half": [6, 7], "permiss": [6, 7], "apach": 6, "microsoft": 6, "simpler": [6, 7, 8], "fewer": [6, 7], "700m": 6, "100m": 6, "gemma": [6, 8], "deepseek": 6, "v2": [6, 7], "grown": 6, "withdraw": 6, "incomplet": [6, 7], "preprocess": [6, 8], "unclear": 6, "15t": 6, "8t": 6, "fineweb": 6, "penedo": 6, "96": [6, 7], "crawl": 6, "snapshot": 6, "codebas": 6, "ablat": 6, "vital": [6, 7], "favorit": 6, "spawn": 6, "ultrachat": 6, "2024u": 6, "created_job": 6, "fine_tun": 6, "training_fil": 6, "file_id": 6, "ultrachat_chunk_train": 6, "validation_fil": 6, "ultrachat_chunk_ev": 6, "training_step": 6, "0001": 6, "auto_start": 6, "job_id": 6, "toolkit": [6, 7], "sft": 6, "nemo": [6, 7], "codestr": 6, "2024v": 6, "enough": 6, "despit": [6, 8], "rewrit": 6, "smolvlm": 6, "mlx": [6, 8], "mlc": 6, "peft": 6, "programm": 6, "graphic": [6, 7], "vram": 6, "mathbf": 6, "x_1": [6, 8], "x_2": [6, 8], "x_n": [6, 8], "x_": [6, 8], "\u03b8": 6, "matrix": [6, 7], "cerebra": 6, "mozilla": 6, "docker": 6, "gerganov": 6, "georgi": 6, "hundr": 6, "overwhelm": [6, 8], "manifesto": 6, "enjoy": 6, "bog": 6, "exploratori": 6, "hacker": 6, "Will": [6, 7], "prototyp": 6, "prematur": 6, "besid": 6, "lighter": 6, "sacrific": 6, "unifi": [6, 8], "ggml": [6, 8], "ibm": [6, 7], "metadata": 6, "disk": 6, "backward": 6, "2024x": 6, "repo": 6, "easier": [6, 7, 8], "compil": 6, "linux": 6, "argument": [6, 7, 8], "sudo": 6, "apt": 6, "cmake": 6, "bind": 6, "betlen": 6, "cnv": 6, "llamacpp": 6, "succinct": 6, "ctrl": 6, "interject": 6, "philosoph": 6, "debat": 6, "fulfil": 6, "happi": 6, "responsibli": 6, "bye": 6, "goodby": 6, "port": 6, "127": 6, "curl": [6, 8], "localhost": 6, "v1": [6, 7], "bearer": 6, "finish_reason": 6, "deepli": 6, "1734627879": 6, "completion_token": 6, "total_token": 6, "chatcmpl": 6, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 6, "prompt_n": 6, "prompt_m": 6, "132": 6, "prompt_per_token_m": 6, "prompt_per_second": 6, "77619878666999": 6, "predicted_n": 6, "predicted_m": 6, "1700": 6, "654": [6, 8], "predicted_per_token_m": 6, "36882142857143": 6, "predicted_per_second": 6, "92850867960208": 6, "gbnf": [6, 8], "8pm": 6, "appointmenttim": 6, "appointmentdetail": 6, "handi": 6, "model_path": 6, "llama_cpp": 6, "create_chat_complet": 6, "occupi": 6, "activist": 6, "justin": [6, 7], "tunnei": 6, "ocho": 6, "appach": 6, "cosmopolitan": 6, "libc": 6, "portabl": 6, "durabl": 6, "usabl": [6, 7, 8], "tinyllama": 6, "wget": 6, "jartin": 6, "q5_k_m": 6, "renam": 6, "ex": 6, "chmod": 6, "nobrows": 6, "registri": 6, "nativ": [6, 8], "container": 6, "trai": 6, "familiar": 6, "bare": 6, "ssfl": 6, "sh": [6, 8], "Or": 6, "11434": 6, "chatrespons": 6, "easiest": 6, "rich": [6, 7], "playground": 6, "simultan": [6, 7], "verif": [6, 8], "importantli": [6, 8], "intuit": 6, "beginn": 6, "tensorrt": 6, "trt": 6, "latex": 6, "voic": 6, "pwa": 6, "medium": [6, 7, 8], "gpt4all": 6, "rbac": 6, "q4_k": 6, "q6_k": 6, "mib": 6, "wikitext": 6, "salesforc": 6, "wikipedia": [6, 8], "min_prompt_length": 6, "input_texts_raw": 6, "2010": 6, "valkyria": 6, "chronicl": 6, "forgiv": 6, "newcom": 6, "raita": 6, "honjou": 6, "compos": [6, 7], "hitoshi": 6, "sakimoto": 6, "takeshi": 6, "ozawa": 6, "writer": 6, "theme": [6, 7], "sung": 6, "escap": 6, "escaped_text": 6, "block_scal": 6, "block": [6, 7], "parenthes": 6, "block_min": 6, "formula": 6, "superblock": 6, "5625": 6, "ieee": 6, "754": 6, "ppl": 6, "exp": 6, "sum_": 6, "log_2": 6, "x_i": [6, 8], "avg": 6, "_i": 6, "corr": 6, "ln": [6, 8], "kullback": 6, "leibler": 6, "entropi": 6, "logit": 6, "d_": 6, "softmax": [6, 8], "sum": 6, "kld": 6, "q2_kresult": 6, "q6": 6, "004": 6, "q2": 6, "112": 6, "q4": 6, "smallest": 6, "390": 6, "67": [6, 7], "81": [6, 7], "93": [6, 7], "462": 6, "614": 6, "170": 6, "q4_k_m": 6, "thread": 6, "16x": 6, "85x": 6, "79x": 6, "ubuntu": 6, "lt": 6, "x86_64": 6, "gnu": 6, "thank": [6, 8], "intel": 6, "i7": 6, "8550u": 6, "15gib": 6, "samsung": 6, "ssd": 6, "970": 6, "evo": 6, "500gb": 6, "1170": 6, "meant": 6, "ai4c": 6, "ai4a": 6, "paperswithcod": [6, 7], "ana24a": 6, "leaderboard": [6, 7], "artificialanalysi": 6, "ana24b": 6, "ana24c": 6, "bc24": 6, "andrei": [6, 7], "abetlen": 6, "fac4": 6, "optimum": 6, "concept_guid": 6, "fac4t": 6, "fac4u": 6, "200k": 6, "ultrachat_200k": 6, "fac4v": 6, "blogpost": 6, "gc24": 6, "ggerganov": [6, 8], "blob": [6, 8], "readm": [6, 8], "gc4a": 6, "gc4b": 6, "pka": 6, "guilherm": 6, "hynek": 6, "kydl\u00ed\u010dek": 6, "decant": 6, "finest": 6, "17557": 6, "qwe4b": 6, "qy": 6, "beichen": 6, "tingyu": 6, "zihan": 6, "qiu": 6, "15115": 6, "rev24": 6, "harvard": 6, "nyt": 6, "harvardlawreview": 6, "timess": 6, "zwa": 6, "wael": 6, "geoffrei": [6, 7], "angu": 6, "arnav": 6, "jefferi": 6, "kinnison": 6, "sherstinski": 6, "piero": 6, "molino": 6, "travi": 6, "addair": 6, "devvret": 6, "310": 6, "2405": 6, "00732": 6, "huggingface4xa": 6, "huggingface4xb": 6, "ibmthink24": 6, "lmstudio24": 6, "lmstudio": 6, "metaai4c": 6, "mozillaocho24": 6, "salesforce24": 6, "immens": 7, "commonplac": 7, "hartvigsen": 7, "societi": 7, "statement": 7, "alarm": 7, "openli": 7, "dolli": 7, "llama2": [7, 8], "emb": 7, "generalist": 7, "injustic": 7, "inequ": 7, "undermin": 7, "perpetu": 7, "displac": 7, "eros": 7, "fake": 7, "deepfak": 7, "distrust": 7, "cyberattack": 7, "spread": 7, "disinform": 7, "inadvert": 7, "interven": 7, "irrevers": 7, "uncheck": 7, "extinct": 7, "race": 7, "incentiv": 7, "shortcut": 7, "behind": 7, "stress": 7, "urgent": 7, "reorient": 7, "birth": 7, "siam": 7, "edgington": 7, "jailbreak": 7, "promptcraft": 7, "stealth": 7, "sutton": 7, "subtl": 7, "subtleti": 7, "exception": 7, "phrase": 7, "evad": 7, "hqve": 7, "frer": 7, "hplidai": 7, "pl": 7, "hyperion": 7, "coast": 7, "redwood": 7, "tallest": 7, "tree": [7, 8], "routin": 7, "prejudic": 7, "gallego": 7, "leak": 7, "poison": 7, "intention": 7, "inject": 7, "mislead": 7, "exabeam": 7, "finra": 7, "3110": 7, "mandat": 7, "supervisori": 7, "unicef": 7, "empow": 7, "contest": 7, "congress": 7, "enact": 7, "pictur": [7, 8], "territori": 7, "oversea": 7, "chines": 7, "legitim": 7, "consent": 7, "complaint": 7, "cooper": 7, "extraterritori": 7, "offshor": 7, "draft": 7, "voluntari": 7, "neutral": 7, "player": 7, "prepared": 7, "compris": 7, "cbrn": 7, "persuas": 7, "autonomi": 7, "gradat": 7, "scorecard": 7, "elig": 7, "advisori": 7, "sag": 7, "shut": 7, "prerequisit": 7, "harden": 7, "asl": 7, "biosafeti": 7, "elev": 7, "warn": [7, 8], "bioweapon": 7, "compartment": 7, "difficulti": 7, "4x": 7, "jump": 7, "paus": 7, "deepmind": 7, "biosecur": 7, "buffer": 7, "formul": [7, 8], "calibr": 7, "promin": 7, "taxonomi": 7, "llamaguard": 7, "20241022": 7, "3x": 7, "5x": 7, "alaga": 7, "substandard": 7, "oxford": 7, "wachter": 7, "blur": 7, "ill": 7, "stifl": 7, "suscept": 7, "aadc": 7, "outset": 7, "curricula": 7, "adversari": 7, "uncov": [7, 8], "appar": 7, "thoroughli": 7, "lm": [7, 8], "problemat": 7, "undergo": 7, "280b": 7, "cai": [7, 8], "utilis": 7, "minimis": 7, "enshrin": 7, "evas": 7, "resort": 7, "avenu": 7, "cambria": 7, "inherit": 7, "influenti": 7, "debias": 7, "occurr": 7, "phish": 7, "clarifi": 7, "toler": 7, "checklist": 7, "abus": 7, "ux": 7, "architect": 7, "diagram": 7, "retrofit": 7, "promptli": 7, "dashboard": 7, "misalign": 7, "star": 7, "postpon": 7, "combat": 7, "counter": 7, "traffic": 7, "frustrat": 7, "workaround": 7, "silo": 7, "hierarch": 7, "hierarchi": 7, "66": 7, "depth": 7, "mcq": 7, "regex": [7, 8], "joint": 7, "facet": 7, "purpl": 7, "circl": 7, "opensafetylab": 7, "salad_bench_dataset": 7, "base_set": 7, "gptfuzzer": 7, "auto": [7, 8], "qid": 7, "o1": 7, "supremaci": 7, "o53": 7, "o14": 7, "o5": 7, "o65": 7, "plagiar": 7, "o16": 7, "o6": 7, "o47": 7, "campaign": 7, "o12": 7, "o52": 7, "surveil": 7, "spous": 7, "know": [7, 8], "o13": 7, "breakdown": 7, "ncount": 7, "21318": 7, "8756": 7, "6486": 7, "o2": 7, "1717": 7, "o4": 7, "1477": 7, "o3": 7, "socioeconom": 7, "851": 7, "int64": 7, "gen": 7, "15433": 7, "hh": 7, "4184": 7, "659": 7, "advbench": 7, "230": 7, "189": 7, "toxicchat": 7, "anyth": 7, "817": 7, "misconcept": 7, "ingrain": 7, "mc1": 7, "singular": 7, "choices4": 7, "mc2": 7, "set4": 7, "scorer": 7, "correctli": [7, 8], "truthful_qa": 7, "truthfulqa_dataset": 7, "multiple_choic": 7, "best_answ": 7, "correct_answ": 7, "incorrect_answ": 7, "watermelon": 7, "digest": 7, "noth": 7, "stomach": 7, "sick": 7, "wonderopoli": 7, "wonder": 7, "belli": 7, "swallow": 7, "dream": 7, "die": 7, "indigest": 7, "unconsci": 7, "excret": 7, "asr": 7, "r2d2": 7, "wider": [7, 8], "mass": 7, "destruct": 7, "asynchron": 7, "webpurifi": 7, "protectai": 7, "comprehend": 7, "amazon": 7, "nvidia": [7, 8], "keyword": 7, "toolset": 7, "nemmo": 7, "synchron": 7, "nemoguardrail": 7, "llmrail": 7, "railsconfig": 7, "from_path": 7, "rail": 7, "hello": 7, "ministr": 7, "mistralai": 7, "mistral_api_kei": 7, "moderate_chat": 7, "omni": 7, "pprint": 7, "to_json": 7, "threaten": 7, "illicit": 7, "granit": 7, "guardian": 7, "consortium": 7, "v3": 7, "11b": 7, "begin_of_text": 7, "start_header_id": 7, "end_header_id": 7, "unsafe_categori": 7, "user_message_1": 7, "model_answer_1": 7, "comma": 7, "eot_id": 7, "eom_id": 7, "denot": 7, "s1": 7, "s2": 7, "s3": 7, "s4": 7, "s5": 7, "defam": 7, "s6": 7, "s7": 7, "s8": 7, "s9": 7, "s10": 7, "s11": 7, "s12": 7, "s13": 7, "atla": 7, "2b": 7, "hap": 7, "38m": 7, "125m": 7, "padhi": 7, "shieldgemma": 7, "judge_prompt": 7, "american": 7, "vandal": 7, "underag": 7, "drink": 7, "vulgar": 7, "obscen": 7, "racism": 7, "derogatori": 7, "firearm": 7, "safety_scor": 7, "IN": 7, "borderlin": 7, "verdict": 7, "boolean": [7, 8], "brief": 7, "rational": 7, "delimit": 7, "paramount": [7, 8], "evenli": 7, "good_sampl": 7, "bad_sampl": 7, "2024z": 7, "surg": 7, "scam": 7, "get_profanity_sampl": 7, "show_stat": 7, "bool": 7, "current_dir": 7, "getcwd": 7, "data_path": 7, "profanity_en": 7, "random_st": 7, "ncategori": 7, "category_count": 7, "category_1": 7, "1f": 7, "profanity_sampl": 7, "nsampl": 7, "anatomi": 7, "slur": 7, "182": 7, "bodili": 7, "fluid": 7, "excrement": 7, "insult": 7, "mental": 7, "disabl": 7, "jap": 7, "babi": [7, 8], "batter": 7, "crazi": 7, "sob": 7, "fukka": 7, "sh1t3": 7, "get_salad_sampl": 7, "salad_sampl": 7, "tortur": 7, "porn": 7, "sin": 7, "sight": 7, "god": 7, "embezzl": 7, "xanax": 7, "alcohol": 7, "get_good_sampl": 7, "min_scor": 7, "reichstag": 7, "profanity_data": 7, "salad_data": 7, "good_data": 7, "all_data": 7, "prompt_sampl": 7, "is_unsaf": 7, "counti": 7, "holli": 7, "ridg": 7, "nc": 7, "town": 7, "onslow": 7, "carolina": 7, "diver": 7, "underwat": 7, "maze": 7, "coral": 7, "treasur": 7, "vivid": 7, "sensori": 7, "emot": 7, "labyrinthin": 7, "passag": 7, "reef": 7, "suspens": 7, "obstacl": 7, "creatur": 7, "nomin": 7, "nobel": 7, "literatur": 7, "love": 7, "ny": [7, 8], "logo": 7, "thief": 7, "rob": 7, "famou": 7, "nstatist": 7, "source_stat": 7, "type_stat": 7, "tiktoken": 7, "plug": 7, "safetyvalid": 7, "validationresult": 7, "dataclass": 7, "abstractmethod": 7, "llmguardvalid": 7, "scanner": 7, "bantop": 7, "llm_guard": 7, "input_scann": 7, "scan_prompt": 7, "matchtyp": 7, "default_banned_top": 7, "banned_top": 7, "super": 7, "banned_topics_scann": 7, "use_onnx": 7, "toxicity_scann": 7, "match_typ": 7, "fail_fast": 7, "unsafe_scann": 7, "gun": 7, "cool": 7, "hunt": 7, "deer": 7, "dad": 7, "mistralvalid": 7, "hate_and_discrimin": 7, "violence_and_threat": 7, "dangerous_and_criminal_cont": 7, "selfharm": 7, "openaivalid": 7, "attr": 7, "dir": 7, "getattr": 7, "illicit_viol": 7, "llmjudgevalid": 7, "prompt_path": 7, "llmjudg": 7, "filenotfounderror": 7, "elaps": 7, "score_valid": 7, "id_": 7, "validator_inst": 7, "start_tim": 7, "validation_result": 7, "elapsed_tim": 7, "prompt_sample_id": 7, "validator_nam": 7, "scoring_prompt": 7, "scoring_result": 7, "01536": 7, "34098": 7, "497136": 7, "546416": 7, "calculate_validator_metr": 7, "bad_sourc": 7, "good_sourc": 7, "tpr": 7, "fpr": 7, "f1_score": 7, "tn": 7, "fp": 7, "fn": 7, "tp": 7, "255": 7, "74": 7, "238": 7, "130": 7, "256": 7, "79": 7, "149": 7, "213": 7, "190": 7, "235": 7, "63": 7, "222": 7, "480": 7, "157": 7, "487": 7, "495": 7, "482": 7, "667": 7, "248": 7, "466": 7, "143": 7, "355": 7, "slowest": 7, "room": 7, "false_posit": 7, "left_on": 7, "right_on": 7, "persian": 7, "teenag": 7, "\u0686\u0627\u0644\u0634": 7, "\u0645\u0648\u0645\u0648": 7, "\u06cc\u06a9": 7, "\u062d\u0642\u0647": 7, "\u0648": 7, "\u0627\u0641\u0633\u0627\u0646\u0647": 7, "\u0645\u062d\u0644\u06cc": 7, "\u0627\u0633\u062a": 7, "\u06a9\u0647": 7, "\u0634\u0627\u06cc\u0639\u0647": 7, "\u0622\u0646": 7, "\u062f\u0631": 7, "\u0634\u0628\u06a9\u0647": 7, "\u0647\u0627\u06cc": 7, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 7, "\u067e\u062e\u0634": 7, "\u0634\u062f": 7, "\u06af\u0632\u0627\u0631\u0634": 7, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 7, "\u0628\u0627": 7, "\u0646\u0627\u0645": 7, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 7, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 7, "\u0631\u0627": 7, "\u0628\u0647": 7, "\u0627\u0639\u0645\u0627\u0644": 7, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 7, "\u0645\u0627\u0646\u0646\u062f": 7, "\u062e\u0648\u062f\u0632\u0646\u06cc": 7, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 7, "\u062a\u0631\u063a\u06cc\u0628": 7, "\u0645\u06cc": 7, "\u06a9\u0646\u062f": 7, "\u0648\u062c\u0648\u062f": 7, "\u0635\u062d\u0628\u062a": 7, "\u0632\u06cc\u0627\u062f\u06cc": 7, "\u0645\u0648\u0631\u062f": 7, "\u0627\u06cc\u0646": 7, "\u062a\u0639\u062f\u0627\u062f": 7, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 7, "\u0648\u0627\u0642\u0639\u06cc": 7, "\u0628\u0633\u06cc\u0627\u0631": 7, "\u06a9\u0645": 7, "\u0628\u0648\u062f": 7, "\u067e\u0644\u06cc\u0633": 7, "\u0635\u062f\u0645\u0647": 7, "\u062f\u06cc\u062f\u0646": 7, "\u062f\u0644\u06cc\u0644": 7, "\u062a\u0623\u062b\u06cc\u0631": 7, "\u0645\u0633\u062a\u0642\u06cc\u0645": 7, "\u067e\u062f\u06cc\u062f\u0647": 7, "\u062a\u0623\u06cc\u06cc\u062f": 7, "\u0646\u06a9\u0631\u062f\u0647": 7, "\u062a\u0631\u0633": 7, "\u0646\u06af\u0631\u0627\u0646\u06cc": 7, "\u0627\u06cc\u062c\u0627\u062f": 7, "\u0634\u062f\u0647": 7, "\u0628\u06cc\u0634\u062a\u0631": 7, "\u0627\u0632": 7, "\u062e\u0648\u062f": 7, "\u0631\u0633\u0627\u0646\u0647": 7, "\u0647\u0627": 7, "\u0637\u0648\u0631\u06cc": 7, "\u062e\u06cc\u0631\u06cc\u0647": 7, "\u0647\u0634\u062f\u0627\u0631": 7, "\u062f\u0627\u062f\u0646\u062f": 7, "\u0622\u0633\u06cc\u0628": 7, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 7, "\u0645\u062d\u062a\u0648\u0627\u06cc": 7, "\u062e\u0634\u0648\u0646\u062a": 7, "\u0622\u0645\u06cc\u0632": 7, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 7, "\u06af\u0641\u062a\u0647": 7, "\u0634\u0648\u062f": 7, "\u0627\u0648\u0644\u06cc\u0646": 7, "\u0628\u0627\u0631": 7, "\u0633\u0627\u0644": 7, "\u06f2\u06f0\u06f1\u06f8": 7, "\u067e\u0633": 7, "\u0622\u0646\u06a9\u0647": 7, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 7, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 7, "\u062e\u0628\u0631": 7, "\u062f\u062e\u062a\u0631": 7, "\u06f1\u06f2": 7, "\u0633\u0627\u0644\u0647": 7, "\u062f\u0627\u062f": 7, "\u0645\u0648\u0636\u0648\u0639": 7, "\u062c\u0647\u0627\u0646\u06cc": 7, "\u062a\u0628\u062f\u06cc\u0644": 7, "\u0645\u062c\u0633\u0645\u0647": 7, "\u0647\u0646\u0631\u0645\u0646\u062f": 7, "\u0698\u0627\u067e\u0646\u06cc": 7, "\u0647\u0631": 7, "\u0686\u0646\u062f": 7, "\u0634\u0627\u06cc\u062f": 7, "\u0646\u06af\u0627\u0647": 7, "\u0628\u0639\u0636\u06cc": 7, "\u0632\u06cc\u0628\u0627": 7, "\u0646\u0628\u0627\u0634\u062f": 7, "\u0627\u0645\u0627": 7, "\u06a9\u0627\u0645\u0644\u0627": 7, "\u0628\u06cc": 7, "\u062e\u0637\u0631": 7, "\u0627\u06cc\u0631\u0627\u0646": 7, "\u0645\u062f\u062a": 7, "\u0628\u06cc\u0646": 7, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 7, "\u0645\u0637\u0631\u062d": 7, "\u0633\u0627\u0644\u06cc": 7, "\u0633\u0631\u0627\u0633\u0631": 7, "\u062c\u0647\u0627\u0646": 7, "\u0645\u0634\u0627\u0628\u0647\u06cc": 7, "\u0628\u0631\u0627\u06cc": 7, "\u0648\u0627\u0644\u062f\u06cc\u0646": 7, "\u06a9\u0631\u062f\u0647": 7, "\u0627\u0641\u0631\u0627\u062f": 7, "\u0686\u0647": 7, "\u06a9\u0627\u0631\u06cc": 7, "\u062f\u0639\u0648\u062a": 7, "tourist": 7, "distress": 7, "polish": 7, "galician": 7, "dzisiaj": 7, "szwecji": 7, "innych": 7, "bogatych": 7, "krajach": 7, "ludzi": 7, "u\u017cywaj\u0105": 7, "mn\u00f3stwo": 7, "najr\u00f3\u017cniejszych": 7, "urz\u0105dze\u0144": 7, "hox": 7, "suecia": 7, "outro": 7, "pa\u00eds": 7, "rico": 7, "xent": 7, "usa": [7, 8], "moita": 7, "m\u00e1quina": 7, "diferent": 7, "\u0142\u00f3dka": 7, "zaczyna": 7, "ton\u0105\u0107": 7, "tury\u015bci": 7, "wracaj\u0105": 7, "statek": 7, "dom\u00f3w": 7, "gdzie": 7, "opowiadaj\u0105": 7, "tym": 7, "jak": 7, "zostali": 7, "zaatakowani": 7, "surprisingli": 7, "unsettl": 7, "paradox": 7, "harbor": 7, "wisdom": 7, "aspir": 7, "technologist": 7, "disciplinari": 7, "ethicist": 7, "policymak": 7, "ai24": 7, "asa24": 7, "jide": 7, "jona": 7, "schuett": 7, "marku": 7, "anderljung": 7, "08751": 7, "bhy": 7, "hinton": 7, "pieter": 7, "abbeel": 7, "trevor": 7, "darrel": 7, "yuval": 7, "harari": 7, "ya": 7, "lan": 7, "shai": 7, "shalev": 7, "gillian": 7, "hadfield": 7, "clune": 7, "tegan": 7, "maharaj": 7, "hutter": 7, "at\u0131l\u0131m": 7, "g\u00fcne\u015f": 7, "baydin": 7, "sheila": 7, "mcilraith": 7, "qiqi": 7, "ashwin": 7, "acharya": 7, "anca": 7, "dragan": 7, "philip": 7, "torr": 7, "russel": 7, "kahneman": 7, "s\u00f6ren": 7, "mindermann": 7, "amid": 7, "384": 7, "6698": 7, "1126": 7, "adn0117": 7, "bbc": 7, "emili": 7, "braca": 7, "israel": 7, "carter": 7, "hafsa": 7, "kanchwala": 7, "khojasteh": 7, "charli": 7, "landow": 7, "luo": 7, "magarelli": 7, "mirin": 7, "averi": 7, "moyer": 7, "kayla": 7, "simpson": 7, "amelia": 7, "skawinski": 7, "heverin": 7, "23308": 7, "bmc": 7, "dillon": 7, "brendan": 7, "murphi": 7, "khachaturov": 7, "gleav": 7, "kellin": 7, "pelrin": 7, "2408": [7, 8], "02946": 7, "cmm": 7, "erik": 7, "lorenzo": 7, "malandri": 7, "fabio": 7, "mercorio": 7, "navid": 7, "nobani": 7, "seveso": 7, "15248": 7, "edg24": 7, "exa24": 7, "cyber": 7, "grb": 7, "rossi": 7, "barrow": 7, "mehrab": 7, "tanjim": 7, "sungchul": 7, "franck": 7, "dernoncourt": 7, "ruiyi": 7, "nesreen": 7, "2309": 7, "00770": 7, "h44z": 7, "hgp": 7, "saadia": 7, "hamid": 7, "palangi": 7, "dipankar": 7, "ec": 7, "kamar": 7, "oxi": 7, "smaranda": 7, "muresan": 7, "preslav": 7, "nakov": 7, "alin": 7, "villavicencio": 7, "editor": 7, "60th": 7, "3309": 7, "3326": 7, "dublin": 7, "aclanthologi": 7, "acl": 7, "18653": 7, "hym": 7, "weijiang": 7, "weitao": 7, "weihong": 7, "zhangyin": 7, "haotian": 7, "qianglong": 7, "weihua": 7, "xiaocheng": 7, "bing": 7, "dx": 7, "1145": [7, 8], "3703155": 7, "ldw": 7, "lijun": 7, "ruohui": 7, "xuhao": 7, "wangmeng": 7, "zuo": 7, "dahua": 7, "qiao": 7, "shao": 7, "05044": 7, "mpy": 7, "xuwang": 7, "zifan": 7, "norman": 7, "mu": 7, "elham": 7, "sakhae": 7, "nathaniel": 7, "forsyth": 7, "04249": 7, "mlc24": 7, "illumin": 7, "ailumin": 7, "oaa": 7, "adler": 7, "ahmad": 7, "ilg": 7, "akkaya": 7, "florencia": 7, "leoni": 7, "aleman": 7, "janko": 7, "altenschmidt": 7, "altman": 7, "shyamal": 7, "anadkat": 7, "avila": 7, "valeri": 7, "balcom": 7, "baltescu": 7, "haim": 7, "belgum": 7, "irwan": 7, "bello": 7, "jake": 7, "berdin": 7, "bernadett": 7, "shapiro": 7, "berner": 7, "lenni": 7, "bogdonoff": 7, "boiko": 7, "madelain": 7, "boyd": 7, "luisa": 7, "brakman": 7, "button": 7, "rosi": 7, "campbel": 7, "cann": 7, "brittani": 7, "carei": 7, "carlson": 7, "rori": 7, "carmichael": 7, "che": 7, "foti": 7, "sulli": 7, "rubi": 7, "chess": 7, "chester": 7, "cho": 7, "hyung": 7, "won": 7, "chung": 7, "jeremiah": 7, "currier": 7, "yunx": 7, "cori": 7, "decareaux": 7, "degri": 7, "deutsch": 7, "devil": 7, "dhar": 7, "steve": 7, "dowl": 7, "dun": 7, "adrien": 7, "ecoffet": 7, "atti": 7, "eleti": 7, "tyna": 7, "elound": 7, "farhi": 7, "niko": 7, "sim\u00f3n": 7, "posada": 7, "fishman": 7, "juston": 7, "isabella": 7, "fulford": 7, "georg": 7, "gibson": 7, "vik": 7, "tarun": 7, "gogineni": 7, "goh": 7, "rapha": 7, "gontijo": 7, "lope": 7, "gordon": 7, "morgan": 7, "grafstein": 7, "yufei": 7, "guo": 7, "hallaci": 7, "heaton": 7, "johann": 7, "heideck": 7, "hickei": 7, "wade": 7, "hoeschel": 7, "houghton": 7, "kenni": 7, "hsu": 7, "shengli": 7, "joost": 7, "huizinga": 7, "shawn": 7, "joann": 7, "jang": 7, "roger": 7, "haozhun": 7, "shino": 7, "jomoto": 7, "billi": 7, "jonn": 7, "tomer": 7, "kaftan": 7, "\u0142ukasz": 7, "kamali": 7, "ingmar": 7, "kanitscheid": 7, "tabarak": 7, "khan": 7, "logan": 7, "kilpatrick": 7, "jong": 7, "wook": 7, "christina": 7, "yongjik": 7, "hendrik": 7, "kirchner": 7, "kiro": 7, "matt": 7, "kokotajlo": 7, "kondraciuk": 7, "kondrich": 7, "konstantinidi": 7, "kosic": 7, "vishal": 7, "kuo": 7, "lamp": 7, "ikai": 7, "teddi": 7, "jade": 7, "leung": 7, "chak": 7, "ming": 7, "lim": 7, "molli": 7, "mateusz": 7, "litwin": 7, "theresa": 7, "lopez": 7, "patricia": 7, "lue": 7, "makanju": 7, "malfacini": 7, "markov": 7, "yaniv": 7, "markovski": 7, "bianca": 7, "mayn": 7, "mckinnei": 7, "christin": 7, "mcleavei": 7, "mcmillan": 7, "mcneil": 7, "aalok": 7, "menick": 7, "mishchenko": 7, "vinni": 7, "monaco": 7, "murk": 7, "m\u00e9ly": 7, "ashvin": 7, "nair": 7, "reiichiro": 7, "nakano": 7, "rajeev": 7, "nayak": 7, "arvind": 7, "neelakantan": 7, "hyeonwoo": 7, "noh": 7, "keef": 7, "jakub": 7, "pachocki": 7, "palermo": 7, "ashlei": 7, "pantuliano": 7, "parish": 7, "emi": 7, "parparita": 7, "passo": 7, "perelman": 7, "belbut": 7, "pere": 7, "pokorni": 7, "pokrass": 7, "vitchyr": 7, "pong": 7, "tolli": 7, "powel": 7, "bori": 7, "proehl": 7, "rae": 7, "ramesh": 7, "franci": 7, "kendra": 7, "rimbach": 7, "carl": 7, "rotst": 7, "roussez": 7, "saltarelli": 7, "ted": 7, "sander": 7, "schnurr": 7, "selsam": 7, "kyla": 7, "sheppard": 7, "toki": 7, "sherbakov": 7, "shieh": 7, "shoker": 7, "pranav": 7, "szymon": 7, "sidor": 7, "sigler": 7, "sitkin": 7, "sokolowski": 7, "natali": 7, "staudach": 7, "madelein": 7, "tootoonchian": 7, "tseng": 7, "preston": 7, "tuggl": 7, "turlei": 7, "juan": 7, "cer\u00f3n": 7, "urib": 7, "vallon": 7, "vijayvergiya": 7, "jai": 7, "alvin": 7, "ward": 7, "cj": 7, "weinmann": 7, "akila": 7, "welihinda": 7, "jiayi": 7, "weng": 7, "lilian": 7, "wiethoff": 7, "willner": 7, "wolrich": 7, "lauren": 7, "workman": 7, "sherwin": 7, "yoo": 7, "zeller": 7, "shengjia": 7, "juntang": 7, "zhuk": 7, "2303": 7, "08774": 7, "pnc": 7, "inkit": 7, "manish": 7, "nagireddi": 7, "giandomenico": 7, "cornacchia": 7, "subhajit": 7, "chaudhuri": 7, "tejaswini": 7, "pedapati": 7, "pierr": 7, "dognin": 7, "keerthiram": 7, "murugesan": 7, "miehl": 7, "santill\u00e1n": 7, "kieran": 7, "giulio": 7, "zizzo": 7, "muhammad": 7, "zaid": 7, "hame": 7, "purcel": 7, "desmond": 7, "pan": 7, "ing": 7, "vejsbjerg": 7, "dali": 7, "hind": 7, "werner": 7, "geyer": 7, "ambrish": 7, "rawat": 7, "kush": 7, "varshnei": 7, "prasanna": 7, "sattigeri": 7, "07724": 7, "saffron": 7, "ring": 7, "aslanid": 7, "glaes": 7, "nat": 7, "mcalees": 7, "irv": 7, "2202": 7, "03286": 7, "szw": 7, "qinghua": 7, "higham": 7, "gorban": 7, "bastouni": 7, "ivan": 7, "tyukin": 7, "12670": 7, "vsk": 7, "simplesafetytest": 7, "2311": 7, "08370": 7, "wmr24": 7, "sandra": 7, "brent": 7, "mittelstadt": 7, "duti": 7, "royal": 7, "240197": 7, "royalsocietypublish": 7, "1098": 7, "rso": 7, "ylx24": 7, "jiahao": 7, "xingwei": 7, "zyi": 7, "shune": 7, "lyumanshan": 7, "jingyu": 7, "shui": 7, "haobin": 7, "pengfei": 7, "hewu": 7, "ghost": 7, "14931": 7, "zho24": 7, "amazonwservices24": 7, "anthropic24": 7, "cdn": 7, "1adf000c8f675958c2ee23805d91aaade1cd4613": 7, "centerfasafety24a": 7, "centerforaisafeti": 7, "centerfasafety24b": 7, "deepmind24": 7, "googleapi": 7, "fsf": 7, "europeanmagency24": 7, "ema": 7, "europa": 7, "activities_en": 7, "financialirauthority24": 7, "ibm24": 7, "watsonx": 7, "saa": 7, "libraryocongress23": 7, "loc": 7, "gov": 7, "mistralai24": 7, "mlsteam24": 7, "mlsafeti": 7, "nationaliosatechnology24": 7, "nist": 7, "itl": 7, "nvidia24": 7, "openai24a": 7, "openai24b": 7, "opensafetylab24a": 7, "opensafetylab24b": 7, "protectai24": 7, "surgeai24": 7, "ukgovernment24": 7, "unicef24": 7, "innocenti": 7, "julia": 8, "easili": 8, "trial": 8, "wrangl": 8, "hoc": 8, "dataset": 8, "unwant": 8, "overflow": 8, "twitter": 8, "youtub": 8, "ldot": 8, "prod_": 8, "syntact": 8, "central": 8, "delic": 8, "heart": 8, "xml": 8, "invalid": 8, "ttt": 8, "itt": 8, "nousresearch": 8, "herm": 8, "json_format": 8, "person1": 8, "q1": 8, "person2": 8, "response_cont": 8, "is_json": 8, "myjson": 8, "nest": 8, "conceptu": 8, "overview": 8, "unend": 8, "whitespac": 8, "throw": 8, "somewher": 8, "json_object": 8, "circul": 8, "vertex": 8, "worri": 8, "enum": 8, "secextract": 8, "mentioned_ent": 8, "mentioned_plac": 8, "extract_from_sec_fil": 8, "sec_filing_text": 8, "parser": 8, "hint": 8, "prompt_extract": 8, "sec_extract": 8, "washington": 8, "beg": 8, "1652": 8, "171": 8, "unnorm": 8, "0325": 8, "strongest": 8, "bfloat16": 8, "device_map": 8, "src": 8, "python3": 8, "nvml": 8, "return_tensor": 8, "pt": 8, "inference_mod": 8, "last_token_logit": 8, "next_token_prob": 8, "nn": 8, "dim": 8, "top_k_prob": 8, "top_k_indic": 8, "topk": 8, "top_k_token": 8, "decod": 8, "idx": 8, "skip_special_token": 8, "prob": 8, "0305": 8, "0197": 8, "0106": 8, "0093": 8, "logitsprocessor": 8, "logits_processor": 8, "logitsprocessorlist": 8, "customlogitsprocessor": 8, "intermediari": 8, "input_id": 8, "__call__": 8, "longtensor": 8, "batch_siz": 8, "sequence_length": 8, "floattensor": 8, "vocab_s": 8, "mask": 8, "pick": 8, "greedi": 8, "yesnologitsprocessor": 8, "initial_length": 8, "fill_": 8, "inf": 8, "debug": 8, "yes_token": 8, "add_special_token": 8, "no_token": 8, "yes_no_logit": 8, "yes_no_prob": 8, "yes_prob": 8, "no_prob": 8, "yes_mask": 8, "1e4": 8, "NO": 8, "generation_output_control": 8, "uncontrol": 8, "generation_output": 8, "renorm": 8, "4263": 8, "5737": 8, "10407": 8, "4607": 8, "6250": 8, "9219": 8, "helper": 8, "model_output": 8, "gen_output": 8, "batch_decod": 8, "clean_up_tokenization_spac": 8, "classic": 8, "italian": 8, "willard": 8, "louf": 8, "reformul": 8, "finit": 8, "fsm": 8, "s_": 8, "s_t": 8, "s_1": 8, "tild": 8, "odot": 8, "rightarrow": 8, "wise": 8, "thien": 8, "automaton": 8, "dfa": 8, "outgo": 8, "yy": 8, "ever": 8, "aa": 8, "lwai": 8, "prop": 8, "yynnaa": 8, "malform": 8, "base_prompt": 8, "sec_extraction_outlin": 8, "zsp": 8, "zicorp": 8, "with_structured_output": 8, "runnabl": 8, "typeddict": 8, "qu": 8, "langchain_openai": 8, "chatopenai": 8, "langchain_cor": 8, "chatprompttempl": 8, "extract_from_sec_filing_langchain": 8, "structured_llm": 8, "prompt_templ": 8, "from_messag": 8, "llm_chain": 8, "invok": 8, "sec_extraction_langchain": 8, "bnf": 8, "backu": 8, "naur": 8, "fssl": 8, "extract_entities_from_sec_fil": 8, "ollama_structured_output_prompt_suffix": 8, "ollama_structured_output_temperatur": 8, "uncensor": 8, "model_json_schema": 8, "response_json": 8, "sharpli": 8, "wrapper": 8, "exllama2": 8, "zoo": 8, "furthermor": 8, "nonetheless": 8, "extran": 8, "dispar": 8, "preval": 8, "speak": 8, "aider": 8, "outweigh": 8, "rebutt": 8, "reproduct": 8, "paint": 8, "dottxt": 8, "flaw": 8, "uneven": 8, "conflat": 8, "drawback": 8, "pfiffer": 8, "wrestl": 8, "aid24": 8, "dot24": 8, "demo": 8, "gge24": 8, "lan4b": 8, "lww": 8, "xun": 8, "hanyu": 8, "yezhaohui": 8, "shichao": 8, "simin": 8, "shunyu": 8, "feiyu": 8, "xiong": 8, "12599": 8, "llf": 8, "xieyang": 8, "frederick": 8, "fiannaca": 8, "terri": 8, "koo": 8, "dixon": 8, "ea": 8, "machineri": 8, "3613905": 8, "3650756": 8, "xuan": 8, "hai": 8, "nguyen": 8, "ngoc": 8, "tiviati": 8, "hieu": 8, "dao": 8, "shafiq": 8, "joti": 8, "kenji": 8, "kawaguchi": 8, "nanci": 8, "min": 8, "kan": 8, "08656": 8, "out24": 8, "twt": 8, "zhi": 8, "cheng": 8, "kuang": 8, "tsai": 8, "chieh": 8, "hung": 8, "yun": 8, "nung": 8, "02442": 8, "tt24": 8, "vivien": 8, "vivien000": 8, "wl23": 8, "r\u00e9mi": 8, "09702": 8, "guidanceai24": 8, "nvidia4a": 8, "wikipediacontributors24": 8, "wiktionari": 8, "naur_form": 8}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7, 8], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3, 4], "practic": [0, 2, 6, 8], "approach": [0, 4, 7], "an": 0, "open": [0, 2, 6], "sourc": [0, 2, 6], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": [0, 6], "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 7], "kei": [0, 5], "configur": 0, "troubleshoot": 0, "common": [0, 7], "issu": 0, "author": 0, "": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 6, 7], "guid": 2, "pitfal": [2, 7], "softwar": [2, 5], "chapter": 2, "1": [2, 7], "The": [2, 4, 5, 6], "eval": [2, 5, 7], "gap": [2, 5], "2": [2, 6, 7], "manag": 2, "input": 2, "data": [2, 3], "3": [2, 7], "structur": [2, 8], "output": [2, 8], "4": [2, 7], "safeti": [2, 7], "5": [2, 7], "prefer": [2, 3], "base": [2, 3, 5, 7], "align": [2, 3], "6": [2, 7], "local": [2, 6], "7": 2, "fall": [2, 4], "cost": [2, 4, 6], "paradox": [2, 4], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 5, 6, 7, 8], "resourc": 2, "introduct": [3, 5, 6, 7, 8], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 5, 6], "human": 3, "supervis": 3, "fine": [3, 6, 8], "tune": [3, 6, 8], "sft": 3, "augment": 3, "post": [3, 8], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 6, 7], "studi": [3, 6, 7], "polici": [3, 7], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 5, 6, 7], "synthet": 3, "gener": [3, 5, 7], "user": [3, 7], "prompt": [3, 6, 8], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": [3, 4], "prepar": 3, "vibe": 3, "check": [3, 4], "evalu": [3, 5, 7], "discuss": [3, 8], "conclus": [3, 4, 5, 6, 7, 8], "citat": [3, 5, 7, 8], "refer": [3, 4, 5, 6, 7, 8], "why": 4, "matter": 4, "more": 4, "than": 4, "ever": 4, "right": 4, "size": 4, "strateg": 4, "metric": [4, 5], "requir": [4, 5], "busi": 4, "perform": [4, 6], "oper": 4, "technic": [4, 7], "quantiz": [4, 6], "list": 4, "non": 5, "determinist": 5, "machin": 5, "emerg": 5, "properti": 5, "problem": [5, 8], "statement": [5, 8], "tradit": 5, "v": [5, 6], "design": [5, 7], "applic": 5, "test": 5, "matrix": 5, "conceptu": 5, "overview": 5, "consider": 5, "task": [5, 6], "benchmark": [5, 6, 7], "leaderboard": 5, "lightev": 5, "mmlu": 5, "econometr": 5, "sampl": [5, 7], "famili": [5, 6], "us": 5, "langsmith": 5, "promptfoo": 5, "comparison": [5, 6, 8], "choos": 6, "suitabl": 6, "result": 6, "llama": 6, "licens": 6, "commun": 6, "support": 6, "custom": [6, 7], "mistral": [6, 7], "decemb": 6, "22": 6, "2024": 6, "deploy": 6, "serv": 6, "cpp": 6, "llamafil": 6, "ollama": [6, 8], "lama": 6, "ui": 6, "lm": 6, "studio": 6, "jan": 6, "webui": 6, "openwebui": 6, "effect": 6, "level": 6, "hardwar": 6, "takeawai": [6, 7], "risk": 7, "ai": 7, "amplifi": 7, "exist": 7, "harm": 7, "novel": 7, "associ": 7, "autonom": 7, "exacerb": 7, "factor": 7, "specif": 7, "guidanc": 7, "govern": 7, "organ": 7, "privat": 7, "sector": 7, "openai": 7, "anthrop": 7, "googl": 7, "rubric": 7, "mlcommon": 7, "centr": 7, "porquoi": 7, "red": 7, "team": 7, "constitut": 7, "explain": 7, "xai": 7, "plan": 7, "phase": 7, "definit": 7, "research": [7, 8], "identif": 7, "framework": [7, 8], "architectur": 7, "implement": 7, "select": 7, "go": 7, "market": 7, "compon": 7, "salad": 7, "bench": 7, "truthfulqa": 7, "harmbench": 7, "safebench": 7, "techniqu": [7, 8], "repres": 7, "layer": 7, "map": 7, "rule": 7, "filter": 7, "moder": 7, "bad": 7, "good": 7, "guard": 7, "judg": 7, "valid": 7, "engin": 8, "json": 8, "mode": 8, "logit": 8, "process": 8, "outlin": 8, "langchain": 8, "best": 8, "compar": 8, "solut": 8, "ongo": 8, "debat": 8, "acknowledg": 8}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"], [8, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author(s)": [[0, "about-the-author-s"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Managing Input Data": [[2, "chapter-2-managing-input-data"]], "Chapter 3: Structured Output": [[2, "chapter-3-structured-output"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Falling Cost Paradox": [[2, "chapter-7-the-falling-cost-paradox"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"], [8, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "Citation": [[3, "citation"], [5, "citation"], [7, "citation"], [8, "citation"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"], [8, "references"]], "The Falling Cost Paradox": [[4, "the-falling-cost-paradox"]], "Why Optimization Matters More Than Ever": [[4, "why-optimization-matters-more-than-ever"]], "Right-Sizing LLMs: A Strategic Approach": [[4, "right-sizing-llms-a-strategic-approach"]], "Metrics": [[4, "metrics"], [5, "metrics"]], "Requirements": [[4, "requirements"]], "Business Requirements": [[4, "business-requirements"]], "Performance Requirements": [[4, "performance-requirements"]], "Operational Requirements": [[4, "operational-requirements"]], "Technical Requirements": [[4, "technical-requirements"]], "Quantization": [[4, "quantization"], [6, "quantization"]], "Check-list": [[4, "check-list"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"], [8, "conclusion"]], "The Evals Gap": [[5, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[5, "non-deterministic-generative-machines"]], "Emerging Properties": [[5, "emerging-properties"]], "Problem Statement": [[5, "problem-statement"], [8, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[5, "evals-table"]], "Evals Design": [[5, "evals-design"]], "LLM Application Testing Requirements Matrix": [[5, "validation-requirements"]], "Conceptual Overview": [[5, "conceptual-overview"]], "Design Considerations": [[5, "design-considerations"]], "Key Metrics for Evaluating Generative Tasks": [[5, "key-metrics"]], "Evaluators": [[5, "evaluators"]], "Model-Based Evaluation": [[5, "model-based-evaluation"]], "Evaluating Evaluators": [[5, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[5, "benchmarks-and-leaderboards"]], "Tools": [[5, "tools"], [8, "tools"]], "LightEval": [[5, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[5, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[5, "model-families"]], "LangSmith": [[5, "langsmith"]], "PromptFoo": [[5, "promptfoo"]], "Comparison": [[5, "comparison"], [6, "comparison"], [6, "id36"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[5, "tool-comparison"]], "Local LLMs in Practice": [[6, "local-llms-in-practice"]], "Choosing your Model": [[6, "choosing-your-model"]], "Task Suitability": [[6, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[6, "llama2-benchmark"]], "Performance & Cost": [[6, "performance-cost"]], "Licensing": [[6, "licensing"]], "Open Source LLMs.": [[6, "open-source-llms"]], "Community Support": [[6, "community-support"]], "Customization": [[6, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[6, "mistral-costs"]], "Tools for Local LLM Deployment": [[6, "tools-for-local-llm-deployment"]], "Serving Models": [[6, "serving-models"]], "LLama.cpp": [[6, "llama-cpp"]], "Llamafile": [[6, "llamafile"]], "Ollama": [[6, "ollama"], [8, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[6, "feature-comparison-local"]], "UI": [[6, "ui"]], "LM Studio": [[6, "lm-studio"]], "Jan": [[6, "jan"]], "Open WebUI": [[6, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[6, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[6, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[6, "prompts-dataset"]], "Quantization Levels": [[6, "quantization-levels"]], "Benchmarking": [[6, "benchmarking"], [7, "benchmarking"]], "Results": [[6, "results"]], "Quantization Benchmarks": [[6, "quantization-benchmarks"]], "Benchmarking Hardware": [[6, "benchmarking-hardware"]], "Takeaways": [[6, "takeaways"], [7, "takeaways"]], "Safety": [[7, "safety"]], "Safety Risks": [[7, "safety-risks"]], "General AI Safety Risks": [[7, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[7, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[7, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[7, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[7, "llms-specific-safety-risks"]], "Guidance": [[7, "guidance"]], "Governments & Organizations": [[7, "governments-organizations"]], "Private Sector": [[7, "private-sector"]], "OpenAI": [[7, "openai"]], "Anthropic": [[7, "anthropic"]], "Google": [[7, "google"]], "Rubrics": [[7, "rubrics"]], "MLCommons AI Safety Benchmark": [[7, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[7, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[7, "porquoi"]], "Approaches": [[7, "approaches"]], "Red Teaming": [[7, "red-teaming"]], "Constitutional AI": [[7, "constitutional-ai"]], "Explainable AI (XAI)": [[7, "explainable-ai-xai"]], "Designing a Safety Plan": [[7, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[7, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[7, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[7, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[7, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[7, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[7, "phase-6-go-to-market"]], "Common Pitfalls": [[7, "common-pitfalls"]], "Technical Implementation Components": [[7, "technical-implementation-components"]], "Benchmarks & Datasets": [[7, "benchmarks-datasets"]], "SALAD-Bench": [[7, "salad-bench"]], "TruthfulQA": [[7, "truthfulqa"]], "HarmBench": [[7, "harmbench"]], "SafeBench": [[7, "safebench"]], "Tools & Techniques": [[7, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[7, "safety-layer-table"]], "Rules-Based Safety Filtering": [[7, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[7, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[7, "llm-based-safety-filtering"]], "Custom Moderation": [[7, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[7, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[7, "evals-dataset"]], "Bad Samples": [[7, "bad-samples"]], "Good Samples": [[7, "good-samples"]], "Safety Filters": [[7, "safety-filters"]], "LLM-Guard": [[7, "llm-guard"]], "Mistral Moderation API": [[7, "mistral-moderation-api"]], "OpenAI Moderation API": [[7, "openai-moderation-api"]], "Custom Judge Validator": [[7, "custom-judge-validator"]], "Structured Output": [[8, "structured-output"]], "Techniques": [[8, "techniques"]], "Prompt Engineering": [[8, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[8, "json-mode-fine-tuned"]], "Logit Post-Processing": [[8, "logit-post-processing"]], "Outlines": [[8, "outlines"]], "LangChain": [[8, "langchain"]], "Discussion": [[8, "discussion"]], "Best Practices": [[8, "best-practices"]], "Comparing Solutions": [[8, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[8, "structured-output-frameworks"]], "Research and Ongoing Debate": [[8, "research-and-ongoing-debate"]], "Acknowledgements": [[8, "acknowledgements"]]}, "indexentries": {}}) \ No newline at end of file diff --git a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb index 98db741..028bf6f 100644 --- a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb +++ b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b5247b8d", + "id": "f2846c09", "metadata": {}, "source": [ "(intro)=\n", diff --git a/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb b/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb index d39e04c..9f62297 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb @@ -2537,7 +2537,7 @@ "source": [ "## Discussion and Conclusions\n", "\n", - "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.\n", + "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.\n", "\n", "**Synthetic Data Generation**\n", "\n", diff --git a/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb b/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb new file mode 100644 index 0000000..b03dd48 --- /dev/null +++ b/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(cost)=\n", + "# The Falling Cost Paradox\n", + "```{epigraph}\n", + "It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption.
    \n", + "The very contrary is the truth. \n", + "\n", + "-- William Stanley Jevons\n", + "```\n", + "```{contents}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Why Optimization Matters More Than Ever\n", + "\n", + "According to recent analysis from a16z {cite}`a16z2024llmflation`, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore's Law in the PC revolution or Edholm's Law during the bandwidth explosion of the dot-com era.\n", + "\n", + "```{figure} ../_static/cost/llmflation.png\n", + "---\n", + "name: llmflation\n", + "alt: LLMflation\n", + "scale: 70%\n", + "align: center\n", + "---\n", + "LLMflation {cite}`a16z2024llmflation`: The cost of LLM inference is decreasing by approximately 10x every year.\n", + "```\n", + "\n", + "A model achieving an MMLU score of 42 that cost \\$60 per million tokens in late 2021 can now be run for just \\$0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4's introduction in March 2023. \n", + "\n", + "This dramatic decline stems from multiple compounding factors including:\n", + "\n", + "- Improved GPU efficiency through architectural advances and Moore's Law\n", + "- Model quantization progress, moving from 16-bit to 4-bit or lower precision\n", + "- Software optimizations reducing compute and memory bandwidth requirements\n", + "- Emergence of smaller yet similarly capable models\n", + "- Better instruction tuning techniques like RLHF and DPO\n", + "- Competition from open-source models and low-cost providers\n", + "\n", + "This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn't it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: **Jevons Paradox**. \n", + "\n", + "The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.\n", + "\n", + "This pattern has repeated throughout technological history:\n", + "\n", + "- Computing Power: As cost per computation plummeted, we didn't spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers\n", + "- Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming\n", + "- Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models\n", + "\n", + "One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we're seeing the emergence of new applications:\n", + "- Embedding AI capabilities into every application and workflow\n", + "- Real-time analysis of audio transcripts and conversations\n", + "- Running AI models directly on edge devices and smartphones\n", + "- Multimodal applications combining text, images, audio and video \n", + "\n", + "In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here's why:\n", + "\n", + "**A) Scale Magnifies Everything**. When operating at billions of tokens per day, even small inefficiencies have major effects:\n", + "- A single digit improvement in efficiency can save millions of dollars annually at scale\n", + "- Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [^groklatency]\n", + "[^groklatency]: Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.\n", + "\n", + "**B) Tiered Pricing Persists**. While average costs are declining, the market maintains a tiered structure:\n", + "- Different models offer varying price-performance tradeoffs\n", + "- ChatGPT Pro at \\$200 per month breaks the price drop trend perhaps triggering a new wave of premium models\n", + "- Cost optimization is still required to select the right model for each specific use case\n", + "\n", + "**C) Competition Drives Innovation**. Companies that master LLM efficiency gain significant advantages:\n", + "- Ability to offer more competitive pricing\n", + "- Capacity to handle larger scale operations\n", + "- Resources to invest in product improvement\n", + "\n", + "**D) Performance and Cost Are Linked**. Cost optimization often yields performance benefits:\n", + "- Resource efficiency enables handling larger user loads\n", + "- More efficiency and reduced latency leads to improved user experience\n", + "\n", + "In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.\n", + "\n", + "Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Right-Sizing LLMs: A Strategic Approach\n", + "\n", + "Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.\n", + "\n", + "In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.\n", + "\n", + "\n", + "### Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirements\n", + "\n", + "#### Business Requirements\n", + "\n", + "First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.\n", + "\n", + "Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.\n", + "\n", + "Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.\n", + "\n", + "Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.\n", + "\n", + "Chapter {ref}`local` provides a detailed discussion on relevant considerations when {ref}`local-model-selection`.\n", + "\n", + "#### Performance Requirements\n", + "\n", + "Accuracy and quality form the foundation of any LLM deployment's performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter {ref}`evals` provides a detailed discussion on how to evaluate the performance of LLM-based applications.\n", + "\n", + "Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations. \n", + "\n", + "\n", + "#### Operational Requirements\n", + "\n", + "Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.\n", + "\n", + "Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.\n", + "\n", + "#### Technical Requirements\n", + "\n", + "System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.\n", + "\n", + "Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.\n", + "\n", + "\n", + "This structured approach to requirements analysis enables organizations to:\n", + "1. Select appropriate models aligned with specific needs\n", + "2. Identify targeted optimization opportunities\n", + "3. Scale efficiently while controlling costs\n", + "4. Develop realistic resource allocation strategies\n", + "\n", + "The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantization\n", + "\n", + "Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model's parameters. The most common form of quantization is to represent model's weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.\n", + "\n", + "While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at model weights of a language model (`SmolLM2-135M-Instruct`) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function `load_gguf` from the `taming_utils` package to load model weights of the quantized models directly from Hugging Face." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from taming_utils import load_gguf\n", + "\n", + "MODEL_NAME = \"bartowski/SmolLM2-135M-Instruct-GGUF\"\n", + "GGUF_FILE_Q2_K = \"SmolLM2-135M-Instruct-Q2_K.gguf\"\n", + "GGUF_FILE_F16 = \"SmolLM2-135M-Instruct-F16.gguf\"\n", + "\n", + "model_q2_k = load_gguf(model_name=MODEL_NAME, \n", + " gguf_file=GGUF_FILE_Q2_K)\n", + "\n", + "model_f16 = load_gguf(model_name=MODEL_NAME, \n", + " gguf_file=GGUF_FILE_F16)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We extract the MLP weights from the first layer of the model as a proxy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight\n", + "mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Original weights at 16-bit precision:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[-0.0145, 0.1826, 0.1377, ..., 0.1719, -0.1387, -0.0298],\n", + " [-0.1631, 0.0781, -0.2051, ..., -0.2070, -0.0334, 0.2891],\n", + " [-0.1768, -0.0488, -0.2393, ..., -0.0396, -0.1348, -0.1533],\n", + " ...,\n", + " [ 0.0771, 0.0845, -0.0232, ..., 0.0178, -0.1040, -0.0771],\n", + " [ 0.1582, 0.1167, -0.0474, ..., 0.0845, 0.0359, -0.2500],\n", + " [ 0.0432, 0.0972, 0.0933, ..., 0.2188, 0.0776, 0.0674]],\n", + " requires_grad=True)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlp_weights_f16" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantized weights at 2-bit precision:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[-0.0028, 0.1852, 0.1396, ..., 0.1506, -0.1635, -0.0043],\n", + " [-0.1768, 0.0680, -0.2257, ..., -0.1890, -0.0464, 0.2960],\n", + " [-0.1840, -0.0451, -0.2395, ..., -0.0413, -0.1446, -0.1446],\n", + " ...,\n", + " [ 0.0621, 0.0621, -0.0478, ..., 0.0038, -0.0830, -0.0830],\n", + " [ 0.1473, 0.0926, -0.0547, ..., 0.0824, 0.0429, -0.2737],\n", + " [ 0.0355, 0.0782, 0.0782, ..., 0.2043, 0.0740, 0.0740]],\n", + " requires_grad=True)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlp_weights_q2_k" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pearson correlation: 0.9970\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Convert tensors to numpy arrays (detach from computation graph if needed)\n", + "weights_f16 = mlp_weights_f16.detach().cpu().numpy()\n", + "weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()\n", + "\n", + "flat_f16 = weights_f16.flatten()\n", + "flat_q2_k = weights_q2_k.flatten()\n", + "\n", + "# Calculate correlation\n", + "correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]\n", + "print(f\"Pearson correlation: {correlation:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "\n", + "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n", + "\n", + "```{figure} ../_static/cost/quantized.png\n", + "---\n", + "name: quantized\n", + "alt: Quantized Model Size\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Quantized Model Size: `unsloth/Llama-3.3-70B-Instruct-GGUF`\n", + "```\n", + "\n", + "We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [^quantization-levels].\n", + "\n", + "[^quantization-levels]: You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in {cite}`huggingface2024quantization`.\n", + "\n", + "This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.\n", + " \n", + "While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet {cite}`wang20241bitaiinfra11` which pushes the boundaries of extreme quantization.\n", + "\n", + "BitNet's implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see {numref}`bitnet`). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet's optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).\n", + "\n", + "```{figure} ../_static/cost/bitnet.png\n", + "---\n", + "name: bitnet\n", + "alt: BitNet\n", + "scale: 30%\n", + "align: center\n", + "---\n", + "BitNet: {cite}`wang20241bitaiinfra11`\n", + "```\n", + "\n", + "The framework's initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.\n", + "\n", + "As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.\n", + "\n", + "Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter {ref}`local` for more details.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check-list\n", + "\n", + "**Planning and Requirements**\n", + "- [ ] Start with a clear understanding of your application's needs and the factors that contribute to LLM costs\n", + "- [ ] Choose the right model for your task, balancing performance and cost\n", + "- [ ] Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them\n", + "\n", + "**Model Optimization**\n", + "- [ ] Explore model compression and quantization to reduce model size and computational demands\n", + "- [ ] Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency\n", + "- [ ] Consider using RAG to enhance performance and reduce reliance on purely generative processes\n", + "\n", + "**Prompt Engineering**\n", + "- [ ] Optimize prompts and utilize prompt engineering techniques to minimize token usage\n", + "- [ ] Experiment with different prompting strategies to unlock the full potential of open-source LLMs\n", + "\n", + "**Infrastructure and Operations**\n", + "- [ ] Implement caching and batching strategies to optimize resource utilization\n", + "- [ ] Monitor LLM usage patterns and costs to identify areas for optimization\n", + "- [ ] Set up observability and logging to track model performance and costs\n", + "- [ ] Establish automated testing and evaluation pipelines\n", + "\n", + "**Cost Management**\n", + "- [ ] Track and analyze inference costs across different model variants\n", + "- [ ] Implement cost allocation and chargeback mechanisms\n", + "- [ ] Set up cost alerts and budgeting controls\n", + "- [ ] Regularly review and optimize resource utilization\n", + "\n", + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", + "\n", + "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", + "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n", + "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n", + "\n", + "```\n", + "@misc{tharsistpsouza2024tamingllms,\n", + " author = {Tharsis T. P. Souza},\n", + " title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n", + " year = {2024},\n", + " chapter = {The Falling Cost Paradox},\n", + " journal = {GitHub repository},\n", + " url = {https://github.com/souzatharsis/tamingLLMs)\n", + "}\n", + "```\n", + "## References\n", + "```{bibliography}\n", + ":filter: docname in docnames\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb index 7cdc5f1..59f7f13 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb @@ -4,6 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "(local)=\n", "# Local LLMs in Practice\n", "```{epigraph}\n", "Freedom is something that dies unless it's used.\n", @@ -40,7 +41,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Models Considerations\n", + "(local-model-selection)=\n", + "## Choosing your Model\n", "\n", "The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness. \n", "\n", @@ -1352,7 +1354,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Citation\n", "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", "\n", "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", diff --git a/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb b/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb index 090daa7..1d16cf6 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb @@ -467,9 +467,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n" + ] + } + ], "source": [ "MODEL_NAME = \"HuggingFaceTB/SmolLM2-1.7B-Instruct\"\n", "PROMPT = \"Is Enzo a good name for a baby?\"\n", @@ -1384,7 +1393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/tamingllms/_config.yml b/tamingllms/_config.yml index a34b825..3c3a15b 100644 --- a/tamingllms/_config.yml +++ b/tamingllms/_config.yml @@ -5,26 +5,12 @@ title: "Taming Large Language Models: A Practical Guide to LLM Pitfalls with Pyt author: Tharsis T. P. Souza copyright: "Tharsis T. P. Souza, 2024" # Copyright year to be placed in the footer project: "Taming LLMs" -#logo: /home/tobias/src/tamingLLMs/tamingllms/tamingllms/_static/logo.png # Force re-execution of notebooks on each build. # See https://jupyterbook.org/content/execute.html execute: execute_notebooks: 'off' -#html: -# comments: -# hypothesis: true -# extra_navbar: | -#
    -# -#
    -# baseurl: https://souzatharsis.github.io/tamingllms/ -# extra_footer: | -#

    -# Apache 2.0 License -#

    - # Define the name of the latex output file for PDF builds latex: latex_documents: diff --git a/tamingllms/_static/cost/bitnet.png b/tamingllms/_static/cost/bitnet.png new file mode 100644 index 0000000..5d0d74e Binary files /dev/null and b/tamingllms/_static/cost/bitnet.png differ diff --git a/tamingllms/_static/cost/llmflation.png b/tamingllms/_static/cost/llmflation.png new file mode 100644 index 0000000..5061149 Binary files /dev/null and b/tamingllms/_static/cost/llmflation.png differ diff --git a/tamingllms/_static/cost/quantized.png b/tamingllms/_static/cost/quantized.png new file mode 100644 index 0000000..2dc6d44 Binary files /dev/null and b/tamingllms/_static/cost/quantized.png differ diff --git a/tamingllms/_static/cost/quantized.tsx b/tamingllms/_static/cost/quantized.tsx new file mode 100644 index 0000000..aef322c --- /dev/null +++ b/tamingllms/_static/cost/quantized.tsx @@ -0,0 +1,58 @@ +import React from 'react'; +import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer } from 'recharts'; + +const MemoryUsageChart = () => { + const data = [ + { name: 'F16', value: 141.1 }, + { name: 'Q8_0', value: 75.0 }, + { name: 'Q6_K', value: 59.9 }, + { name: 'Q5_K_M', value: 49.9 }, + { name: 'Q4_K_M', value: 42.5 }, + { name: 'Q3_K_M', value: 34.3 }, + { name: 'Q2_K', value: 26.4 } + ]; + + return ( +
    + + + + + + [`${value} GB`, 'Model Size']} + contentStyle={{ + backgroundColor: '#fff', + border: '1px solid #ccc', + fontWeight: 'bold' + }} + /> + + + +
    + ); +}; + +export default MemoryUsageChart; \ No newline at end of file diff --git a/tamingllms/_toc.yml b/tamingllms/_toc.yml index de492f3..778cdac 100644 --- a/tamingllms/_toc.yml +++ b/tamingllms/_toc.yml @@ -13,6 +13,8 @@ chapters: - file: notebooks/safety.ipynb - file: notebooks/alignment.ipynb - file: notebooks/local.ipynb +- file: notebooks/cost.ipynb +# - file: genindex #- file: notebooks/output_size_limit.ipynb #- file: markdown #- file: notebooks diff --git a/tamingllms/markdown/toc.md b/tamingllms/markdown/toc.md index 83c6895..1578091 100644 --- a/tamingllms/markdown/toc.md +++ b/tamingllms/markdown/toc.md @@ -32,7 +32,7 @@ Abstract: *The current discourse around Large Language Models (LLMs) tends to fo ## [Chapter 6: Local LLMs in Practice](https://www.souzatharsis.com/tamingLLMs/notebooks/local.html) -## Chapter 7: The Cost Factor +## Chapter 7: The Falling Cost Paradox ## Chapter 8: Frontiers diff --git a/tamingllms/notebooks/alignment.ipynb b/tamingllms/notebooks/alignment.ipynb index 552ad7f..9eeeffa 100644 --- a/tamingllms/notebooks/alignment.ipynb +++ b/tamingllms/notebooks/alignment.ipynb @@ -2537,7 +2537,7 @@ "source": [ "## Discussion and Conclusions\n", "\n", - "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.\n", + "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.\n", "\n", "**Synthetic Data Generation**\n", "\n", diff --git a/tamingllms/notebooks/cost.ipynb b/tamingllms/notebooks/cost.ipynb new file mode 100644 index 0000000..0bb1d48 --- /dev/null +++ b/tamingllms/notebooks/cost.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(cost)=\n", + "# The Falling Cost Paradox\n", + "```{epigraph}\n", + "It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption.
    \n", + "The very contrary is the truth. \n", + "\n", + "-- William Stanley Jevons\n", + "```\n", + "```{contents}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Why Optimization Matters More Than Ever\n", + "\n", + "According to recent analysis from a16z {cite}`a16z2024llmflation`, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore's Law in the PC revolution or Edholm's Law during the bandwidth explosion of the dot-com era.\n", + "\n", + "```{figure} ../_static/cost/llmflation.png\n", + "---\n", + "name: llmflation\n", + "alt: LLMflation\n", + "scale: 30%\n", + "align: center\n", + "---\n", + "LLMflation {cite}`a16z2024llmflation`: The cost of LLM inference is decreasing by approximately 10x every year.\n", + "```\n", + "\n", + "A model achieving an MMLU score of 42 that cost \\$60 per million tokens in late 2021 can now be run for just \\$0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4's introduction in March 2023. \n", + "\n", + "This dramatic decline stems from multiple compounding factors including:\n", + "\n", + "- Improved GPU efficiency through architectural advances and Moore's Law\n", + "- Model quantization progress, moving from 16-bit to 4-bit or lower precision\n", + "- Software optimizations reducing compute and memory bandwidth requirements\n", + "- Emergence of smaller yet similarly capable models\n", + "- Better instruction tuning techniques like RLHF and DPO\n", + "- Competition from open-source models and low-cost providers\n", + "\n", + "This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn't it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: **Jevons Paradox**. \n", + "\n", + "The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.\n", + "\n", + "This pattern has repeated throughout technological history:\n", + "\n", + "- Computing Power: As cost per computation plummeted, we didn't spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers\n", + "- Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming\n", + "- Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models\n", + "\n", + "One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we're seeing the emergence of new applications:\n", + "- Embedding AI capabilities into every application and workflow\n", + "- Real-time analysis of audio transcripts and conversations\n", + "- Running AI models directly on edge devices and smartphones\n", + "- Multimodal applications combining text, images, audio and video \n", + "\n", + "In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here's why:\n", + "\n", + "**A) Scale Magnifies Everything**. When operating at billions of tokens per day, even small inefficiencies have major effects:\n", + "- A single digit improvement in efficiency can save millions of dollars annually at scale\n", + "- Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [^groklatency]\n", + "[^groklatency]: Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.\n", + "\n", + "**B) Tiered Pricing Persists**. While average costs are declining, the market maintains a tiered structure:\n", + "- Different models offer varying price-performance tradeoffs\n", + "- ChatGPT Pro at \\$200 per month breaks the price drop trend perhaps triggering a new wave of premium models\n", + "- Cost optimization is still required to select the right model for each specific use case\n", + "\n", + "**C) Competition Drives Innovation**. Companies that master LLM efficiency gain significant advantages:\n", + "- Ability to offer more competitive pricing\n", + "- Capacity to handle larger scale operations\n", + "- Resources to invest in product improvement\n", + "\n", + "**D) Performance and Cost Are Linked**. Cost optimization often yields performance benefits:\n", + "- Resource efficiency enables handling larger user loads\n", + "- More efficiency and reduced latency leads to improved user experience\n", + "\n", + "In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.\n", + "\n", + "Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Right-Sizing LLMs: A Strategic Approach\n", + "\n", + "Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.\n", + "\n", + "In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.\n", + "\n", + "\n", + "### Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirements\n", + "\n", + "#### Business Requirements\n", + "\n", + "First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.\n", + "\n", + "Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.\n", + "\n", + "Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.\n", + "\n", + "Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.\n", + "\n", + "Chapter {ref}`local` provides a detailed discussion on relevant considerations when {ref}`local-model-selection`.\n", + "\n", + "#### Performance Requirements\n", + "\n", + "Accuracy and quality form the foundation of any LLM deployment's performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter {ref}`evals` provides a detailed discussion on how to evaluate the performance of LLM-based applications.\n", + "\n", + "Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations. \n", + "\n", + "\n", + "#### Operational Requirements\n", + "\n", + "Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.\n", + "\n", + "Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.\n", + "\n", + "#### Technical Requirements\n", + "\n", + "System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.\n", + "\n", + "Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.\n", + "\n", + "\n", + "This structured approach to requirements analysis enables organizations to:\n", + "1. Select appropriate models aligned with specific needs\n", + "2. Identify targeted optimization opportunities\n", + "3. Scale efficiently while controlling costs\n", + "4. Develop realistic resource allocation strategies\n", + "\n", + "The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantization\n", + "\n", + "Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model's parameters. The most common form of quantization is to represent model's weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.\n", + "\n", + "While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at model weights of a language model (`SmolLM2-135M-Instruct`) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function `load_gguf` from the `taming_utils` package to load model weights of the quantized models directly from Hugging Face." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from taming_utils import load_gguf\n", + "\n", + "MODEL_NAME = \"bartowski/SmolLM2-135M-Instruct-GGUF\"\n", + "GGUF_FILE_Q2_K = \"SmolLM2-135M-Instruct-Q2_K.gguf\"\n", + "GGUF_FILE_F16 = \"SmolLM2-135M-Instruct-F16.gguf\"\n", + "\n", + "model_q2_k = load_gguf(model_name=MODEL_NAME, \n", + " gguf_file=GGUF_FILE_Q2_K)\n", + "\n", + "model_f16 = load_gguf(model_name=MODEL_NAME, \n", + " gguf_file=GGUF_FILE_F16)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We extract the MLP weights from the first layer of the model as a proxy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight\n", + "mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Original weights at 16-bit precision:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[-0.0145, 0.1826, 0.1377, ..., 0.1719, -0.1387, -0.0298],\n", + " [-0.1631, 0.0781, -0.2051, ..., -0.2070, -0.0334, 0.2891],\n", + " [-0.1768, -0.0488, -0.2393, ..., -0.0396, -0.1348, -0.1533],\n", + " ...,\n", + " [ 0.0771, 0.0845, -0.0232, ..., 0.0178, -0.1040, -0.0771],\n", + " [ 0.1582, 0.1167, -0.0474, ..., 0.0845, 0.0359, -0.2500],\n", + " [ 0.0432, 0.0972, 0.0933, ..., 0.2188, 0.0776, 0.0674]],\n", + " requires_grad=True)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlp_weights_f16" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantized weights at 2-bit precision:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[-0.0028, 0.1852, 0.1396, ..., 0.1506, -0.1635, -0.0043],\n", + " [-0.1768, 0.0680, -0.2257, ..., -0.1890, -0.0464, 0.2960],\n", + " [-0.1840, -0.0451, -0.2395, ..., -0.0413, -0.1446, -0.1446],\n", + " ...,\n", + " [ 0.0621, 0.0621, -0.0478, ..., 0.0038, -0.0830, -0.0830],\n", + " [ 0.1473, 0.0926, -0.0547, ..., 0.0824, 0.0429, -0.2737],\n", + " [ 0.0355, 0.0782, 0.0782, ..., 0.2043, 0.0740, 0.0740]],\n", + " requires_grad=True)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlp_weights_q2_k" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pearson correlation: 0.9970\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Convert tensors to numpy arrays (detach from computation graph if needed)\n", + "weights_f16 = mlp_weights_f16.detach().cpu().numpy()\n", + "weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()\n", + "\n", + "flat_f16 = weights_f16.flatten()\n", + "flat_q2_k = weights_q2_k.flatten()\n", + "\n", + "# Calculate correlation\n", + "correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]\n", + "print(f\"Pearson correlation: {correlation:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "\n", + "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n", + "\n", + "```{figure} ../_static/cost/quantized.png\n", + "---\n", + "name: quantized\n", + "alt: Quantized Model Size\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Quantized Model Size: `unsloth/Llama-3.3-70B-Instruct-GGUF`\n", + "```\n", + "\n", + "We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [^quantization-levels].\n", + "\n", + "[^quantization-levels]: You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in {cite}`huggingface2024quantization`.\n", + "\n", + "This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.\n", + " \n", + "While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet {cite}`wang20241bitaiinfra11` which pushes the boundaries of extreme quantization.\n", + "\n", + "BitNet's implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see {numref}`bitnet`). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet's optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).\n", + "\n", + "```{figure} ../_static/cost/bitnet.png\n", + "---\n", + "name: bitnet\n", + "alt: BitNet\n", + "scale: 30%\n", + "align: center\n", + "---\n", + "BitNet: {cite}`wang20241bitaiinfra11`\n", + "```\n", + "\n", + "The framework's initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.\n", + "\n", + "As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.\n", + "\n", + "Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter {ref}`local` for more details.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check-list\n", + "\n", + "**Planning and Requirements**\n", + "- [ ] Start with a clear understanding of your application's needs and the factors that contribute to LLM costs\n", + "- [ ] Choose the right model for your task, balancing performance and cost\n", + "- [ ] Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them\n", + "\n", + "**Model Optimization**\n", + "- [ ] Explore model compression and quantization to reduce model size and computational demands\n", + "- [ ] Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency\n", + "- [ ] Consider using RAG to enhance performance and reduce reliance on purely generative processes\n", + "\n", + "**Prompt Engineering**\n", + "- [ ] Optimize prompts and utilize prompt engineering techniques to minimize token usage\n", + "- [ ] Experiment with different prompting strategies to unlock the full potential of open-source LLMs\n", + "\n", + "**Infrastructure and Operations**\n", + "- [ ] Implement caching and batching strategies to optimize resource utilization\n", + "- [ ] Monitor LLM usage patterns and costs to identify areas for optimization\n", + "- [ ] Set up observability and logging to track model performance and costs\n", + "- [ ] Establish automated testing and evaluation pipelines\n", + "\n", + "**Cost Management**\n", + "- [ ] Track and analyze inference costs across different model variants\n", + "- [ ] Implement cost allocation and chargeback mechanisms\n", + "- [ ] Set up cost alerts and budgeting controls\n", + "- [ ] Regularly review and optimize resource utilization\n", + "\n", + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", + "\n", + "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", + "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n", + "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n", + "\n", + "```\n", + "@misc{tharsistpsouza2024tamingllms,\n", + " author = {Tharsis T. P. Souza},\n", + " title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n", + " year = {2024},\n", + " chapter = {The Falling Cost Paradox},\n", + " journal = {GitHub repository},\n", + " url = {https://github.com/souzatharsis/tamingLLMs)\n", + "}\n", + "```\n", + "## References\n", + "```{bibliography}\n", + ":filter: docname in docnames\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tamingllms/notebooks/local.ipynb b/tamingllms/notebooks/local.ipynb index b451331..fde2739 100644 --- a/tamingllms/notebooks/local.ipynb +++ b/tamingllms/notebooks/local.ipynb @@ -4,6 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "(local)=\n", "# Local LLMs in Practice\n", "```{epigraph}\n", "Freedom is something that dies unless it's used.\n", @@ -40,7 +41,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Models Considerations\n", + "(local-model-selection)=\n", + "## Choosing your Model\n", "\n", "The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness. \n", "\n", @@ -1352,7 +1354,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Citation\n", "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", "\n", "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", diff --git a/tamingllms/notebooks/structured_output.ipynb b/tamingllms/notebooks/structured_output.ipynb index 4bc64db..64359b4 100644 --- a/tamingllms/notebooks/structured_output.ipynb +++ b/tamingllms/notebooks/structured_output.ipynb @@ -467,9 +467,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n" + ] + } + ], "source": [ "MODEL_NAME = \"HuggingFaceTB/SmolLM2-1.7B-Instruct\"\n", "PROMPT = \"Is Enzo a good name for a baby?\"\n", @@ -1384,7 +1393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/tamingllms/notebooks/structured_output_original.ipynb b/tamingllms/notebooks/structured_output_original.ipynb deleted file mode 100644 index 79ee4c3..0000000 --- a/tamingllms/notebooks/structured_output_original.ipynb +++ /dev/null @@ -1,1167 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(structure)=\n", - "# Wrestling with Structured Output\n", - "```{epigraph}\n", - "In limits, there is freedom. Creativity thrives within structure.\n", - "\n", - "-- Julia B. Cameron\n", - "```\n", - "```{contents}\n", - "```\n", - "\n", - "## Introduction\n", - "\n", - "Large language models (LLMs) excel at generating human-like text, but they often struggle to produce output in a structured format consistently. This poses a significant challenge when we need LLMs to generate data that can be easily processed by other systems, such as databases, APIs, or other software applications. Sometimes, even with a well-crafted prompt, an LLM might produce an unstructured response when a structured one is expected. This can be particularly challenging when integrating LLMs into systems that require specific data formats." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a motivating example, consider the following simple task: Given a segment of a SEC financial filing, generate a two-person discussion about the key financial data from the text in JSON format, simulating what would be a real-world discussion about the underlying companies' disclosed financial information. We would like to generate a structured output that can be easily parsed and integrated with other systems." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Throughout this notebook, we will consider as input a segment of a sample SEC filing of Apple Inc." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "MAX_LENGTH = 10000 # We limit the input length to avoid token issues\n", - "with open('../data/apple.txt', 'r') as file:\n", - " sec_filing = file.read()\n", - "sec_filing = sec_filing[:MAX_LENGTH] " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from dotenv import load_dotenv\n", - "import os\n", - "\n", - "# Load environment variables from .env file\n", - "load_dotenv(override=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "client = OpenAI()\n", - "# Define the prompt expecting a structured JSON response\n", - "prompt = f\"\"\"\n", - "Generate a two-person discussion about the key financial data from the following text in JSON format.\n", - "TEXT: {sec_filing}\n", - "\"\"\"\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[{\"role\": \"user\", \"content\": prompt}]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Person 1: Wow, Apple Inc. seems to have a lot of different products and services they offer. It's interesting to see the breakdown of their revenue streams in their Form 10-K.\n", - "\n", - "Person 2: Absolutely, they have a diverse portfolio with iPhones, Macs, iPads, wearables, and even services. It's impressive to see how they have capitalized on different technology trends.\n", - "\n", - "Person 1: I noticed that they have a large market value of over $2.6 trillion as of March 29, 2024. That's a huge amount, and it shows the confidence investors have in the company.\n", - "\n", - "Person 2: Definitely, that's a significant figure. It's also good to see that they are complying with all the required SEC regulations and filing their reports in a timely manner.\n", - "\n", - "Person 1: Yes, it's crucial for investors to have access to accurate and up-to-date financial information. It helps in making informed decisions about their investments in the company.\n", - "\n", - "Person 2: Absolutely, transparency and compliance with regulations are key in the financial industry. It's good to see that Apple Inc. is taking those aspects seriously.\n" - ] - } - ], - "source": [ - "response_content = response.choices[0].message.content\n", - "print(response_content)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "def is_json(myjson):\n", - " try:\n", - " json.loads(myjson)\n", - " except ValueError as e:\n", - " return False\n", - " return True" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n" - ] - } - ], - "source": [ - "is_json(response_content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, despite the prompt clearly asking for a JSON object, the LLM generates an unstructured natural language sentence instead. This simple example highlights the inconsistency and unpredictability of LLMs when it comes to producing structured output." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Problem Statement\n", - "\n", - "Obtaining structured output from LLMs presents several significant challenges:\n", - "\n", - "* **Inconsistency**: LLMs often produce unpredictable results, sometimes generating well-structured output and other times deviating from the expected format.\n", - "\n", - "* **Lack of Type Safety**: LLMs do not inherently understand data types, which can lead to errors when their output is integrated with systems requiring specific data formats.\n", - "\n", - "* **Prompt Engineering Complexity**: Crafting prompts that effectively guide LLMs to produce the correct structured output is complex and requires extensive experimentation.\n", - "\n", - "## User Needs\n", - "\n", - "What user needs drive the demand for LLM output constraints when building LLM-based applications? In a recent work by Google Research {cite}`10.1145/3613905.3650756`, the authors explore the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. These needs can be broadly categorized as follows:\n", - "\n", - "**1. Improving Developer Efficiency and Workflow**\n", - "\n", - "* **Reducing Trial and Error in Prompt Engineering**: Developers find the process of crafting prompts to elicit desired output formats to be time-consuming, often involving extensive testing and iteration. LLM output constraints could make this process more efficient and predictable.\n", - "* **Minimizing Post-processing of LLM Outputs**: Developers frequently have to write complex code to wrangle and process LLM outputs that don't conform to expected formats. LLM structured output would simplify this, reducing the need for ad-hoc post-processing code.\n", - "* **Streamlining Integration with Downstream Processes**: LLMs are often used within larger pipelines where their output serves as input for subsequent modules. Output constraints are crucial to ensure compatibility and prevent errors.\n", - "* **Enhancing the Quality of Synthetic Datasets**: LLMs are increasingly used to generate synthetic data for AI training. Constraints can ensure data integrity and prevent the inclusion of unwanted elements that could negatively impact training outcomes.\n", - "\n", - "**2. Meeting UI and Product Requirements**\n", - "\n", - "* **Adhering to UI Size Limitations**: LLM-generated content often needs to fit into specific UI elements with size restrictions, especially on mobile devices. Output length constraints prevent content overflow and ensure proper display within the UI.\n", - "* **Ensuring Output Consistency**: Consistent output length and format are crucial for user experience and UI clarity. Constraints help maintain this consistency, avoiding overwhelming variability in generated text.\n", - "* **Complying with Platform Character Limits**: Certain platforms, such as Twitter or YouTube Shorts, impose character limits on content. Length constraints allow LLMs to comply with these restrictions, ensuring content can be published successfully.\n", - "\n", - "**3. Enhancing User Trust and Experience**\n", - "\n", - "* **Mitigating Hallucinations**: Users expect LLM-powered tools to be reliable and truthful. Constraining LLM outputs to a set of possible outcomes can help mitigate hallucinations, ensuring the output is valid.\n", - "* **Driving User Adoption**: Users are more likely to engage with LLM-powered tools that provide reliable and consistent experiences. By ensuring output accuracy, consistency, and safety through constraints, developers can enhance user satisfaction and drive adoption.\n", - "\n", - "It is important to emphasize that the ability to constrain LLM output is not just a technical consideration but a fundamental user need, impacting developer efficiency, user experience, and the overall success of LLM-powered applications. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Solutions\n", - "\n", - "Several strategies and tools can be employed to address the challenges of structured output from LLMs.\n", - "\n", - "### Strategies\n", - "\n", - "* **Schema Guidance**: Providing the LLM with a clear schema or blueprint of the desired output structure helps to constrain its generation and improve consistency. This can be achieved by using tools like Pydantic to define the expected data structure and then using that definition to guide the LLM's output. \n", - "\n", - "* **Output Parsing**: When LLMs don't natively support structured output, parsing their text output using techniques like regular expressions or dedicated parsing libraries can extract the desired information. For example, you can use regular expressions to extract specific patterns from the LLM's output, or you can use libraries like Pydantic to parse the output into structured data objects.\n", - "\n", - "* **Type Enforcement**: Using tools that enforce data types, such as Pydantic in Python, can ensure that the LLM output adheres to the required data formats. This can help to prevent errors when integrating the LLM's output with other systems." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Techniques and Tools" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### One-Shot Prompts\n", - "\n", - "In one-shot prompting, you provide a single example of the desired output format within the prompt." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "prompt = f\"\"\"\n", - "Generate a two-person discussion about the key financial data from the following text in JSON format.\n", - "\n", - "\n", - "{\n", - " \"Person1\": {\n", - " \"name\": \"Alice\",\n", - " \"statement\": \"The revenue for Q1 has increased by 20% compared to last year.\"\n", - " },\n", - " \"Person2\": {\n", - " \"name\": \"Bob\",\n", - " \"statement\": \"That's great news! What about the net profit margin?\"\n", - " }\n", - "}\n", - "\n", - "\n", - "TEXT: {sec_filing}\n", - "\"\"\"\n", - "\n", - "response = client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[{\"role\": \"user\", \"content\": prompt}]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"Person1\": {\n", - " \"name\": \"Alice\",\n", - " \"statement\": \"The revenue for Q1 has increased by 20% compared to last year.\"\n", - " },\n", - " \"Person2\": {\n", - " \"name\": \"Bob\",\n", - " \"statement\": \"That's great news! What about the net profit margin?\"\n", - " }\n", - "}\n" - ] - } - ], - "source": [ - "response_content = response.choices[0].message.content\n", - "print(response_content)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_json(response_content)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Structured Output with Provider-Specific APIs\n", - "\n", - "One-shot prompting is a simple technique that can lead to material improvements in structured output, though may not be sufficient for complex (e.g. nested) structures and / or when the model's output needs to be restricted to a specific set of options or types.\n", - "\n", - "Provider-specific APIs can offer ways to handle those challenges. We will explore two approaches here using OpenAI's API:\n", - "\n", - "* **JSON Mode**: Most LLM APIs today offer features specifically designed for generating JSON output.\n", - "* **Structured Outputs**: Some LLM APIs offer features specifically designed for generating structured outputs with type safety.\n", - "\n", - "#### JSON Mode\n", - "\n", - "JSON mode is a feature provided by most LLM API providers, such as OpenAI, that allows the model to generate output in JSON format. This is particularly useful when you need structured data as a result, such as when parsing the output programmatically or integrating it with other systems that require JSON input. As depicted in {numref}`json-mode`, JSON mode is implemented by instructing theLLM model to use JSON as response format and optionally defining a target schema.\n", - "\n", - "\n", - "```{figure} ../_static/structured_output/json.png\n", - "---\n", - "name: json-mode\n", - "alt: JSON Mode\n", - "scale: 50%\n", - "align: center\n", - "---\n", - "Conceptual overview of JSON mode.\n", - "```\n", - "\n", - "When using JSON mode with OpenAI's API, it is recommended to instruct the model to produce JSON via some message in the conversation, for example via your system message. If you don't include an explicit instruction to generate JSON, the model may generate an unending stream of whitespace and the request may run continually until it reaches the token limit. To help ensure you don't forget, the API will throw an error if the string \"JSON\" does not appear somewhere in the context.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "prompt = f\"\"\"\n", - "Generate a two-person discussion about the key financial data from the following text in JSON format.\n", - "TEXT: {sec_filing}\n", - "\"\"\"\n", - "response = client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[{\"role\": \"user\", \"content\": prompt}],\n", - "response_format = { \"type\": \"json_object\" }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"person1\": \"I see that Apple Inc. reported a total market value of approximately $2,628,553,000,000 held by non-affiliates as of March 29, 2024. That's a significant amount!\",\n", - " \"person2\": \"Yes, it definitely shows the scale and value of the company in the market. It's impressive to see the sheer size of the market value.\",\n", - " \"person1\": \"Also, they mentioned having 15,115,823,000 shares of common stock issued and outstanding as of October 18, 2024. That's a large number of shares circulating in the market.\",\n", - " \"person2\": \"Absolutely, the number of shares outstanding plays a crucial role in determining the company's market capitalization and investor interest.\"\n", - "}\n" - ] - } - ], - "source": [ - "response_content = response.choices[0].message.content\n", - "print(response_content)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_json(response_content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This example solution is specific to OpenAI's API. Other LLM providers offer similar functionality, for example:\n", - "\n", - "* Google's Vertex AI offers a `parse` method for structured outputs.\n", - "* Anthropic offers a `structured` method for structured outputs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors. For that purpose, we can leverage a new feature recently released by OpenAI called \"Structured Outputs\" to ensure the output data matches a target schema with type safety.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Structured Output Mode**\n", - "\n", - "Structured Outputs is a feature that ensures the model will always generate responses that adhere to your supplied JSON Schema, so you don't need to worry about the model omitting a required key, or hallucinating an invalid enum value.\n", - "\n", - "Some benefits of Structured Outputs include:\n", - "- **Reliable type-safety**: No need to validate or retry incorrectly formatted responses.\n", - "- **Explicit refusals**: Safety-based model refusals are now programmatically detectable.\n", - "- **Simpler prompting**: No need for strongly worded prompts to achieve consistent formatting.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's a Python example demonstrating how to use the OpenAI API to generate a structured output. In this example, we aim at extracting structured data from our sample SEC filing, in particular: (i) entities and (ii) places mentioned in the input doc. This example uses the `response_format` parameter within the OpenAI API call. This functionality is supported by GPT-4o models, specifically `gpt-4o-mini-2024-07-18`, `gpt-4o-2024-08-06`, and later versions." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from pydantic import BaseModel\n", - "from openai import OpenAI\n", - "\n", - "class SECExtraction(BaseModel):\n", - " mentioned_entities: list[str]\n", - " mentioned_places: list[str]\n", - "\n", - "def extract_from_sec_filing(sec_filing_text: str, prompt: str) -> SECExtraction:\n", - " \"\"\"\n", - " Extracts structured data from an input SEC filing text.\n", - " \"\"\"\n", - " client = OpenAI()\n", - " completion = client.beta.chat.completions.parse(\n", - " model=\"gpt-4o-mini\",\n", - " messages=[\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": prompt\n", - " },\n", - " {\"role\": \"user\", \"content\": sec_filing_text}\n", - " ],\n", - " response_format=SECExtraction\n", - " )\n", - " return completion.choices[0].message.parsed" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Explanation:**\n", - "\n", - "* **Data Structures:** The code defines one Pydantic model, `SECExtraction`, to represent the structured output of our parser. This model provide type hints and structure for the response.\n", - "* **API Interaction:** The `extract_from_sec_filing` function uses the OpenAI client to send a chat completion request to the `gpt-4o-mini-2024-07-18` model. The prompt instructs the model to extract our target attributes from input text. The `response_format` is set to `SECExtraction`, ensuring the response conforms to the specified Pydantic model.\n", - "* **Output Processing:** The returned response is parsed into the `SECExtraction` model. The code then returns the parsed data." - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [], - "source": [ - "prompt_extraction = \"You are an expert at structured data extraction. You will be given unstructured text from a SEC filing and extracted names of mentioned entities and places and should convert the response into the given structure.\"\n", - "sec_extraction = extract_from_sec_filing(sec_filing, prompt_extraction)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted entities: ['Apple Inc.', 'The Nasdaq Stock Market LLC']\n", - "Extracted places: ['Washington, D.C.', 'California', 'Cupertino, California']\n" - ] - } - ], - "source": [ - "print(\"Extracted entities:\", sec_extraction.mentioned_entities)\n", - "print(\"Extracted places:\", sec_extraction.mentioned_places)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We observe that the model was able to extract the entities and places from the input text, and return them in the specified format." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Benefits**\n", - "\n", - "* **Structured Output:** The use of Pydantic models and the `response_format` parameter enforces the structure of the model's output, making it more reliable and easier to process.\n", - "\n", - "* **Schema Adherence:** Structured Outputs in OpenAI API guarantee that the response adheres to the provided schema.\n", - "\n", - "This structured approach improves the reliability and usability of your application by ensuring consistent, predictable output from the OpenAI API.\n", - "\n", - "This example solution is specific to OpenAI's API. That begs the question: How can we solve this problem generally for widely available LLM providers? In the next sections, we will explore how `LangChain` and `Outlines` may serve as general purpose tools that can help us do just that.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### LangChain\n", - "\n", - "LangChain is a framework designed to simplify the development of LLM applications. It provider an abstraction layer over many LLM providers, including OpenAI, that offers several tools for parsing structured output.\n", - "\n", - "In particular, LangChain offers the `with_structured_output` method, which can be used with LLMs that support structured output APIs, allowing you to enforce a schema directly within the prompt.\n", - "\n", - "> `with_structured_output` takes a schema as input which specifies the names, types, and descriptions of the desired output attributes. The method returns a model-like Runnable, except that instead of outputting strings or messages it outputs objects corresponding to the given schema. The schema can be specified as a TypedDict class, JSON Schema or a Pydantic class. If TypedDict or JSON Schema are used then a dictionary will be returned by the Runnable, and if a Pydantic class is used then a Pydantic object will be returned.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "pip install -qU langchain-openai\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import ChatOpenAI\n", - "from langchain_core.prompts import ChatPromptTemplate\n", - "def extract_from_sec_filing_langchain(sec_filing_text: str, prompt: str) -> SECExtraction:\n", - " \"\"\"\n", - " Extracts structured data from an input SEC filing text using LangChain.\n", - " \"\"\"\n", - " llm = ChatOpenAI(model=\"gpt-4o-mini\")\n", - "\n", - " structured_llm = llm.with_structured_output(SECExtraction)\n", - "\n", - " prompt_template = ChatPromptTemplate.from_messages(\n", - " [\n", - " (\"system\", prompt),\n", - " (\"human\", \"{sec_filing_text}\"),\n", - " ]\n", - " )\n", - "\n", - " llm_chain = prompt_template | structured_llm\n", - " \n", - " return llm_chain.invoke(sec_filing_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "prompt_extraction = \"You are an expert at structured data extraction. You will be given unstructured text from a SEC filing and extracted names of mentioned entities and places and should convert the response into the given structure.\"\n", - "sec_extraction_langchain = extract_from_sec_filing_langchain(sec_filing, prompt_extraction)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted entities: ['Apple Inc.']\n", - "Extracted places: ['California', 'Cupertino']\n" - ] - } - ], - "source": [ - "print(\"Extracted entities:\", sec_extraction_langchain.mentioned_entities)\n", - "print(\"Extracted places:\", sec_extraction_langchain.mentioned_places)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. A full list of models that support `.with_structured_output()` can be found [here](https://python.langchain.com/docs/integrations/chat/#featured-providers)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Outlines\n", - "\n", - "Outlines {cite}`outlines2024` is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model's output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options. \n", - "\n", - "The authors solve the general guided generation problem {cite}`willard2023efficientguidedgenerationlarge`, which as a consequence solves the problem of structured output generation, in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).\n", - "\n", - "They define the next token generation as a random variable:\n", - "\n", - "$$s_{t+1} \\sim \\text{Categorical}(\\alpha) \\text{ where } \\alpha = \\text{LLM}(S_t, \\theta)$$\n", - "\n", - "Where:\n", - "\n", - "- $s_{t+1}$ is the next token to be generated\n", - "- $S_t = (s_1...s_t)$ represents a sequence of t tokens with $s_t \\in V$\n", - "- $V$ is the vocabulary with size $|V| = N$ (typically around $10^4$ or larger)\n", - "- $\\alpha \\in \\mathbb{R}^N$ is the output logits/probabilities over the vocabulary\n", - "- $\\theta$ is the set of trained parameters of the LLM\n", - "- $\\text{LLM}$ refers to a deep neural network trained on next-token-completion tasks\n", - "- $\\text{Categorical}(\\alpha)$ represents sampling from a categorical distribution with probabilities $\\alpha$\n", - "\n", - "When applying masking for guided generation, this becomes:\n", - "\n", - "$$\n", - "\\tilde{\\alpha} = m(S_t) \\odot \\alpha\n", - "$$\n", - "\n", - "$$\n", - "\\tilde{s}_{t+1} \\sim \\text{Categorical}(\\tilde{\\alpha})\n", - "$$\n", - "\n", - "Where:\n", - "\n", - "- $m: P(V) \\rightarrow {0,1}^N$ is a boolean mask function\n", - "- $\\odot$ represents element-wise multiplication\n", - "- $\\tilde{\\alpha}$ is the masked (constrained) probability distribution\n", - "- $\\tilde{s}_{t+1}$ is the next token sampled under constraints\n", - "\n", - "This formulation allows the masking operation to guide the generation process by zeroing out probabilities of invalid tokens according to the finite state machine states. But instead of checking the entire vocabulary (size N) at each generation step (O(N) complexity) to enforce output constraints, they convert constraints (regex/grammar) into FSM states and build an index mapping FSM states to valid vocabulary tokens. This achieves O(1) average complexity for token generation.\n", - "\n", - "In summary, there are two stages in the Outlines framework {cite}`vivien2024regex`:\n", - "\n", - "1. **Preprocessing Step**: Outlines converts a character-level deterministic finite automaton (DFA) testing whether a string matches a regex into a token-level DFA testing whether a token sequence is decoded in a string matching the regex.\n", - "\n", - "2. **Decoding Step**: At decoding time, the DFA is used to determine, for each new token, which potential tokens are allowed. Starting from the initial state of the DFA, the allowed tokens are determined by the outgoing transitions from the current state. The corresponding mask is applied to the next token probabilities and these probabilities are renormalized. A new token can then be sampled and the state of the DFA updated.\n", - "\n", - "At each step, the model's probability distribution is masked and renormalized according to the current state and valid transitions." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As an example, let's suppose we want to constrain the output of an LLM to the following set of options: \n", - "- Y/yes\n", - "- N/no\n", - "- N/never\n", - "- A/always\n", - "\n", - "\n", - "This can be done by creating a state machine that has a start state, an end state and a set of valid transitions between states with possible states represented as the following regex string: `r\"\\s*([Yy]es|[Nn]o|[Nn]ever|[Aa]lways)\"`.\n", - "\n", - "The state machine below illustrates how Outlines works under the hood {numref}`outlines_state_machine`, where:\n", - "- Prop: Represents the logit token probability given by the LLM\n", - "- Mask: Mask value of the transition as defined by the state machine\n", - "- Final: The renormalized token probability post-masking\n", - "\n", - "```{figure} ../_static/structured_output/outlines_state_machine.png\n", - "---\n", - "name: outlines_state_machine\n", - "alt: Outlines State Machine\n", - "scale: 50%\n", - "align: center\n", - "---\n", - "Outlines State Machine.\n", - "```\n", - "\n", - "The initial \"Start\" state contains a masking table that controls which tokens can begin the sequence. In this example, only characters from the set `[YyNnAa]` are allowed as valid first characters, with each having an assigned probability and mask value. The masking mechanism effectively filters out invalid tokens by setting their mask values to 0, ensuring only permitted transitions to the \"First\" state.\n", - "\n", - "After transitioning to the \"First\" state, the system continues to use probability masking to guide the sequence. For example, when receiving 'Y' as input, the masking table adjusts token probabilities to ensure valid continuations.\n", - "\n", - "This finite state machine architecture serves multiple purposes in controlling text generation:\n", - "\n", - "1. Managing token probabilities through strategic masking\n", - "2. Preventing invalid token sequences \n", - "3. Enforcing specific token patterns\n", - "4. Providing fine-grained control over token generation and validation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This provides fine-grained control over the model's generation process. In that way, Outlines, the Python package, provides several powerful controlled generation features:\n", - "\n", - "* **Regex-based structured generation**: Guide the generation process using regular expressions.\n", - "* **Multiple Choice Generation**: Restrict the LLM output to a predefined set of options.\n", - "* **Pydantic model**: Ensure the LLM output follows a Pydantic model.\n", - "* **JSON Schema**: Ensure the LLM output follows a JSON Schema.\n", - "\n", - "Outlines can support major proprietary LLM APIs (e.g. OpenAI's via vLLM). However, one of its key advantages is the ability to ensure structured output for Open Source models, which often lack such guarantees by default." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "pip install outlines\n", - "pip install transformers\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, we will use a `Qwen2.5-0.5B` model, a lightweight open source model from Alibaba Cloud known for its strong performance despite its small size." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import outlines\n", - "\n", - "model = outlines.models.transformers(\"Qwen/Qwen2.5-0.5B-Instruct\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Negative\n" - ] - } - ], - "source": [ - "TOP = 100\n", - "prompt = f\"\"\"You are a sentiment-labelling assistant specialized in Financial Statements.\n", - "Is the following document positive or negative?\n", - "\n", - "Document: {sec_filing[:TOP]}\n", - "\"\"\"\n", - "\n", - "generator = outlines.generate.choice(model, [\"Positive\", \"Negative\"])\n", - "answer = generator(prompt)\n", - "print(answer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this simple example, we use Outlines' `choice` method to constrain the model output to a predefined set of options (\"Positive\" or \"Negative\"). This ensures the model can only return one of these values, avoiding any unexpected or malformed responses.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Outlines allows to guide the generation process so the output is guaranteed to follow a JSON schema or Pydantic model. Now we will go back to our example of extracting entities and places from a SEC filing. In order to do so, we simply need to pass our Pydantic model to the `json` method in Outlines' `generate` module." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "BASE_PROMPT = \"You are an expert at structured data extraction. You will be given unstructured text from a SEC filing and extracted names of mentioned entities and places and should convert the response into the given structure.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "prompt = f\"{BASE_PROMPT} Document: {sec_filing[:TOP]}\"\n", - "generator = outlines.generate.json(model, SECExtraction)\n", - "sec_extraction_outlines = generator(prompt)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted entities: ['Zsp', 'ZiCorp']\n", - "Extracted places: ['California']\n" - ] - } - ], - "source": [ - "print(\"Extracted entities:\", sec_extraction_outlines.mentioned_entities)\n", - "print(\"Extracted places:\", sec_extraction_outlines.mentioned_places)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. However, it is interesting to see that the model hallucinates a few entities, a phenomenon that is common for smaller Open Source models that were not fine-tuned on the task of entity extraction.\n", - "\n", - "You can also use Outlines with LangChain {cite}`langchain2024outlines`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ollama\n", - "\n", - "Ollama is a popular tool that allows you to run large language models (LLMs) locally. It has recently added support for structured output generation. The current `ollama` implementation leverages llama.cpp GBNF (GGML BNF) grammars {cite}`llama_cpp_grammars` to enable structured output generation. \n", - "\n", - "llama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It's essentially an extension of BNF (Backus-Naur Form) {cite}`backus_naur_form` with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model's output strictly adheres to the desired format.\n", - "\n", - "Ollama first introduced structured output generation in version 0.5.1 providing support for JSON output but highlighting additional formats are coming soon.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's replicate our previous structured output generation example with Ollama. First, make sure you have Ollama installed. You can find installation instructions [here](https://ollama.com/docs/installation).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "curl -fsSL https://ollama.com/install.sh | sh\n", - "pip install ollama\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The code below demonstrates how to use Ollama's structured output capabilities with a Pydantic model as we did before with OpenAI, LangChain and Outlines. The SECExtraction model defines the expected structure with two fields: mentioned_entities and mentioned_places as lists of strings we expect the model to return given an input SEC filing. The `extract_entities_from_sec_filing` function uses Ollama's chat API to analyze SEC filings and extract entities in a structured format, with temperature set to 0 for deterministic results. We pass the Pydantic model's JSON schema to Ollama via the `format` parameter. We append a suffix to the prompt instructing the model to return the response as JSON (\"Return as JSON.\") as recommended by Ollama maintainers.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from ollama import chat\n", - "from pydantic import BaseModel\n", - "\n", - "class SECExtraction(BaseModel):\n", - " mentioned_entities: list[str]\n", - " mentioned_places: list[str]\n", - "\n", - "OLLAMA_STRUCTURED_OUTPUT_PROMPT_SUFFIX = \"Return as JSON.\"\n", - "OLLAMA_STRUCTURED_OUTPUT_TEMPERATURE = 0\n", - "\n", - "def extract_entities_from_sec_filing(doc: str, model: str) -> dict:\n", - " \"\"\"\n", - " Extract entities and places from an SEC filing using Ollama chat.\n", - " \n", - " Args:\n", - " doc: The SEC filing text to analyze\n", - " model: The Ollama model to use for extraction\n", - " \n", - " Returns:\n", - " The raw response from the chat model\n", - " \"\"\"\n", - " response = chat(\n", - " messages=[\n", - " {\n", - " 'role': 'user',\n", - " 'content': f\"\"\"{BASE_PROMPT}\n", - " {OLLAMA_STRUCTURED_OUTPUT_PROMPT_SUFFIX}\n", - " \n", - " Document: {doc}\"\"\"\n", - " }\n", - " ],\n", - " model=model, # You can also use other models like 'mistral' or 'llama2-uncensored'\n", - " format=SECExtraction.model_json_schema(),\n", - " options={'temperature': OLLAMA_STRUCTURED_OUTPUT_TEMPERATURE} # Set to 0 for more deterministic output\n", - " )\n", - " return response\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now run the function and print the extracted entities and places. But first we need to start the Ollama server with our target LLM model (Qwen2.5-0.5B) running locally." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "ollama run qwen2.5:0.5b\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "doc = sec_filing[:TOP]\n", - "model = \"qwen2.5:0.5b\"\n", - "\n", - "response = extract_entities_from_sec_filing(doc, model)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "response_json = json.loads(response.message.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted entities: ['United States', 'SECURITIES AND EXCHANGE COMMISSION']\n", - "Extracted places: []\n" - ] - } - ], - "source": [ - "print(\"Extracted entities:\", response_json.get('mentioned_entities'))\n", - "print(\"Extracted places:\", response_json.get('mentioned_places'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The extracted entities and places were quite different from those previously extracted using Outlines and Langchain, as expected since this depends mostly on the underlying model which is quite small. We do observe though that we have successfully obtained results in JSON format as specified, even with such a small underlying model (0.5B parameters).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Discussion\n", - "\n", - "### Comparing Solutions\n", - "\n", - "\n", - "The choice of framework for structured LLM output depends heavily on specific constraints, requirements and use cases. LangChain is the most used LLM framework today with a large developer community base however its structured output support depends on the underlying LLM provider support. Ollama enables straightforward local deployment and experimentation democratizing access to LLMs while fostering privacy and control, however today it only offers JSON format with further formats to come. Outlines emerges as a solution with great flexibility and control over output structure while providing support for a wide range of LLMs. {numref}`structured_output_frameworks` provides a summary comparison of the different frameworks.\n", - "\n", - "```{table} Structured Output Frameworks Comparison\n", - ":name: structured_output_frameworks\n", - "| Feature | LangChain | Outlines | Ollama |\n", - "|---------|-----------|----------|---------|\n", - "| **Implementation Approach** | Wrapper around LLM's native structured output APIs using with_structured_output method | Adjusts probability distribution of model's output logits to guide generation | Uses llama.cpp GBNF grammars to constrain output format |\n", - "| **Model Support** | Limited to LLMs with built-in structured output APIs | Broad support for open-source models via transformers, llama.cpp, exllama2, mlx-lm and vllm | Focused on running open-source models locally |\n", - "| **Output Format Support** | - TypedDict
    - JSON Schema
    - Pydantic class | - Multiple choice generation
    - Regex-based structure
    - Pydantic model
    - JSON Schema | - Currently JSON only
    - Additional formats planned |\n", - "| **Key Advantages** | - Simple integration with supported LLMs
    - Good for production environments using major LLM providers | - Most flexible output structure options
    - Fine-grained control over generation
    - Strong open-source model support | - Excellent for local deployment
    - Simple setup and usage
    - Built-in model serving |\n", - "| **Use Case Focus** | Enterprise applications using commercial LLMs | Applications requiring precise output control or using open-source models | Local deployment and/or experimentation |\n", - "| **Complexity Level** | Medium - requires understanding of LangChain abstractions | Low | Low |\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Best Practices\n", - "\n", - "\n", - "* **Clear Schema Definition**: Define the desired output structure clearly. This can be done in several ways including schemas, types, or Pydantic models as appropriate. This ensures the LLM knows exactly what format is expected.\n", - "\n", - "* **Descriptive Naming**: Use meaningful names for fields and elements in your schema. This makes the output more understandable and easier to work with.\n", - "\n", - "* **Detailed Prompting**: Guide the LLM with well-crafted prompts that include examples and clear instructions. A well-structured prompt improves the chances of getting the desired output.\n", - "\n", - "* **Integration**: If you are connecting the model to tools, functions, data, etc. in your system, then you are highly encouraged to use a typed structured output (e.g. Pydantic models) to ensure the model's output can be processed correctly by downstream systems.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Research and Ongoing Debate\n", - "\n", - "The use of structured output for Large Language Models (LLMs) is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.\n", - "\n", - "\n", - "There is some evidence indicating that LLMs may have bias in their handling of different output formats {cite}`long2024llms`. The study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats. The researchers attributed these biases to the models' underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON's prevalence in training data, highlighting how a format's popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.\n", - "\n", - "Recent research \"Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models\" {cite}`tam2024letspeakfreelystudy` suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence {cite}`aider2024codejson` suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:\n", - "\n", - "* **Potential performance degradation:** Enforcing structured output, especially through constrained decoding methods like JSON-mode, can negatively impact an LLM's reasoning abilities. This is particularly evident in tasks that require multi-step reasoning or complex thought processes.\n", - "\n", - "* **Overly restrictive schemas:** Imposing strict schemas can limit the expressiveness of LLM outputs and may hinder their ability to generate creative or nuanced responses. In certain cases, the strictness of the schema might outweigh the benefits of structured output.\n", - "\n", - "* **Increased complexity in prompt engineering:** Crafting prompts that effectively guide LLMs to generate structured outputs while maintaining performance can be challenging. It often requires careful consideration of the schema, the task instructions, and the desired level of detail in the response.\n", - "\n", - "On the other hand, those findings are not without criticism. The .txt team challenges the work of {cite}`tam2024letspeakfreelystudy`. The rebuttal argues that **structured generation, when done correctly, actually *improves* performance**.\n", - "\n", - "\n", - "```{figure} ../_static/structured_output/rebuttal.png\n", - "---\n", - "name: structured_vs_unstructured\n", - "alt: Structured vs Unstructured Results by .txt team\n", - "scale: 50%\n", - "align: center\n", - "---\n", - "Structured vs Unstructured Results by .txt team.\n", - "```\n", - "\n", - "The .txt team presents compelling evidence through their reproduction of the paper's experiments. While their unstructured results align with the original paper's findings, their structured results paint a dramatically different picture - demonstrating that structured generation actually improves performance (see {numref}`structured_vs_unstructured`). The team has made their experimental notebooks publicly available on GitHub for independent verification {cite}`dottxt2024demos`.\n", - "\n", - "\n", - ".txt team identifies several flaws in the methodology of \"Let Me Speak Freely?\" that they believe led to inaccurate conclusions:\n", - "\n", - "* The paper finds that structured output improves performance on classification tasks but doesn't reconcile this finding with its overall negative conclusion about structured output. \n", - "* The prompts used for unstructured generation were different from those used for structured generation, making the comparison uneven. \n", - "* The prompts used for structured generation, particularly in JSON-mode, didn't provide the LLM with sufficient information to properly complete the task. \n", - "* The paper conflates \"structured generation\" with \"JSON-mode\", when they are not the same thing. \n", - "\n", - "It is important to note that while .txt provides a compelling and verifiable argument in favor of (proper) structured output generation in LLMs further research and exploration are needed to comprehensively understand the nuances and trade-offs involved in using structured output for various LLM tasks and applications.\n", - "\n", - "In summary, the debate surrounding structured output highlights the ongoing challenges in balancing LLM capabilities with real-world application requirements. While structured outputs offer clear benefits in parsing, robustness, and integration, their potential impact on performance, particularly in reasoning tasks is a topic of ongoing debate. \n", - "\n", - "The ideal approach likely involves a nuanced strategy that considers the specific task, the desired level of structure, and the available LLM capabilities. Further research and development efforts are needed to mitigate potential drawbacks and unlock the full potential of LLMs for a wider range of applications. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights. \n", - "\n", - "## Acknowledgements\n", - "\n", - "We would like to thank [Cameron Pfiffer](https://x.com/cameron_pfiffer) from the .txt team for his insightful review and feedback.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Citation\n", - "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", - "\n", - "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", - "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n", - "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n", - "\n", - "```\n", - "@misc{tharsistpsouza2024tamingllms,\n", - " author = {Tharsis T. P. Souza},\n", - " title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n", - " year = {2024},\n", - " chapter = {Wrestling with Structured Output},\n", - " journal = {GitHub repository},\n", - " url = {https://github.com/souzatharsis/tamingLLMs)\n", - "}\n", - "```\n", - "## References\n", - "```{bibliography}\n", - ":filter: docname in docnames\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tamingllms/notebooks/taming_utils.py b/tamingllms/notebooks/taming_utils.py index a70db65..4ed012a 100644 --- a/tamingllms/notebooks/taming_utils.py +++ b/tamingllms/notebooks/taming_utils.py @@ -350,3 +350,26 @@ def calculate_validator_metrics(scoring_results, scoring_prompts, bad_sources, g # Create a DataFrame from the results results_df = pd.DataFrame(results) return results_df + + +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +def load_gguf(model_name="bartowski/SmolLM2-135M-Instruct-GGUF", + gguf_file="SmolLM2-135M-Instruct-Q2_K.gguf", + device_map="auto"): + """ + Load a pre-trained language model. + + Args: + model_name (str): Name/path of the model to load + gguf_file (str): Name of the GGUF file to use + device_map (str): Device mapping strategy + + Returns: + AutoModelForCausalLM: Loaded model + """ + model = AutoModelForCausalLM.from_pretrained(model_name, + gguf_file=gguf_file, + device_map=device_map) + return model diff --git a/tamingllms/references.bib b/tamingllms/references.bib index 3e50dc3..e5a56fb 100644 --- a/tamingllms/references.bib +++ b/tamingllms/references.bib @@ -702,6 +702,16 @@ @misc{deshpande2024glidergradingllminteractions url={https://arxiv.org/abs/2412.14140}, } +@misc{unsloth2024llama3, + title={Llama-3.3-70B-Instruct-GGUF}, + author={{Unsloth}}, + year={2024}, + howpublished={Hugging Face Model}, + url={https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF}, + note={GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model} +} + + @misc{nvidia2024logitsprocessorzoo, title={Logits Processor Zoo}, author={{NVIDIA}}, @@ -750,6 +760,26 @@ @misc{salesforce2024wikitext note={Large-scale dataset derived from verified Good and Featured articles on Wikipedia} } +@misc{wang20241bitaiinfra11, + title={1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs}, + author={Jinheng Wang and Hansong Zhou and Ting Song and Shaoguang Mao and Shuming Ma and Hongyu Wang and Yan Xia and Furu Wei}, + year={2024}, + eprint={2410.16144}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2410.16144}, +} + +@misc{a16z2024llmflation, + title={LLMflation: Understanding and Mitigating LLM Inference Cost}, + author={{Andreessen Horowitz}}, + year={2024}, + howpublished={Blog Post}, + url={https://a16z.com/llmflation-llm-inference-cost/}, + note={Analysis of LLM inference costs and strategies for optimization} +} + + @misc{huggingface2024quantization, title={GGUF Quantization Types}, author={{Hugging Face}},

    Table 4.1 Structured Output Frameworks Comparison