diff --git a/README.md b/README.md
index 1f34fcb..06524f5 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,8 @@ Abstract: **The current discourse around Large Language Models (LLMs) tends to f
 | Chapter 4: Safety              | [html](https://www.souzatharsis.com/tamingLLMs/notebooks/safety.html)  |  [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/safety.ipynb) | *Ready for Review*     |
 | Chapter 5: Preference-Based Alignment     | [html](https://www.souzatharsis.com/tamingLLMs/notebooks/alignment.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/alignment.ipynb) | *Ready for Review*     |
 | Chapter 6: Local LLMs in Practice |   [html](https://www.souzatharsis.com/tamingLLMs/notebooks/local.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/local.ipynb) | *Ready for Review*     |
-| Chapter 7: The Cost Factor                |              |               |                      |
-| Chapter 8: Frontiers                |              |               |     WIP            |
+| Chapter 7: The Falling Cost Paradox               |              |               |        WIP              |
+| Chapter 8: Frontiers                |              |               |                 |
 | Appendix A: Tools and Resources           |              |               |                      |
 
 
diff --git a/poetry.lock b/poetry.lock
index 69cc5dc..9067dd2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -13,13 +13,13 @@ files = [
 
 [[package]]
 name = "accelerate"
-version = "1.1.1"
+version = "1.2.1"
 description = "Accelerate"
 optional = false
 python-versions = ">=3.9.0"
 files = [
-    {file = "accelerate-1.1.1-py3-none-any.whl", hash = "sha256:61edd81762131b8d4bede008643fa1e1f3bf59bec710ebda9771443e24feae02"},
-    {file = "accelerate-1.1.1.tar.gz", hash = "sha256:0d39dfac557052bc735eb2703a0e87742879e1e40b88af8a2f9a93233d4cd7db"},
+    {file = "accelerate-1.2.1-py3-none-any.whl", hash = "sha256:be1cbb958cf837e7cdfbde46b812964b1b8ae94c9c7d94d921540beafcee8ddf"},
+    {file = "accelerate-1.2.1.tar.gz", hash = "sha256:03e161fc69d495daf2b9b5c8d5b43d06e2145520c04727b5bda56d49f1a43ab5"},
 ]
 
 [package.dependencies]
@@ -60,18 +60,6 @@ pygments = ">=1.5"
 dev = ["pillow", "pkginfo (>=1.10)", "playwright", "pre-commit", "setuptools", "twine (>=5.0)"]
 tests = ["hypothesis", "pytest"]
 
-[[package]]
-name = "aenum"
-version = "3.1.15"
-description = "Advanced Enumerations (compatible with Python's stdlib Enum), NamedTuples, and NamedConstants"
-optional = false
-python-versions = "*"
-files = [
-    {file = "aenum-3.1.15-py2-none-any.whl", hash = "sha256:27b1710b9d084de6e2e695dab78fe9f269de924b51ae2850170ee7e1ca6288a5"},
-    {file = "aenum-3.1.15-py3-none-any.whl", hash = "sha256:e0dfaeea4c2bd362144b87377e2c61d91958c5ed0b4daf89cb6f45ae23af6288"},
-    {file = "aenum-3.1.15.tar.gz", hash = "sha256:8cbd76cd18c4f870ff39b24284d3ea028fbe8731a58df3aa581e434c575b9559"},
-]
-
 [[package]]
 name = "aiohappyeyeballs"
 version = "2.4.3"
@@ -590,17 +578,6 @@ files = [
 [package.dependencies]
 pycparser = "*"
 
-[[package]]
-name = "chardet"
-version = "5.2.0"
-description = "Universal encoding detector for Python 3"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
-    {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
-]
-
 [[package]]
 name = "charset-normalizer"
 version = "3.4.0"
@@ -953,25 +930,6 @@ diagnostics = ["bokeh (>=3.1.0)", "jinja2 (>=2.10.3)"]
 distributed = ["distributed (==2024.11.2)"]
 test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist"]
 
-[[package]]
-name = "dataproperty"
-version = "1.0.1"
-description = "Python library for extract property from data."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "DataProperty-1.0.1-py3-none-any.whl", hash = "sha256:0b8b07d4fb6453fcf975b53d35dea41f3cfd69c9d79b5010c3cf224ff0407a7a"},
-    {file = "DataProperty-1.0.1.tar.gz", hash = "sha256:723e5729fa6e885e127a771a983ee1e0e34bb141aca4ffe1f0bfa7cde34650a4"},
-]
-
-[package.dependencies]
-mbstrdecoder = ">=1.0.0,<2"
-typepy = {version = ">=1.2.0,<2", extras = ["datetime"]}
-
-[package.extras]
-logging = ["loguru (>=0.4.1,<1)"]
-test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.3)", "tcolorpy (>=0.1.2)"]
-
 [[package]]
 name = "datasets"
 version = "3.1.0"
@@ -1508,6 +1466,23 @@ files = [
 [package.dependencies]
 attrs = ">=19.3"
 
+[[package]]
+name = "gguf"
+version = "0.13.0"
+description = "Read and write ML models in GGUF for GGML"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "gguf-0.13.0-py3-none-any.whl", hash = "sha256:4e92a73b827c4618b55a615547b5120a904c2b34d02d5179dad5093c680623f6"},
+    {file = "gguf-0.13.0.tar.gz", hash = "sha256:9f29ccbb21fc6b6cf8b4741e88aaa563f0a1c748c26b5f7e304bb48612bf41b8"},
+]
+
+[package.dependencies]
+numpy = ">=1.17"
+pyyaml = ">=5.1"
+sentencepiece = ">=0.1.98,<=0.2.0"
+tqdm = ">=4.27"
+
 [[package]]
 name = "gitdb"
 version = "4.0.11"
@@ -2538,54 +2513,6 @@ files = [
     {file = "latexcodec-3.0.0.tar.gz", hash = "sha256:917dc5fe242762cc19d963e6548b42d63a118028cdd3361d62397e3b638b6bc5"},
 ]
 
-[[package]]
-name = "lighteval"
-version = "0.6.2"
-description = "A lightweight and configurable evaluation package"
-optional = false
-python-versions = ">=3.10"
-files = [
-    {file = "lighteval-0.6.2-py3-none-any.whl", hash = "sha256:1832fff4ca76d4ec617b5242c60e5dcaa1df8966f9b8352af105386fb6c910ba"},
-    {file = "lighteval-0.6.2.tar.gz", hash = "sha256:e48caf17c4136f973b5b9ee0692171b797692e068bd6c8efed14657b81500956"},
-]
-
-[package.dependencies]
-accelerate = {version = "*", optional = true, markers = "extra == \"accelerate\""}
-aenum = "3.1.15"
-colorama = "*"
-datasets = ">=2.14.0"
-fsspec = ">=2023.12.2"
-GitPython = ">=3.1.41"
-huggingface-hub = ">=0.23.0"
-nltk = "3.9.1"
-protobuf = "==3.20.*"
-pycountry = "*"
-pytablewriter = "*"
-rouge-score = "0.1.2"
-sacrebleu = "*"
-scikit-learn = "*"
-sentencepiece = ">=0.1.99"
-spacy = "3.7.2"
-termcolor = "2.3.0"
-torch = ">=2.0,<2.5"
-transformers = ">=4.38.0"
-
-[package.extras]
-accelerate = ["accelerate"]
-adapters = ["peft (==0.3.0)"]
-dev = ["lighteval[accelerate,multilingual,quality,tests]"]
-extended-tasks = ["langdetect", "openai", "tiktoken"]
-multilingual = ["jieba", "pyvi", "spacy[ja,ko,th]", "stanza"]
-nanotron = ["nanotron", "tensorboardX"]
-optimum = ["optimum (==1.12.0)"]
-quality = ["pre-commit", "ruff (==v0.2.2)"]
-quantization = ["auto-gptq (>=0.4.2)", "bitsandbytes (>=0.41.0)"]
-s3 = ["s3fs"]
-tensorboardx = ["tensorboardX"]
-tests = ["pytest (==7.4.0)"]
-tgi = ["text-generation (==0.6.0)"]
-vllm = ["more-itertools", "ray", "vllm"]
-
 [[package]]
 name = "linkify-it-py"
 version = "2.0.3"
@@ -2699,160 +2626,6 @@ files = [
     {file = "locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632"},
 ]
 
-[[package]]
-name = "lxml"
-version = "5.3.0"
-description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"},
-    {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:501d0d7e26b4d261fca8132854d845e4988097611ba2531408ec91cf3fd9d20a"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66442c2546446944437df74379e9cf9e9db353e61301d1a0e26482f43f0dd8"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e41506fec7a7f9405b14aa2d5c8abbb4dbbd09d88f9496958b6d00cb4d45330"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f7d4a670107d75dfe5ad080bed6c341d18c4442f9378c9f58e5851e86eb79965"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41ce1f1e2c7755abfc7e759dc34d7d05fd221723ff822947132dc934d122fe22"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:44264ecae91b30e5633013fb66f6ddd05c006d3e0e884f75ce0b4755b3e3847b"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:3c174dc350d3ec52deb77f2faf05c439331d6ed5e702fc247ccb4e6b62d884b7"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:2dfab5fa6a28a0b60a20638dc48e6343c02ea9933e3279ccb132f555a62323d8"},
-    {file = "lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b1c8c20847b9f34e98080da785bb2336ea982e7f913eed5809e5a3c872900f32"},
-    {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2c86bf781b12ba417f64f3422cfc302523ac9cd1d8ae8c0f92a1c66e56ef2e86"},
-    {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:c162b216070f280fa7da844531169be0baf9ccb17263cf5a8bf876fcd3117fa5"},
-    {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:36aef61a1678cb778097b4a6eeae96a69875d51d1e8f4d4b491ab3cfb54b5a03"},
-    {file = "lxml-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f65e5120863c2b266dbcc927b306c5b78e502c71edf3295dfcb9501ec96e5fc7"},
-    {file = "lxml-5.3.0-cp310-cp310-win32.whl", hash = "sha256:ef0c1fe22171dd7c7c27147f2e9c3e86f8bdf473fed75f16b0c2e84a5030ce80"},
-    {file = "lxml-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:052d99051e77a4f3e8482c65014cf6372e61b0a6f4fe9edb98503bb5364cfee3"},
-    {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:74bcb423462233bc5d6066e4e98b0264e7c1bed7541fff2f4e34fe6b21563c8b"},
-    {file = "lxml-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a3d819eb6f9b8677f57f9664265d0a10dd6551d227afb4af2b9cd7bdc2ccbf18"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b8f5db71b28b8c404956ddf79575ea77aa8b1538e8b2ef9ec877945b3f46442"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c3406b63232fc7e9b8783ab0b765d7c59e7c59ff96759d8ef9632fca27c7ee4"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ecdd78ab768f844c7a1d4a03595038c166b609f6395e25af9b0f3f26ae1230f"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:168f2dfcfdedf611eb285efac1516c8454c8c99caf271dccda8943576b67552e"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa617107a410245b8660028a7483b68e7914304a6d4882b5ff3d2d3eb5948d8c"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:69959bd3167b993e6e710b99051265654133a98f20cec1d9b493b931942e9c16"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:bd96517ef76c8654446fc3db9242d019a1bb5fe8b751ba414765d59f99210b79"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ab6dd83b970dc97c2d10bc71aa925b84788c7c05de30241b9e96f9b6d9ea3080"},
-    {file = "lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eec1bb8cdbba2925bedc887bc0609a80e599c75b12d87ae42ac23fd199445654"},
-    {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a7095eeec6f89111d03dabfe5883a1fd54da319c94e0fb104ee8f23616b572d"},
-    {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6f651ebd0b21ec65dfca93aa629610a0dbc13dbc13554f19b0113da2e61a4763"},
-    {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f422a209d2455c56849442ae42f25dbaaba1c6c3f501d58761c619c7836642ec"},
-    {file = "lxml-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:62f7fdb0d1ed2065451f086519865b4c90aa19aed51081979ecd05a21eb4d1be"},
-    {file = "lxml-5.3.0-cp311-cp311-win32.whl", hash = "sha256:c6379f35350b655fd817cd0d6cbeef7f265f3ae5fedb1caae2eb442bbeae9ab9"},
-    {file = "lxml-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c52100e2c2dbb0649b90467935c4b0de5528833c76a35ea1a2691ec9f1ee7a1"},
-    {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e99f5507401436fdcc85036a2e7dc2e28d962550afe1cbfc07c40e454256a859"},
-    {file = "lxml-5.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:384aacddf2e5813a36495233b64cb96b1949da72bef933918ba5c84e06af8f0e"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a216bf6afaf97c263b56371434e47e2c652d215788396f60477540298218f"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65ab5685d56914b9a2a34d67dd5488b83213d680b0c5d10b47f81da5a16b0b0e"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aac0bbd3e8dd2d9c45ceb82249e8bdd3ac99131a32b4d35c8af3cc9db1657179"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b369d3db3c22ed14c75ccd5af429086f166a19627e84a8fdade3f8f31426e52a"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24037349665434f375645fa9d1f5304800cec574d0310f618490c871fd902b3"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62d172f358f33a26d6b41b28c170c63886742f5b6772a42b59b4f0fa10526cb1"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:c1f794c02903c2824fccce5b20c339a1a14b114e83b306ff11b597c5f71a1c8d"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:5d6a6972b93c426ace71e0be9a6f4b2cfae9b1baed2eed2006076a746692288c"},
-    {file = "lxml-5.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3879cc6ce938ff4eb4900d901ed63555c778731a96365e53fadb36437a131a99"},
-    {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:74068c601baff6ff021c70f0935b0c7bc528baa8ea210c202e03757c68c5a4ff"},
-    {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ecd4ad8453ac17bc7ba3868371bffb46f628161ad0eefbd0a855d2c8c32dd81a"},
-    {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7e2f58095acc211eb9d8b5771bf04df9ff37d6b87618d1cbf85f92399c98dae8"},
-    {file = "lxml-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e63601ad5cd8f860aa99d109889b5ac34de571c7ee902d6812d5d9ddcc77fa7d"},
-    {file = "lxml-5.3.0-cp312-cp312-win32.whl", hash = "sha256:17e8d968d04a37c50ad9c456a286b525d78c4a1c15dd53aa46c1d8e06bf6fa30"},
-    {file = "lxml-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:c1a69e58a6bb2de65902051d57fde951febad631a20a64572677a1052690482f"},
-    {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c72e9563347c7395910de6a3100a4840a75a6f60e05af5e58566868d5eb2d6a"},
-    {file = "lxml-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e92ce66cd919d18d14b3856906a61d3f6b6a8500e0794142338da644260595cd"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d04f064bebdfef9240478f7a779e8c5dc32b8b7b0b2fc6a62e39b928d428e51"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c2fb570d7823c2bbaf8b419ba6e5662137f8166e364a8b2b91051a1fb40ab8b"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c120f43553ec759f8de1fee2f4794452b0946773299d44c36bfe18e83caf002"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:562e7494778a69086f0312ec9689f6b6ac1c6b65670ed7d0267e49f57ffa08c4"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:423b121f7e6fa514ba0c7918e56955a1d4470ed35faa03e3d9f0e3baa4c7e492"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c00f323cc00576df6165cc9d21a4c21285fa6b9989c5c39830c3903dc4303ef3"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:1fdc9fae8dd4c763e8a31e7630afef517eab9f5d5d31a278df087f307bf601f4"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:658f2aa69d31e09699705949b5fc4719cbecbd4a97f9656a232e7d6c7be1a367"},
-    {file = "lxml-5.3.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1473427aff3d66a3fa2199004c3e601e6c4500ab86696edffdbc84954c72d832"},
-    {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a87de7dd873bf9a792bf1e58b1c3887b9264036629a5bf2d2e6579fe8e73edff"},
-    {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0d7b36afa46c97875303a94e8f3ad932bf78bace9e18e603f2085b652422edcd"},
-    {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:cf120cce539453ae086eacc0130a324e7026113510efa83ab42ef3fcfccac7fb"},
-    {file = "lxml-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:df5c7333167b9674aa8ae1d4008fa4bc17a313cc490b2cca27838bbdcc6bb15b"},
-    {file = "lxml-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c802e1c2ed9f0c06a65bc4ed0189d000ada8049312cfeab6ca635e39c9608957"},
-    {file = "lxml-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:406246b96d552e0503e17a1006fd27edac678b3fcc9f1be71a2f94b4ff61528d"},
-    {file = "lxml-5.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:8f0de2d390af441fe8b2c12626d103540b5d850d585b18fcada58d972b74a74e"},
-    {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1afe0a8c353746e610bd9031a630a95bcfb1a720684c3f2b36c4710a0a96528f"},
-    {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56b9861a71575f5795bde89256e7467ece3d339c9b43141dbdd54544566b3b94"},
-    {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:9fb81d2824dff4f2e297a276297e9031f46d2682cafc484f49de182aa5e5df99"},
-    {file = "lxml-5.3.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2c226a06ecb8cdef28845ae976da407917542c5e6e75dcac7cc33eb04aaeb237"},
-    {file = "lxml-5.3.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:7d3d1ca42870cdb6d0d29939630dbe48fa511c203724820fc0fd507b2fb46577"},
-    {file = "lxml-5.3.0-cp36-cp36m-win32.whl", hash = "sha256:094cb601ba9f55296774c2d57ad68730daa0b13dc260e1f941b4d13678239e70"},
-    {file = "lxml-5.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:eafa2c8658f4e560b098fe9fc54539f86528651f61849b22111a9b107d18910c"},
-    {file = "lxml-5.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cb83f8a875b3d9b458cada4f880fa498646874ba4011dc974e071a0a84a1b033"},
-    {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25f1b69d41656b05885aa185f5fdf822cb01a586d1b32739633679699f220391"},
-    {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23e0553b8055600b3bf4a00b255ec5c92e1e4aebf8c2c09334f8368e8bd174d6"},
-    {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ada35dd21dc6c039259596b358caab6b13f4db4d4a7f8665764d616daf9cc1d"},
-    {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:81b4e48da4c69313192d8c8d4311e5d818b8be1afe68ee20f6385d0e96fc9512"},
-    {file = "lxml-5.3.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:2bc9fd5ca4729af796f9f59cd8ff160fe06a474da40aca03fcc79655ddee1a8b"},
-    {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:07da23d7ee08577760f0a71d67a861019103e4812c87e2fab26b039054594cc5"},
-    {file = "lxml-5.3.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:ea2e2f6f801696ad7de8aec061044d6c8c0dd4037608c7cab38a9a4d316bfb11"},
-    {file = "lxml-5.3.0-cp37-cp37m-win32.whl", hash = "sha256:5c54afdcbb0182d06836cc3d1be921e540be3ebdf8b8a51ee3ef987537455f84"},
-    {file = "lxml-5.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f2901429da1e645ce548bf9171784c0f74f0718c3f6150ce166be39e4dd66c3e"},
-    {file = "lxml-5.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c56a1d43b2f9ee4786e4658c7903f05da35b923fb53c11025712562d5cc02753"},
-    {file = "lxml-5.3.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ee8c39582d2652dcd516d1b879451500f8db3fe3607ce45d7c5957ab2596040"},
-    {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdf3a3059611f7585a78ee10399a15566356116a4288380921a4b598d807a22"},
-    {file = "lxml-5.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:146173654d79eb1fc97498b4280c1d3e1e5d58c398fa530905c9ea50ea849b22"},
-    {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0a7056921edbdd7560746f4221dca89bb7a3fe457d3d74267995253f46343f15"},
-    {file = "lxml-5.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9e4b47ac0f5e749cfc618efdf4726269441014ae1d5583e047b452a32e221920"},
-    {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f914c03e6a31deb632e2daa881fe198461f4d06e57ac3d0e05bbcab8eae01945"},
-    {file = "lxml-5.3.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:213261f168c5e1d9b7535a67e68b1f59f92398dd17a56d934550837143f79c42"},
-    {file = "lxml-5.3.0-cp38-cp38-win32.whl", hash = "sha256:218c1b2e17a710e363855594230f44060e2025b05c80d1f0661258142b2add2e"},
-    {file = "lxml-5.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:315f9542011b2c4e1d280e4a20ddcca1761993dda3afc7a73b01235f8641e903"},
-    {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1ffc23010330c2ab67fac02781df60998ca8fe759e8efde6f8b756a20599c5de"},
-    {file = "lxml-5.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2b3778cb38212f52fac9fe913017deea2fdf4eb1a4f8e4cfc6b009a13a6d3fcc"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b0c7a688944891086ba192e21c5229dea54382f4836a209ff8d0a660fac06be"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:747a3d3e98e24597981ca0be0fd922aebd471fa99d0043a3842d00cdcad7ad6a"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86a6b24b19eaebc448dc56b87c4865527855145d851f9fc3891673ff97950540"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b11a5d918a6216e521c715b02749240fb07ae5a1fefd4b7bf12f833bc8b4fe70"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b87753c784d6acb8a25b05cb526c3406913c9d988d51f80adecc2b0775d6aa"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:109fa6fede314cc50eed29e6e56c540075e63d922455346f11e4d7a036d2b8cf"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:02ced472497b8362c8e902ade23e3300479f4f43e45f4105c85ef43b8db85229"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:6b038cc86b285e4f9fea2ba5ee76e89f21ed1ea898e287dc277a25884f3a7dfe"},
-    {file = "lxml-5.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:7437237c6a66b7ca341e868cda48be24b8701862757426852c9b3186de1da8a2"},
-    {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7f41026c1d64043a36fda21d64c5026762d53a77043e73e94b71f0521939cc71"},
-    {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:482c2f67761868f0108b1743098640fbb2a28a8e15bf3f47ada9fa59d9fe08c3"},
-    {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:1483fd3358963cc5c1c9b122c80606a3a79ee0875bcac0204149fa09d6ff2727"},
-    {file = "lxml-5.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2dec2d1130a9cda5b904696cec33b2cfb451304ba9081eeda7f90f724097300a"},
-    {file = "lxml-5.3.0-cp39-cp39-win32.whl", hash = "sha256:a0eabd0a81625049c5df745209dc7fcef6e2aea7793e5f003ba363610aa0a3ff"},
-    {file = "lxml-5.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:89e043f1d9d341c52bf2af6d02e6adde62e0a46e6755d5eb60dc6e4f0b8aeca2"},
-    {file = "lxml-5.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7b1cd427cb0d5f7393c31b7496419da594fe600e6fdc4b105a54f82405e6626c"},
-    {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51806cfe0279e06ed8500ce19479d757db42a30fd509940b1701be9c86a5ff9a"},
-    {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee70d08fd60c9565ba8190f41a46a54096afa0eeb8f76bd66f2c25d3b1b83005"},
-    {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:8dc2c0395bea8254d8daebc76dcf8eb3a95ec2a46fa6fae5eaccee366bfe02ce"},
-    {file = "lxml-5.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6ba0d3dcac281aad8a0e5b14c7ed6f9fa89c8612b47939fc94f80b16e2e9bc83"},
-    {file = "lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba"},
-    {file = "lxml-5.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:94d6c3782907b5e40e21cadf94b13b0842ac421192f26b84c45f13f3c9d5dc27"},
-    {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c300306673aa0f3ed5ed9372b21867690a17dba38c68c44b287437c362ce486b"},
-    {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d9b952e07aed35fe2e1a7ad26e929595412db48535921c5013edc8aa4a35ce"},
-    {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:01220dca0d066d1349bd6a1726856a78f7929f3878f7e2ee83c296c69495309e"},
-    {file = "lxml-5.3.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:2d9b8d9177afaef80c53c0a9e30fa252ff3036fb1c6494d427c066a4ce6a282f"},
-    {file = "lxml-5.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:20094fc3f21ea0a8669dc4c61ed7fa8263bd37d97d93b90f28fc613371e7a875"},
-    {file = "lxml-5.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ace2c2326a319a0bb8a8b0e5b570c764962e95818de9f259ce814ee666603f19"},
-    {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92e67a0be1639c251d21e35fe74df6bcc40cba445c2cda7c4a967656733249e2"},
-    {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5350b55f9fecddc51385463a4f67a5da829bc741e38cf689f38ec9023f54ab"},
-    {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c1fefd7e3d00921c44dc9ca80a775af49698bbfd92ea84498e56acffd4c5469"},
-    {file = "lxml-5.3.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:71a8dd38fbd2f2319136d4ae855a7078c69c9a38ae06e0c17c73fd70fc6caad8"},
-    {file = "lxml-5.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:97acf1e1fd66ab53dacd2c35b319d7e548380c2e9e8c54525c6e76d21b1ae3b1"},
-    {file = "lxml-5.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:68934b242c51eb02907c5b81d138cb977b2129a0a75a8f8b60b01cb8586c7b21"},
-    {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b710bc2b8292966b23a6a0121f7a6c51d45d2347edcc75f016ac123b8054d3f2"},
-    {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18feb4b93302091b1541221196a2155aa296c363fd233814fa11e181adebc52f"},
-    {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3eb44520c4724c2e1a57c0af33a379eee41792595023f367ba3952a2d96c2aab"},
-    {file = "lxml-5.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:609251a0ca4770e5a8768ff902aa02bf636339c5a93f9349b48eb1f606f7f3e9"},
-    {file = "lxml-5.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:516f491c834eb320d6c843156440fe7fc0d50b33e44387fcec5b02f0bc118a4c"},
-    {file = "lxml-5.3.0.tar.gz", hash = "sha256:4e109ca30d1edec1ac60cdbe341905dc3b8f55b16855e03a54aaf59e51ec8c6f"},
-]
-
-[package.extras]
-cssselect = ["cssselect (>=0.7)"]
-html-clean = ["lxml-html-clean"]
-html5 = ["html5lib"]
-htmlsoup = ["BeautifulSoup4"]
-source = ["Cython (>=3.0.11)"]
-
 [[package]]
 name = "marisa-trie"
 version = "1.2.1"
@@ -3115,23 +2888,6 @@ files = [
 [package.dependencies]
 traitlets = "*"
 
-[[package]]
-name = "mbstrdecoder"
-version = "1.1.3"
-description = "mbstrdecoder is a Python library for multi-byte character string decoder"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "mbstrdecoder-1.1.3-py3-none-any.whl", hash = "sha256:d66c1ed3f2dc4e7c5d87cd44a75be10bc5af4250f95b38bbaedd7851308ce938"},
-    {file = "mbstrdecoder-1.1.3.tar.gz", hash = "sha256:dcfd2c759322eb44fe193a9e0b1b86c5b87f3ec5ea8e1bb43b3e9ae423f1e8fe"},
-]
-
-[package.dependencies]
-chardet = ">=3.0.4,<6"
-
-[package.extras]
-test = ["Faker (>=1.0.2)", "pytest (>=6.0.1)", "pytest-md-report (>=0.1)"]
-
 [[package]]
 name = "mdit-py-plugins"
 version = "0.4.2"
@@ -3722,46 +3478,50 @@ files = [
 
 [[package]]
 name = "nvidia-cublas-cu12"
-version = "12.1.3.1"
+version = "12.4.5.8"
 description = "CUBLAS native runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"},
-    {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"},
+    {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"},
+    {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"},
+    {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"},
 ]
 
 [[package]]
 name = "nvidia-cuda-cupti-cu12"
-version = "12.1.105"
+version = "12.4.127"
 description = "CUDA profiling tools runtime libs."
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"},
-    {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"},
+    {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"},
+    {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"},
+    {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"},
 ]
 
 [[package]]
 name = "nvidia-cuda-nvrtc-cu12"
-version = "12.1.105"
+version = "12.4.127"
 description = "NVRTC native runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"},
-    {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"},
+    {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"},
+    {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"},
+    {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"},
 ]
 
 [[package]]
 name = "nvidia-cuda-runtime-cu12"
-version = "12.1.105"
+version = "12.4.127"
 description = "CUDA Runtime native Libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"},
-    {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"},
+    {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"},
+    {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"},
+    {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"},
 ]
 
 [[package]]
@@ -3780,35 +3540,41 @@ nvidia-cublas-cu12 = "*"
 
 [[package]]
 name = "nvidia-cufft-cu12"
-version = "11.0.2.54"
+version = "11.2.1.3"
 description = "CUFFT native runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"},
-    {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"},
+    {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"},
+    {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"},
+    {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"},
 ]
 
+[package.dependencies]
+nvidia-nvjitlink-cu12 = "*"
+
 [[package]]
 name = "nvidia-curand-cu12"
-version = "10.3.2.106"
+version = "10.3.5.147"
 description = "CURAND native runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"},
-    {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"},
+    {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"},
+    {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"},
+    {file = "nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"},
 ]
 
 [[package]]
 name = "nvidia-cusolver-cu12"
-version = "11.4.5.107"
+version = "11.6.1.9"
 description = "CUDA solver native runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"},
-    {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"},
+    {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"},
+    {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"},
+    {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"},
 ]
 
 [package.dependencies]
@@ -3818,13 +3584,14 @@ nvidia-nvjitlink-cu12 = "*"
 
 [[package]]
 name = "nvidia-cusparse-cu12"
-version = "12.1.0.106"
+version = "12.3.1.170"
 description = "CUSPARSE native runtime libraries"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"},
-    {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"},
+    {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"},
+    {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"},
+    {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"},
 ]
 
 [package.dependencies]
@@ -3832,13 +3599,12 @@ nvidia-nvjitlink-cu12 = "*"
 
 [[package]]
 name = "nvidia-nccl-cu12"
-version = "2.20.5"
+version = "2.21.5"
 description = "NVIDIA Collective Communication Library (NCCL) Runtime"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"},
-    {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"},
+    {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"},
 ]
 
 [[package]]
@@ -3855,13 +3621,14 @@ files = [
 
 [[package]]
 name = "nvidia-nvtx-cu12"
-version = "12.1.105"
+version = "12.4.127"
 description = "NVIDIA Tools Extension"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"},
-    {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"},
+    {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"},
+    {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"},
+    {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"},
 ]
 
 [[package]]
@@ -4293,22 +4060,6 @@ toolz = "*"
 [package.extras]
 complete = ["blosc", "numpy (>=1.20.0)", "pandas (>=1.3)", "pyzmq"]
 
-[[package]]
-name = "pathvalidate"
-version = "3.2.1"
-description = "pathvalidate is a Python library to sanitize/validate a string such as filenames/file-paths/etc."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "pathvalidate-3.2.1-py3-none-any.whl", hash = "sha256:9a6255eb8f63c9e2135b9be97a5ce08f10230128c4ae7b3e935378b82b22c4c9"},
-    {file = "pathvalidate-3.2.1.tar.gz", hash = "sha256:f5d07b1e2374187040612a1fcd2bcb2919f8db180df254c9581bb90bf903377d"},
-]
-
-[package.extras]
-docs = ["Sphinx (>=2.4)", "sphinx-rtd-theme (>=1.2.2)", "urllib3 (<2)"]
-readme = ["path (>=13,<17)", "readmemaker (>=1.1.0)"]
-test = ["Faker (>=1.0.8)", "allpairspy (>=2)", "click (>=6.2)", "pytest (>=6.0.1)", "pytest-md-report (>=0.6.2)"]
-
 [[package]]
 name = "pexpect"
 version = "4.9.0"
@@ -4442,25 +4193,6 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
 test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
 type = ["mypy (>=1.11.2)"]
 
-[[package]]
-name = "portalocker"
-version = "3.0.0"
-description = "Wraps the portalocker recipe for easy usage"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "portalocker-3.0.0-py3-none-any.whl", hash = "sha256:211916b539a0dc3c128a3d9e86893ecfefec5379c4ff684e798f0a00f99db406"},
-    {file = "portalocker-3.0.0.tar.gz", hash = "sha256:21f535de2e7a82c94c130c054adb5c7421d480d5619d61073996e2f89bcb879b"},
-]
-
-[package.dependencies]
-pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""}
-
-[package.extras]
-docs = ["sphinx (>=1.7.1)"]
-redis = ["redis"]
-tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-timeout (>=2.1.0)", "redis", "sphinx (>=6.0.0)", "types-redis"]
-
 [[package]]
 name = "preshed"
 version = "3.0.9"
@@ -4677,37 +4409,6 @@ files = [
     {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"},
 ]
 
-[[package]]
-name = "protobuf"
-version = "3.20.3"
-description = "Protocol Buffers"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"},
-    {file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"},
-    {file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"},
-    {file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"},
-    {file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"},
-    {file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"},
-    {file = "protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"},
-    {file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"},
-    {file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"},
-    {file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"},
-    {file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"},
-    {file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"},
-    {file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"},
-    {file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"},
-    {file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"},
-    {file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"},
-    {file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"},
-    {file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"},
-    {file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"},
-    {file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"},
-    {file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"},
-    {file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"},
-]
-
 [[package]]
 name = "psutil"
 version = "6.1.0"
@@ -5090,42 +4791,6 @@ files = [
 [package.extras]
 diagrams = ["jinja2", "railroad-diagrams"]
 
-[[package]]
-name = "pytablewriter"
-version = "1.2.0"
-description = "pytablewriter is a Python library to write a table in various formats: AsciiDoc / CSV / Elasticsearch / HTML / JavaScript / JSON / LaTeX / LDJSON / LTSV / Markdown / MediaWiki / NumPy / Excel / Pandas / Python / reStructuredText / SQLite / TOML / TSV / YAML."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "pytablewriter-1.2.0-py3-none-any.whl", hash = "sha256:4a30e2bb4bf5bc1069b1d2b2bc41947577c4517ab0875b23a5b194d296f543d8"},
-    {file = "pytablewriter-1.2.0.tar.gz", hash = "sha256:0204a4bb684a22140d640f2599f09e137bcdc18b3dd49426f4a555016e246b46"},
-]
-
-[package.dependencies]
-DataProperty = ">=1.0.1,<2"
-mbstrdecoder = ">=1.0.0,<2"
-pathvalidate = ">=2.3.0,<4"
-setuptools = ">=38.3.0"
-tabledata = ">=1.3.1,<2"
-tcolorpy = ">=0.0.5,<1"
-typepy = {version = ">=1.3.2,<2", extras = ["datetime"]}
-
-[package.extras]
-all = ["PyYAML (>=3.11,<7)", "SimpleSQLite (>=1.3.2,<2)", "XlsxWriter (>=0.9.6,<4)", "dominate (>=2.1.5,<3)", "elasticsearch (>=8.0.1,<9)", "loguru (>=0.4.1,<1)", "pandas (>=0.25.3,<3)", "pytablereader (>=0.31.3,<2)", "pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)", "simplejson (>=3.8.1,<4)", "toml (>=0.9.3,<1)", "xlwt"]
-docs = ["PyYAML (>=3.11,<7)", "SimpleSQLite (>=1.3.2,<2)", "Sphinx (>=2.4)", "XlsxWriter (>=0.9.6,<4)", "dominate (>=2.1.5,<3)", "elasticsearch (>=8.0.1,<9)", "loguru (>=0.4.1,<1)", "pandas (>=0.25.3,<3)", "pytablereader (>=0.31.3,<2)", "pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)", "simplejson (>=3.8.1,<4)", "sphinx-rtd-theme (>=1.2.2)", "toml (>=0.9.3,<1)", "xlwt"]
-es = ["elasticsearch (>=8.0.1,<9)"]
-es8 = ["elasticsearch (>=8.0.1,<9)"]
-excel = ["XlsxWriter (>=0.9.6,<4)", "xlwt"]
-from = ["pytablereader (>=0.31.3,<2)"]
-html = ["dominate (>=2.1.5,<3)"]
-logging = ["loguru (>=0.4.1,<1)"]
-pandas = ["pandas (>=0.25.3,<3)"]
-sqlite = ["SimpleSQLite (>=1.3.2,<2)"]
-test = ["PyYAML (>=3.11,<7)", "SimpleSQLite (>=1.3.2,<2)", "XlsxWriter (>=0.9.6,<4)", "beautifulsoup4 (>=4.10)", "dominate (>=2.1.5,<3)", "elasticsearch (>=8.0.1,<9)", "loguru (>=0.4.1,<1)", "pandas (>=0.25.3,<3)", "pytablereader (>=0.31.3,<2)", "pytablereader[excel,sqlite] (>=0.31.3)", "pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)", "pytest (>=6.0.1)", "pytest-md-report (>=0.4.1)", "simplejson (>=3.8.1,<4)", "sqliteschema (>=1.3.0)", "tablib (>=3.2.0)", "toml (>=0.9.3,<1)", "xlwt"]
-theme = ["pytablewriter-altcol-theme (>=0.1.0,<1)", "pytablewriter-altrow-theme (>=0.2.0,<1)"]
-toml = ["toml (>=0.9.3,<1)"]
-yaml = ["PyYAML (>=3.11,<7)"]
-
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -5693,30 +5358,6 @@ files = [
     {file = "rpds_py-0.21.0.tar.gz", hash = "sha256:ed6378c9d66d0de903763e7706383d60c33829581f0adff47b6535f1802fa6db"},
 ]
 
-[[package]]
-name = "sacrebleu"
-version = "2.4.3"
-description = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "sacrebleu-2.4.3-py3-none-any.whl", hash = "sha256:a976fd6998d8ced267a722120ec7fc47083c8e9745d8808ccee6424464a0aa31"},
-    {file = "sacrebleu-2.4.3.tar.gz", hash = "sha256:e734b1e0baeaea6ade0fefc9d23bac3df50bf15775d8b78edc108db63654192a"},
-]
-
-[package.dependencies]
-colorama = "*"
-lxml = "*"
-numpy = ">=1.17"
-portalocker = "*"
-regex = "*"
-tabulate = ">=0.8.9"
-
-[package.extras]
-dev = ["lxml-stubs", "mypy", "pytest", "setuptools", "types-tabulate", "wheel"]
-ja = ["ipadic (>=1.0,<2.0)", "mecab-python3 (>=1.0.9,<2.0.0)"]
-ko = ["mecab-ko (>=1.0.0,<=1.0.1)", "mecab-ko-dic (>=1.0,<2.0)"]
-
 [[package]]
 name = "safetensors"
 version = "0.4.5"
@@ -5849,106 +5490,6 @@ tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
 testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"]
 torch = ["safetensors[numpy]", "torch (>=1.10)"]
 
-[[package]]
-name = "scikit-learn"
-version = "1.5.2"
-description = "A set of python modules for machine learning and data mining"
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"},
-    {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"},
-    {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"},
-    {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"},
-    {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"},
-    {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"},
-    {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"},
-    {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"},
-    {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"},
-    {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"},
-    {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"},
-    {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"},
-    {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
-    {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
-    {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
-    {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
-    {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
-    {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
-    {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"},
-    {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"},
-    {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"},
-]
-
-[package.dependencies]
-joblib = ">=1.2.0"
-numpy = ">=1.19.5"
-scipy = ">=1.6.0"
-threadpoolctl = ">=3.1.0"
-
-[package.extras]
-benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"]
-build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"]
-docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"]
-examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"]
-install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"]
-maintenance = ["conda-lock (==2.5.6)"]
-tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"]
-
-[[package]]
-name = "scipy"
-version = "1.14.1"
-description = "Fundamental algorithms for scientific computing in Python"
-optional = false
-python-versions = ">=3.10"
-files = [
-    {file = "scipy-1.14.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389"},
-    {file = "scipy-1.14.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3"},
-    {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0"},
-    {file = "scipy-1.14.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3"},
-    {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d"},
-    {file = "scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69"},
-    {file = "scipy-1.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad"},
-    {file = "scipy-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5"},
-    {file = "scipy-1.14.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675"},
-    {file = "scipy-1.14.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2"},
-    {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617"},
-    {file = "scipy-1.14.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8"},
-    {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37"},
-    {file = "scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2"},
-    {file = "scipy-1.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2"},
-    {file = "scipy-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94"},
-    {file = "scipy-1.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d"},
-    {file = "scipy-1.14.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07"},
-    {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5"},
-    {file = "scipy-1.14.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc"},
-    {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310"},
-    {file = "scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066"},
-    {file = "scipy-1.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1"},
-    {file = "scipy-1.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f"},
-    {file = "scipy-1.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79"},
-    {file = "scipy-1.14.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e"},
-    {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73"},
-    {file = "scipy-1.14.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e"},
-    {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d"},
-    {file = "scipy-1.14.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e"},
-    {file = "scipy-1.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06"},
-    {file = "scipy-1.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84"},
-    {file = "scipy-1.14.1.tar.gz", hash = "sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417"},
-]
-
-[package.dependencies]
-numpy = ">=1.23.5,<2.3"
-
-[package.extras]
-dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"]
-doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"]
-test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
-
 [[package]]
 name = "seaborn"
 version = "0.13.2"
@@ -6844,25 +6385,6 @@ mpmath = ">=1.1.0,<1.4"
 [package.extras]
 dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
 
-[[package]]
-name = "tabledata"
-version = "1.3.3"
-description = "tabledata is a Python library to represent tabular data. Used for pytablewriter/pytablereader/SimpleSQLite/etc."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "tabledata-1.3.3-py3-none-any.whl", hash = "sha256:4abad1c996d8607e23b045b44dc0c5f061668f3c37585302c5f6c84c93a89962"},
-    {file = "tabledata-1.3.3.tar.gz", hash = "sha256:c90daaba9a408e4397934b3ff2f6c06797d5289676420bf520c741ad43e6ff91"},
-]
-
-[package.dependencies]
-DataProperty = ">=1.0.1,<2"
-typepy = ">=1.2.0,<2"
-
-[package.extras]
-logging = ["loguru (>=0.4.1,<1)"]
-test = ["pytablewriter (>=0.46)", "pytest"]
-
 [[package]]
 name = "tabulate"
 version = "0.9.0"
@@ -6888,34 +6410,6 @@ files = [
     {file = "tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6"},
 ]
 
-[[package]]
-name = "tcolorpy"
-version = "0.1.6"
-description = "tcolopy is a Python library to apply true color for terminal text."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "tcolorpy-0.1.6-py3-none-any.whl", hash = "sha256:8c15cb3167f30b0a433d72297e9d68667c825bd9e2af41c8dd7dfbd3d7f7e207"},
-    {file = "tcolorpy-0.1.6.tar.gz", hash = "sha256:8cea0bf5f8cf03f77528a9acfbf312df935573892ba5ea3b2516e61fa54de9a5"},
-]
-
-[package.extras]
-test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.5)"]
-
-[[package]]
-name = "termcolor"
-version = "2.3.0"
-description = "ANSI color formatting for output in terminal"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "termcolor-2.3.0-py3-none-any.whl", hash = "sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475"},
-    {file = "termcolor-2.3.0.tar.gz", hash = "sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a"},
-]
-
-[package.extras]
-tests = ["pytest", "pytest-cov"]
-
 [[package]]
 name = "terminado"
 version = "0.18.1"
@@ -7022,17 +6516,6 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"]
 tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"]
 torch = ["torch (>=1.6.0)"]
 
-[[package]]
-name = "threadpoolctl"
-version = "3.5.0"
-description = "threadpoolctl"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"},
-    {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"},
-]
-
 [[package]]
 name = "tiktoken"
 version = "0.7.0"
@@ -7266,31 +6749,28 @@ files = [
 
 [[package]]
 name = "torch"
-version = "2.4.1"
+version = "2.5.1"
 description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:362f82e23a4cd46341daabb76fba08f04cd646df9bfaf5da50af97cb60ca4971"},
-    {file = "torch-2.4.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:e8ac1985c3ff0f60d85b991954cfc2cc25f79c84545aead422763148ed2759e3"},
-    {file = "torch-2.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:91e326e2ccfb1496e3bee58f70ef605aeb27bd26be07ba64f37dcaac3d070ada"},
-    {file = "torch-2.4.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:d36a8ef100f5bff3e9c3cea934b9e0d7ea277cb8210c7152d34a9a6c5830eadd"},
-    {file = "torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0b5f88afdfa05a335d80351e3cea57d38e578c8689f751d35e0ff36bce872113"},
-    {file = "torch-2.4.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:ef503165f2341942bfdf2bd520152f19540d0c0e34961232f134dc59ad435be8"},
-    {file = "torch-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:092e7c2280c860eff762ac08c4bdcd53d701677851670695e0c22d6d345b269c"},
-    {file = "torch-2.4.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ddddbd8b066e743934a4200b3d54267a46db02106876d21cf31f7da7a96f98ea"},
-    {file = "torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:fdc4fe11db3eb93c1115d3e973a27ac7c1a8318af8934ffa36b0370efe28e042"},
-    {file = "torch-2.4.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:18835374f599207a9e82c262153c20ddf42ea49bc76b6eadad8e5f49729f6e4d"},
-    {file = "torch-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:ebea70ff30544fc021d441ce6b219a88b67524f01170b1c538d7d3ebb5e7f56c"},
-    {file = "torch-2.4.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:72b484d5b6cec1a735bf3fa5a1c4883d01748698c5e9cfdbeb4ffab7c7987e0d"},
-    {file = "torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c99e1db4bf0c5347107845d715b4aa1097e601bdc36343d758963055e9599d93"},
-    {file = "torch-2.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b57f07e92858db78c5b72857b4f0b33a65b00dc5d68e7948a8494b0314efb880"},
-    {file = "torch-2.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:f18197f3f7c15cde2115892b64f17c80dbf01ed72b008020e7da339902742cf6"},
-    {file = "torch-2.4.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:5fc1d4d7ed265ef853579caf272686d1ed87cebdcd04f2a498f800ffc53dab71"},
-    {file = "torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:40f6d3fe3bae74efcf08cb7f8295eaddd8a838ce89e9d26929d4edd6d5e4329d"},
-    {file = "torch-2.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c9299c16c9743001ecef515536ac45900247f4338ecdf70746f2461f9e4831db"},
-    {file = "torch-2.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bce130f2cd2d52ba4e2c6ada461808de7e5eccbac692525337cfb4c19421846"},
-    {file = "torch-2.4.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:a38de2803ee6050309aac032676536c3d3b6a9804248537e38e098d0e14817ec"},
+    {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"},
+    {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"},
+    {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"},
+    {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"},
+    {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"},
+    {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"},
+    {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"},
+    {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"},
+    {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"},
+    {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"},
+    {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"},
+    {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"},
+    {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"},
+    {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"},
+    {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"},
+    {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"},
+    {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"},
 ]
 
 [package.dependencies]
@@ -7298,25 +6778,26 @@ filelock = "*"
 fsspec = "*"
 jinja2 = "*"
 networkx = "*"
-nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cublas-cu12 = {version = "12.4.5.8", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cuda-cupti-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cuda-nvrtc-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cuda-runtime-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
-setuptools = "*"
-sympy = "*"
-triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""}
+nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+setuptools = {version = "*", markers = "python_version >= \"3.12\""}
+sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
+triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""}
 typing-extensions = ">=4.8.0"
 
 [package.extras]
 opt-einsum = ["opt-einsum (>=3.3)"]
-optree = ["optree (>=0.11.0)"]
+optree = ["optree (>=0.12.0)"]
 
 [[package]]
 name = "tornado"
@@ -7445,16 +6926,16 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
 
 [[package]]
 name = "triton"
-version = "3.0.0"
+version = "3.1.0"
 description = "A language and compiler for custom Deep Learning operations"
 optional = false
 python-versions = "*"
 files = [
-    {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"},
-    {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"},
-    {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
-    {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
-    {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
+    {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"},
+    {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"},
+    {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"},
+    {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"},
+    {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"},
 ]
 
 [package.dependencies]
@@ -7465,27 +6946,6 @@ build = ["cmake (>=3.20)", "lit"]
 tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
 tutorials = ["matplotlib", "pandas", "tabulate"]
 
-[[package]]
-name = "typepy"
-version = "1.3.2"
-description = "typepy is a Python library for variable type checker/validator/converter at a run time."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "typepy-1.3.2-py3-none-any.whl", hash = "sha256:d5d1022a424132622993800f1d2cd16cfdb691ac4e3b9c325f0fcb37799db1ae"},
-    {file = "typepy-1.3.2.tar.gz", hash = "sha256:b69fd48b9f50cdb3809906eef36b855b3134ff66c8893a4f8580abddb0b39517"},
-]
-
-[package.dependencies]
-mbstrdecoder = ">=1.0.0,<2"
-packaging = {version = "*", optional = true, markers = "extra == \"datetime\""}
-python-dateutil = {version = ">=2.8.0,<3.0.0", optional = true, markers = "extra == \"datetime\""}
-pytz = {version = ">=2018.9", optional = true, markers = "extra == \"datetime\""}
-
-[package.extras]
-datetime = ["packaging", "python-dateutil (>=2.8.0,<3.0.0)", "pytz (>=2018.9)"]
-test = ["packaging", "pytest (>=6.0.1)", "python-dateutil (>=2.8.0,<3.0.0)", "pytz (>=2018.9)", "tcolorpy"]
-
 [[package]]
 name = "typer"
 version = "0.9.4"
@@ -7982,4 +7442,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.13"
-content-hash = "a4c8ab84bda79bbf8fb3f92fc685ea17b00105cc729a39973e0fedcf81462441"
+content-hash = "d9bc55f679878efa255457b264b6c142d779c1f2d9f1d0233ed562eaed4c195a"
diff --git a/pyproject.toml b/pyproject.toml
index ff4574f..219e756 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ evaluate = "^0.4.3"
 absl-py = "^2.1.0"
 rouge-score = "^0.1.2"
 matplotlib = "^3.9.2"
-lighteval = {extras = ["accelerate"], version = "^0.6.2"}
 outlines = "^0.1.7"
 datasets = "^3.1.0"
 text-generation = "^0.7.0"
@@ -43,6 +42,9 @@ mistralai = "^1.2.5"
 llm-guard = "^0.3.15"
 pygments = "^2.18.0"
 llama-cpp-python = "^0.3.5"
+torch = "^2.5.1"
+gguf = "^0.13.0"
+accelerate = "^1.2.1"
 
 
 [build-system]
diff --git a/tamingllms/_build/.doctrees/environment.pickle b/tamingllms/_build/.doctrees/environment.pickle
index db81a7e..42959ee 100644
Binary files a/tamingllms/_build/.doctrees/environment.pickle and b/tamingllms/_build/.doctrees/environment.pickle differ
diff --git a/tamingllms/_build/.doctrees/markdown/preface.doctree b/tamingllms/_build/.doctrees/markdown/preface.doctree
index d4a62be..9ba8a92 100644
Binary files a/tamingllms/_build/.doctrees/markdown/preface.doctree and b/tamingllms/_build/.doctrees/markdown/preface.doctree differ
diff --git a/tamingllms/_build/.doctrees/markdown/toc.doctree b/tamingllms/_build/.doctrees/markdown/toc.doctree
index 376e410..50ce0d7 100644
Binary files a/tamingllms/_build/.doctrees/markdown/toc.doctree and b/tamingllms/_build/.doctrees/markdown/toc.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/alignment.doctree b/tamingllms/_build/.doctrees/notebooks/alignment.doctree
index d7908d9..15d23b1 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/alignment.doctree and b/tamingllms/_build/.doctrees/notebooks/alignment.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/cost.doctree b/tamingllms/_build/.doctrees/notebooks/cost.doctree
new file mode 100644
index 0000000..181b54c
Binary files /dev/null and b/tamingllms/_build/.doctrees/notebooks/cost.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/evals.doctree b/tamingllms/_build/.doctrees/notebooks/evals.doctree
index 5593861..b6a64e7 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/evals.doctree and b/tamingllms/_build/.doctrees/notebooks/evals.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/local.doctree b/tamingllms/_build/.doctrees/notebooks/local.doctree
index 6014cf8..08986fc 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/local.doctree and b/tamingllms/_build/.doctrees/notebooks/local.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/safety.doctree b/tamingllms/_build/.doctrees/notebooks/safety.doctree
index f0ec824..6199fc9 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/safety.doctree and b/tamingllms/_build/.doctrees/notebooks/safety.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree
index 053c0bc..f5c9be8 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree and b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree differ
diff --git a/tamingllms/_build/html/_images/bitnet.png b/tamingllms/_build/html/_images/bitnet.png
new file mode 100644
index 0000000..5d0d74e
Binary files /dev/null and b/tamingllms/_build/html/_images/bitnet.png differ
diff --git a/tamingllms/_build/html/_images/llmflation.png b/tamingllms/_build/html/_images/llmflation.png
new file mode 100644
index 0000000..5061149
Binary files /dev/null and b/tamingllms/_build/html/_images/llmflation.png differ
diff --git a/tamingllms/_build/html/_images/quantized.png b/tamingllms/_build/html/_images/quantized.png
new file mode 100644
index 0000000..2dc6d44
Binary files /dev/null and b/tamingllms/_build/html/_images/quantized.png differ
diff --git a/tamingllms/_build/html/_sources/markdown/toc.md b/tamingllms/_build/html/_sources/markdown/toc.md
index 83c6895..1578091 100644
--- a/tamingllms/_build/html/_sources/markdown/toc.md
+++ b/tamingllms/_build/html/_sources/markdown/toc.md
@@ -32,7 +32,7 @@ Abstract: *The current discourse around Large Language Models (LLMs) tends to fo
 
 ## [Chapter 6: Local LLMs in Practice](https://www.souzatharsis.com/tamingLLMs/notebooks/local.html)
 
-## Chapter 7: The Cost Factor
+## Chapter 7: The Falling Cost Paradox
 
 ## Chapter 8: Frontiers
 
diff --git a/tamingllms/_build/html/_sources/notebooks/alignment.ipynb b/tamingllms/_build/html/_sources/notebooks/alignment.ipynb
index 552ad7f..9eeeffa 100644
--- a/tamingllms/_build/html/_sources/notebooks/alignment.ipynb
+++ b/tamingllms/_build/html/_sources/notebooks/alignment.ipynb
@@ -2537,7 +2537,7 @@
    "source": [
     "## Discussion and Conclusions\n",
     "\n",
-    "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.\n",
+    "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.\n",
     "\n",
     "**Synthetic Data Generation**\n",
     "\n",
diff --git a/tamingllms/_build/html/_sources/notebooks/cost.ipynb b/tamingllms/_build/html/_sources/notebooks/cost.ipynb
new file mode 100644
index 0000000..45e3ee6
--- /dev/null
+++ b/tamingllms/_build/html/_sources/notebooks/cost.ipynb
@@ -0,0 +1,450 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(cost)=\n",
+    "# The Falling Cost Paradox\n",
+    "```{epigraph}\n",
+    "It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption. <br>\n",
+    "The very contrary is the truth. \n",
+    "\n",
+    "-- William Stanley Jevons\n",
+    "```\n",
+    "```{contents}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Why Optimization Matters More Than Ever\n",
+    "\n",
+    "According to recent analysis from a16z {cite}`a16z2024llmflation`, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore's Law in the PC revolution or Edholm's Law during the bandwidth explosion of the dot-com era.\n",
+    "\n",
+    "```{figure} ../_static/cost/llmflation.png\n",
+    "---\n",
+    "name: llmflation\n",
+    "alt: LLMflation\n",
+    "scale: 70%\n",
+    "align: center\n",
+    "---\n",
+    "LLMflation {cite}`a16z2024llmflation`: The cost of LLM inference is decreasing by approximately 10x every year.\n",
+    "```\n",
+    "\n",
+    "A model achieving an MMLU score of 42 that cost \\$60 per million tokens in late 2021 can now be run for just \\$0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4's introduction in March 2023. \n",
+    "\n",
+    "This dramatic decline stems from multiple compounding factors including:\n",
+    "\n",
+    "- Improved GPU efficiency through architectural advances and Moore's Law\n",
+    "- Model quantization progress, moving from 16-bit to 4-bit or lower precision\n",
+    "- Software optimizations reducing compute and memory bandwidth requirements\n",
+    "- Emergence of smaller yet similarly capable models\n",
+    "- Better instruction tuning techniques like RLHF and DPO\n",
+    "- Competition from open-source models and low-cost providers\n",
+    "\n",
+    "This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn't it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: **Jevons Paradox**. \n",
+    "\n",
+    "The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.\n",
+    "\n",
+    "This pattern has repeated throughout technological history:\n",
+    "\n",
+    "- Computing Power: As cost per computation plummeted, we didn't spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers\n",
+    "- Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming\n",
+    "- Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models\n",
+    "\n",
+    "One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we're seeing the emergence of new applications:\n",
+    "- Embedding AI capabilities into every application and workflow\n",
+    "- Real-time analysis of audio transcripts and conversations\n",
+    "- Running AI models directly on edge devices and smartphones\n",
+    "- Multimodal applications combining text, images, audio and video \n",
+    "\n",
+    "In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here's why:\n",
+    "\n",
+    "**A) Scale Magnifies Everything**. When operating at billions of tokens per day, even small inefficiencies have major effects:\n",
+    "- A single digit improvement in efficiency can save millions of dollars annually at scale\n",
+    "- Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [^groklatency]\n",
+    "[^groklatency]: Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.\n",
+    "\n",
+    "**B) Tiered Pricing Persists**. While average costs are declining, the market maintains a tiered structure:\n",
+    "- Different models offer varying price-performance tradeoffs\n",
+    "- ChatGPT Pro at \\$200 per month breaks the price drop trend perhaps triggering a new wave of premium models\n",
+    "- Cost optimization is still required to select the right model for each specific use case\n",
+    "\n",
+    "**C) Competition Drives Innovation**. Companies that master LLM efficiency gain significant advantages:\n",
+    "- Ability to offer more competitive pricing\n",
+    "- Capacity to handle larger scale operations\n",
+    "- Resources to invest in product improvement\n",
+    "\n",
+    "**D) Performance and Cost Are Linked**. Cost optimization often yields performance benefits:\n",
+    "- Resource efficiency enables handling larger user loads\n",
+    "- More efficiency and reduced latency leads to improved user experience\n",
+    "\n",
+    "In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.\n",
+    "\n",
+    "Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Right-Sizing LLMs: A Strategic Approach\n",
+    "\n",
+    "Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.\n",
+    "\n",
+    "In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.\n",
+    "\n",
+    "\n",
+    "### Metrics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Requirements\n",
+    "\n",
+    "#### Business Requirements\n",
+    "\n",
+    "First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.\n",
+    "\n",
+    "Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.\n",
+    "\n",
+    "Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.\n",
+    "\n",
+    "Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.\n",
+    "\n",
+    "Chapter {ref}`local` provides a detailed discussion on relevant considerations when {ref}`local-model-selection`.\n",
+    "\n",
+    "#### Performance Requirements\n",
+    "\n",
+    "Accuracy and quality form the foundation of any LLM deployment's performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter {ref}`evals` provides a detailed discussion on how to evaluate the performance of LLM-based applications.\n",
+    "\n",
+    "Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations. \n",
+    "\n",
+    "\n",
+    "#### Operational Requirements\n",
+    "\n",
+    "Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.\n",
+    "\n",
+    "Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.\n",
+    "\n",
+    "#### Technical Requirements\n",
+    "\n",
+    "System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.\n",
+    "\n",
+    "Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.\n",
+    "\n",
+    "\n",
+    "This structured approach to requirements analysis enables organizations to:\n",
+    "1. Select appropriate models aligned with specific needs\n",
+    "2. Identify targeted optimization opportunities\n",
+    "3. Scale efficiently while controlling costs\n",
+    "4. Develop realistic resource allocation strategies\n",
+    "\n",
+    "The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quantization\n",
+    "\n",
+    "Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model's parameters. The most common form of quantization is to represent model's weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.\n",
+    "\n",
+    "While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's take a look at model weights of a language model (`SmolLM2-135M-Instruct`) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function `load_gguf` from the `taming_utils` package to load model weights of the quantized models directly from Hugging Face."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from taming_utils import load_gguf\n",
+    "\n",
+    "MODEL_NAME = \"bartowski/SmolLM2-135M-Instruct-GGUF\"\n",
+    "GGUF_FILE_Q2_K = \"SmolLM2-135M-Instruct-Q2_K.gguf\"\n",
+    "GGUF_FILE_F16 = \"SmolLM2-135M-Instruct-F16.gguf\"\n",
+    "\n",
+    "model_q2_k = load_gguf(model_name=MODEL_NAME, \n",
+    "              gguf_file=GGUF_FILE_Q2_K)\n",
+    "\n",
+    "model_f16 = load_gguf(model_name=MODEL_NAME, \n",
+    "              gguf_file=GGUF_FILE_F16)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We extract the MLP weights from the first layer of the model as a proxy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight\n",
+    "mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Original weights at 16-bit precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[-0.0145,  0.1826,  0.1377,  ...,  0.1719, -0.1387, -0.0298],\n",
+       "        [-0.1631,  0.0781, -0.2051,  ..., -0.2070, -0.0334,  0.2891],\n",
+       "        [-0.1768, -0.0488, -0.2393,  ..., -0.0396, -0.1348, -0.1533],\n",
+       "        ...,\n",
+       "        [ 0.0771,  0.0845, -0.0232,  ...,  0.0178, -0.1040, -0.0771],\n",
+       "        [ 0.1582,  0.1167, -0.0474,  ...,  0.0845,  0.0359, -0.2500],\n",
+       "        [ 0.0432,  0.0972,  0.0933,  ...,  0.2188,  0.0776,  0.0674]],\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlp_weights_f16"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quantized weights at 2-bit precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[-0.0028,  0.1852,  0.1396,  ...,  0.1506, -0.1635, -0.0043],\n",
+       "        [-0.1768,  0.0680, -0.2257,  ..., -0.1890, -0.0464,  0.2960],\n",
+       "        [-0.1840, -0.0451, -0.2395,  ..., -0.0413, -0.1446, -0.1446],\n",
+       "        ...,\n",
+       "        [ 0.0621,  0.0621, -0.0478,  ...,  0.0038, -0.0830, -0.0830],\n",
+       "        [ 0.1473,  0.0926, -0.0547,  ...,  0.0824,  0.0429, -0.2737],\n",
+       "        [ 0.0355,  0.0782,  0.0782,  ...,  0.2043,  0.0740,  0.0740]],\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlp_weights_q2_k"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pearson correlation: 0.9970\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# Convert tensors to numpy arrays (detach from computation graph if needed)\n",
+    "weights_f16 = mlp_weights_f16.detach().cpu().numpy()\n",
+    "weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()\n",
+    "\n",
+    "flat_f16 = weights_f16.flatten()\n",
+    "flat_q2_k = weights_q2_k.flatten()\n",
+    "\n",
+    "# Calculate correlation\n",
+    "correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]\n",
+    "print(f\"Pearson correlation: {correlation:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n",
+    "\n",
+    "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n",
+    "\n",
+    "```{figure} ../_static/cost/quantized.png\n",
+    "---\n",
+    "name: quantized\n",
+    "alt: Quantized Model Size\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Quantized Model Size: `unsloth/Llama-3.3-70B-Instruct-GGUF`\n",
+    "```\n",
+    "\n",
+    "We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [^quantization-levels].\n",
+    "\n",
+    "[^quantization-levels]: You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in {cite}`huggingface2024quantization`.\n",
+    "\n",
+    "This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.\n",
+    " \n",
+    "While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet {cite}`wang20241bitaiinfra11` which pushes the boundaries of extreme quantization.\n",
+    "\n",
+    "BitNet's implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see {numref}`bitnet`). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet's optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).\n",
+    "\n",
+    "```{figure} ../_static/cost/bitnet.png\n",
+    "---\n",
+    "name: bitnet\n",
+    "alt: BitNet\n",
+    "scale: 30%\n",
+    "align: center\n",
+    "---\n",
+    "BitNet: {cite}`wang20241bitaiinfra11`\n",
+    "```\n",
+    "\n",
+    "The framework's initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.\n",
+    "\n",
+    "As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.\n",
+    "\n",
+    "Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter {ref}`local` for more details.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Check-list\n",
+    "\n",
+    "**Planning and Requirements**\n",
+    "- [ ] Start with a clear understanding of your application's needs and the factors that contribute to LLM costs\n",
+    "- [ ] Choose the right model for your task, balancing performance and cost\n",
+    "- [ ] Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them\n",
+    "\n",
+    "**Model Optimization**\n",
+    "- [ ] Explore model compression and quantization to reduce model size and computational demands\n",
+    "- [ ] Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency\n",
+    "- [ ] Consider using RAG to enhance performance and reduce reliance on purely generative processes\n",
+    "\n",
+    "**Prompt Engineering**\n",
+    "- [ ] Optimize prompts and utilize prompt engineering techniques to minimize token usage\n",
+    "- [ ] Experiment with different prompting strategies to unlock the full potential of open-source LLMs\n",
+    "\n",
+    "**Infrastructure and Operations**\n",
+    "- [ ] Implement caching and batching strategies to optimize resource utilization\n",
+    "- [ ] Monitor LLM usage patterns and costs to identify areas for optimization\n",
+    "- [ ] Set up observability and logging to track model performance and costs\n",
+    "- [ ] Establish automated testing and evaluation pipelines\n",
+    "\n",
+    "**Cost Management**\n",
+    "- [ ] Track and analyze inference costs across different model variants\n",
+    "- [ ] Implement cost allocation and chargeback mechanisms\n",
+    "- [ ] Set up cost alerts and budgeting controls\n",
+    "- [ ] Regularly review and optimize resource utilization\n",
+    "\n",
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
+    "\n",
+    "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
+    "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n",
+    "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n",
+    "\n",
+    "```\n",
+    "@misc{tharsistpsouza2024tamingllms,\n",
+    "  author = {Tharsis T. P. Souza},\n",
+    "  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n",
+    "  year = {2024},\n",
+    "  chapter = {The Falling Cost Paradox},\n",
+    "  journal = {GitHub repository},\n",
+    "  url = {https://github.com/souzatharsis/tamingLLMs)\n",
+    "}\n",
+    "```\n",
+    "## References\n",
+    "```{bibliography}\n",
+    ":filter: docname in docnames\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tamingllms/_build/html/_sources/notebooks/local.ipynb b/tamingllms/_build/html/_sources/notebooks/local.ipynb
index b451331..fde2739 100644
--- a/tamingllms/_build/html/_sources/notebooks/local.ipynb
+++ b/tamingllms/_build/html/_sources/notebooks/local.ipynb
@@ -4,6 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "(local)=\n",
     "# Local LLMs in Practice\n",
     "```{epigraph}\n",
     "Freedom is something that dies unless it's used.\n",
@@ -40,7 +41,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Models Considerations\n",
+    "(local-model-selection)=\n",
+    "## Choosing your Model\n",
     "\n",
     "The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness. \n",
     "\n",
@@ -1352,7 +1354,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Citation\n",
     "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
     "\n",
     "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
diff --git a/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb b/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb
index 4bc64db..64359b4 100644
--- a/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb
+++ b/tamingllms/_build/html/_sources/notebooks/structured_output.ipynb
@@ -467,9 +467,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML\n",
+      "  warnings.warn(\"Can't initialize NVML\")\n"
+     ]
+    }
+   ],
    "source": [
     "MODEL_NAME = \"HuggingFaceTB/SmolLM2-1.7B-Instruct\"\n",
     "PROMPT = \"Is Enzo a good name for a baby?\"\n",
@@ -1384,7 +1393,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/tamingllms/_build/html/_static/cost/bitnet.png b/tamingllms/_build/html/_static/cost/bitnet.png
new file mode 100644
index 0000000..5d0d74e
Binary files /dev/null and b/tamingllms/_build/html/_static/cost/bitnet.png differ
diff --git a/tamingllms/_build/html/_static/cost/llmflation.png b/tamingllms/_build/html/_static/cost/llmflation.png
new file mode 100644
index 0000000..5061149
Binary files /dev/null and b/tamingllms/_build/html/_static/cost/llmflation.png differ
diff --git a/tamingllms/_build/html/_static/cost/quantized.png b/tamingllms/_build/html/_static/cost/quantized.png
new file mode 100644
index 0000000..2dc6d44
Binary files /dev/null and b/tamingllms/_build/html/_static/cost/quantized.png differ
diff --git a/tamingllms/_build/html/_static/cost/quantized.tsx b/tamingllms/_build/html/_static/cost/quantized.tsx
new file mode 100644
index 0000000..aef322c
--- /dev/null
+++ b/tamingllms/_build/html/_static/cost/quantized.tsx
@@ -0,0 +1,58 @@
+import React from 'react';
+import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer } from 'recharts';
+
+const MemoryUsageChart = () => {
+  const data = [
+    { name: 'F16', value: 141.1 },
+    { name: 'Q8_0', value: 75.0 },
+    { name: 'Q6_K', value: 59.9 },
+    { name: 'Q5_K_M', value: 49.9 },
+    { name: 'Q4_K_M', value: 42.5 },
+    { name: 'Q3_K_M', value: 34.3 },
+    { name: 'Q2_K', value: 26.4 }
+  ];
+
+  return (
+    <div className="w-full h-96 p-4">
+      <ResponsiveContainer width="100%" height="100%">
+        <LineChart data={data} margin={{ top: 20, right: 30, left: 20, bottom: 5 }}>
+          <CartesianGrid strokeDasharray="3 3" />
+          <XAxis 
+            dataKey="name"
+            tick={{ fontSize: 12, fontWeight: 'bold' }}
+          />
+          <YAxis 
+            label={{ 
+              value: 'Model Size (GB)', 
+              angle: -90, 
+              position: 'insideLeft',
+              style: { 
+                textAnchor: 'middle',
+                fontWeight: 'bold'
+              }
+            }}
+            tick={{ fontSize: 12, fontWeight: 'bold' }}
+          />
+          <Tooltip 
+            formatter={(value) => [`${value} GB`, 'Model Size']}
+            contentStyle={{ 
+              backgroundColor: '#fff', 
+              border: '1px solid #ccc',
+              fontWeight: 'bold'
+            }}
+          />
+          <Line 
+            type="monotone"
+            dataKey="value" 
+            stroke="#3eaf7c"
+            strokeWidth={2}
+            dot={{ fill: '#3eaf7c', r: 4 }}
+            activeDot={{ r: 6 }}
+          />
+        </LineChart>
+      </ResponsiveContainer>
+    </div>
+  );
+};
+
+export default MemoryUsageChart;
\ No newline at end of file
diff --git a/tamingllms/_build/html/genindex.html b/tamingllms/_build/html/genindex.html
index 92c3679..18858b4 100644
--- a/tamingllms/_build/html/genindex.html
+++ b/tamingllms/_build/html/genindex.html
@@ -190,6 +190,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="notebooks/cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
diff --git a/tamingllms/_build/html/markdown/intro.html b/tamingllms/_build/html/markdown/intro.html
index a77a538..484ac75 100644
--- a/tamingllms/_build/html/markdown/intro.html
+++ b/tamingllms/_build/html/markdown/intro.html
@@ -208,6 +208,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="../notebooks/cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
diff --git a/tamingllms/_build/html/markdown/preface.html b/tamingllms/_build/html/markdown/preface.html
index 36b808e..2f54208 100644
--- a/tamingllms/_build/html/markdown/preface.html
+++ b/tamingllms/_build/html/markdown/preface.html
@@ -190,6 +190,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="../notebooks/cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -227,7 +236,7 @@ <h1><span class="section-number">1. </span>Preface<a class="headerlink" href="#p
 <div><p>Models tell you merely what something is like, not what something is.</p>
 <p class="attribution">—Emanuel Derman</p>
 </div></blockquote>
-<p>An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” <span id="id1">[<a class="reference internal" href="#id162" title="E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.">Derman, 2011</a>]</span>. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.</p>
+<p>An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” <span id="id1">[<a class="reference internal" href="#id165" title="E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.">Derman, 2011</a>]</span>. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.</p>
 <p>The book “Models.Behaving.Badly” by Emanuel Derman, a former physicist and Goldman Sachs quant, explores how financial and scientific models can fail when we mistake them for reality rather than treating them as approximations full of assumptions.
 The core premise of his work is that while models can be useful tools for understanding aspects of the world, they inherently involve simplification and assumptions. Derman argues that many financial crises, including the 2008 crash, occurred partly because people put too much faith in mathematical models without recognizing their limitations.</p>
 <p>Like financial models that failed to capture the complexity of human behavior and market dynamics, LLMs have inherent constraints. They can hallucinate facts, struggle with logical reasoning, and fail to maintain consistency across long outputs. Their responses, while often convincing, are probabilistic approximations based on training data rather than true understanding even though humans insist on treating them as “machines that can reason”.</p>
@@ -235,7 +244,7 @@ <h1><span class="section-number">1. </span>Preface<a class="headerlink" href="#p
 <p>This book serves as an introductory, practical guide for practitioners and technology product builders - software engineers, data scientists, and product managers - who want to create the next generation of GenAI-based products with LLMs while remaining clear-eyed about their limitations and therefore their implications to end-users. Through detailed technical analysis, reproducible Python code examples we explore the gap between LLM capabilities and reliable software product development.</p>
 <p>The goal is not to diminish the transformative potential of LLMs, but rather to promote a more nuanced understanding of their behavior. By acknowledging and working within their constraints, developers can create more reliable and trustworthy applications. After all, as Derman taught us, the first step to using a model effectively is understanding where it breaks down.</p>
 <div class="docutils container" id="id2">
-<div class="citation" id="id162" role="doc-biblioentry">
+<div class="citation" id="id165" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id1">Der11</a><span class="fn-bracket">]</span></span>
 <p>E. Derman. <em>Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life</em>. Free Press, 2011. ISBN 9781439165010. URL: <a class="reference external" href="https://books.google.co.uk/books?id=lke_cwM4wm8C">https://books.google.co.uk/books?id=lke_cwM4wm8C</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/markdown/toc.html b/tamingllms/_build/html/markdown/toc.html
index 05d39be..629d701 100644
--- a/tamingllms/_build/html/markdown/toc.html
+++ b/tamingllms/_build/html/markdown/toc.html
@@ -183,6 +183,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="../notebooks/cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -245,8 +254,8 @@ <h2><a class="reference external" href="https://www.souzatharsis.com/tamingLLMs/
 <section id="chapter-6-local-llms-in-practice">
 <h2><a class="reference external" href="https://www.souzatharsis.com/tamingLLMs/notebooks/local.html">Chapter 6: Local LLMs in Practice</a><a class="headerlink" href="#chapter-6-local-llms-in-practice" title="Permalink to this heading">¶</a></h2>
 </section>
-<section id="chapter-7-the-cost-factor">
-<h2>Chapter 7: The Cost Factor<a class="headerlink" href="#chapter-7-the-cost-factor" title="Permalink to this heading">¶</a></h2>
+<section id="chapter-7-the-falling-cost-paradox">
+<h2>Chapter 7: The Falling Cost Paradox<a class="headerlink" href="#chapter-7-the-falling-cost-paradox" title="Permalink to this heading">¶</a></h2>
 </section>
 <section id="chapter-8-frontiers">
 <h2>Chapter 8: Frontiers<a class="headerlink" href="#chapter-8-frontiers" title="Permalink to this heading">¶</a></h2>
diff --git a/tamingllms/_build/html/notebooks/alignment.html b/tamingllms/_build/html/notebooks/alignment.html
index 32da58c..b570323 100644
--- a/tamingllms/_build/html/notebooks/alignment.html
+++ b/tamingllms/_build/html/notebooks/alignment.html
@@ -212,6 +212,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -244,7 +253,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="preference-based-alignment">
-<span id="alignment"></span><h1><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">6. </span>Preference-Based Alignment</a><a class="headerlink" href="#preference-based-alignment" title="Permalink to this heading">¶</a></h1>
+<span id="alignment"></span><h1><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">6. </span>Preference-Based Alignment</a><a class="headerlink" href="#preference-based-alignment" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>A people that values its privileges above its principles soon loses both.</p>
 <p class="attribution">—Dwight D. Eisenhower</p>
@@ -252,72 +261,72 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#preference-based-alignment" id="id228">Preference-Based Alignment</a></p>
+<li><p><a class="reference internal" href="#preference-based-alignment" id="id231">Preference-Based Alignment</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id229">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#from-raw-capabilities-to-preference-alignment" id="id230">From Raw Capabilities to Preference Alignment</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id232">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#from-raw-capabilities-to-preference-alignment" id="id233">From Raw Capabilities to Preference Alignment</a></p>
 <ul>
-<li><p><a class="reference internal" href="#on-the-misalignment-of-language-models" id="id231">On the Misalignment of Language Models</a></p></li>
-<li><p><a class="reference internal" href="#aligning-language-models-with-human-preferences" id="id232">Aligning Language Models with Human Preferences</a></p>
+<li><p><a class="reference internal" href="#on-the-misalignment-of-language-models" id="id234">On the Misalignment of Language Models</a></p></li>
+<li><p><a class="reference internal" href="#aligning-language-models-with-human-preferences" id="id235">Aligning Language Models with Human Preferences</a></p>
 <ul>
-<li><p><a class="reference internal" href="#supervised-fine-tuning-sft-for-model-alignment" id="id233">Supervised Fine-Tuning (SFT) for Model Alignment</a></p></li>
-<li><p><a class="reference internal" href="#augmenting-sft-with-human-preferences" id="id234">Augmenting SFT with Human Preferences</a></p></li>
+<li><p><a class="reference internal" href="#supervised-fine-tuning-sft-for-model-alignment" id="id236">Supervised Fine-Tuning (SFT) for Model Alignment</a></p></li>
+<li><p><a class="reference internal" href="#augmenting-sft-with-human-preferences" id="id237">Augmenting SFT with Human Preferences</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#is-post-training-the-answer" id="id235">Is Post-Training the Answer?</a></p>
+<li><p><a class="reference internal" href="#is-post-training-the-answer" id="id238">Is Post-Training the Answer?</a></p>
 <ul>
-<li><p><a class="reference internal" href="#limitations" id="id236">Limitations</a></p></li>
-<li><p><a class="reference internal" href="#model-collapse" id="id237">Model Collapse</a></p></li>
-<li><p><a class="reference internal" href="#faking-alignment" id="id238">Faking Alignment</a></p></li>
+<li><p><a class="reference internal" href="#limitations" id="id239">Limitations</a></p></li>
+<li><p><a class="reference internal" href="#model-collapse" id="id240">Model Collapse</a></p></li>
+<li><p><a class="reference internal" href="#faking-alignment" id="id241">Faking Alignment</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#case-study-aligning-a-language-model-to-a-policy" id="id239">Case Study: Aligning a Language Model to a Policy</a></p>
+<li><p><a class="reference internal" href="#case-study-aligning-a-language-model-to-a-policy" id="id242">Case Study: Aligning a Language Model to a Policy</a></p>
 <ul>
-<li><p><a class="reference internal" href="#experimental-setup" id="id240">Experimental Setup</a></p></li>
-<li><p><a class="reference internal" href="#deliverables" id="id241">Deliverables</a></p></li>
-<li><p><a class="reference internal" href="#a-note-on-smollm2-models" id="id242">A Note on smolLM2 Models</a></p>
+<li><p><a class="reference internal" href="#experimental-setup" id="id243">Experimental Setup</a></p></li>
+<li><p><a class="reference internal" href="#deliverables" id="id244">Deliverables</a></p></li>
+<li><p><a class="reference internal" href="#a-note-on-smollm2-models" id="id245">A Note on smolLM2 Models</a></p>
 <ul>
-<li><p><a class="reference internal" href="#policy" id="id243">Policy</a></p></li>
+<li><p><a class="reference internal" href="#policy" id="id246">Policy</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#preference-dataset-synthetic-dataset-generation" id="id244">Preference Dataset - Synthetic Dataset Generation</a></p>
+<li><p><a class="reference internal" href="#preference-dataset-synthetic-dataset-generation" id="id247">Preference Dataset - Synthetic Dataset Generation</a></p>
 <ul>
-<li><p><a class="reference internal" href="#user-prompts" id="id245">User Prompts</a></p></li>
-<li><p><a class="reference internal" href="#rejected-responses" id="id246">Rejected Responses</a></p></li>
-<li><p><a class="reference internal" href="#chosen-responses" id="id247">Chosen Responses</a></p></li>
-<li><p><a class="reference internal" href="#generate-dpo-dataset" id="id248">Generate DPO Dataset</a></p></li>
+<li><p><a class="reference internal" href="#user-prompts" id="id248">User Prompts</a></p></li>
+<li><p><a class="reference internal" href="#rejected-responses" id="id249">Rejected Responses</a></p></li>
+<li><p><a class="reference internal" href="#chosen-responses" id="id250">Chosen Responses</a></p></li>
+<li><p><a class="reference internal" href="#generate-dpo-dataset" id="id251">Generate DPO Dataset</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#dpo-based-optimization" id="id249">DPO-Based Optimization</a></p>
+<li><p><a class="reference internal" href="#dpo-based-optimization" id="id252">DPO-Based Optimization</a></p>
 <ul>
-<li><p><a class="reference internal" href="#data-preparation" id="id250">Data Preparation</a></p></li>
-<li><p><a class="reference internal" href="#fine-tuning" id="id251">Fine-Tuning</a></p></li>
-<li><p><a class="reference internal" href="#vibe-check" id="id252">Vibe Check</a></p></li>
+<li><p><a class="reference internal" href="#data-preparation" id="id253">Data Preparation</a></p></li>
+<li><p><a class="reference internal" href="#fine-tuning" id="id254">Fine-Tuning</a></p></li>
+<li><p><a class="reference internal" href="#vibe-check" id="id255">Vibe Check</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#alignment-evaluation" id="id253">Alignment Evaluation</a></p></li>
+<li><p><a class="reference internal" href="#alignment-evaluation" id="id256">Alignment Evaluation</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#discussion-and-conclusions" id="id254">Discussion and Conclusions</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id255">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id256">References</a></p></li>
+<li><p><a class="reference internal" href="#discussion-and-conclusions" id="id257">Discussion and Conclusions</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id258">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id259">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">6.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">6.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>The release of ChatGPT 3.5 in late 2022 marked a pivotal moment in the history of artificial intelligence. Within just five days of its launch, the model attracted over a million users, and within two months, it became the fastest-growing consumer application in history with over 100 million monthly active users.</p>
 <p>Yet, this raises an intriguing question: Why did ChatGPT 3.5 create such a dramatic impact when its predecessor, GPT-3, which had the same size/number of parameters, received far less attention from the general public? Arguably, the answer lies not in raw capabilities, but in Preference Alignment. Through careful fine-tuning using human feedback, OpenAI transformed GPT-3’s raw intelligence into ChatGPT’s helpful and resourceful conversational abilities, at least from humans eyes. This breakthrough demonstrated that aligning language models with human preferences is just as crucial as scaling them to greater sizes.</p>
-<p>In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) <span id="id1">[<a class="reference internal" href="#id169" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.</p>
+<p>In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) <span id="id1">[<a class="reference internal" href="#id172" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.</p>
 </section>
 <section id="from-raw-capabilities-to-preference-alignment">
-<h2><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">6.2. </span>From Raw Capabilities to Preference Alignment</a><a class="headerlink" href="#from-raw-capabilities-to-preference-alignment" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">6.2. </span>From Raw Capabilities to Preference Alignment</a><a class="headerlink" href="#from-raw-capabilities-to-preference-alignment" title="Permalink to this heading">¶</a></h2>
 <section id="on-the-misalignment-of-language-models">
-<h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">6.2.1. </span>On the Misalignment of Language Models</a><a class="headerlink" href="#on-the-misalignment-of-language-models" title="Permalink to this heading">¶</a></h3>
-<p>Common pre-trained LLMs are not helpful to humans by default. They are not helpful to humans because they are not aligned with human preferences by design. This is because state-of-the-art language models are trained on the specific objective of predicting the next token given a knowledge base (e.g. large number of webpages from the internet). This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned <span id="id2">[<a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</p>
+<h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">6.2.1. </span>On the Misalignment of Language Models</a><a class="headerlink" href="#on-the-misalignment-of-language-models" title="Permalink to this heading">¶</a></h3>
+<p>Common pre-trained LLMs are not helpful to humans by default. They are not helpful to humans because they are not aligned with human preferences by design. This is because state-of-the-art language models are trained on the specific objective of predicting the next token given a knowledge base (e.g. large number of webpages from the internet). This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned <span id="id2">[<a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</p>
 <p>Let’s take a look at GPT-2’s response to the following prompt: “Explain the moon landing to a 6 year old.”</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -365,12 +374,12 @@ <h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="sectio
 <p>As we can see from the responses above, GPT-2 fails to provide a coherent and helpful explanation of the moon landing to a 6-year-old child. The model generates nonsensical text that meanders between unrelated topics like “green dots”, “movie endings”, and “the word tepid”. This is a simple demonstration that raw language models, while capable of generating text, are not inherently aligned with the goal of being helpful to humans. The model lacks the understanding that it should provide a simple, clear explanation appropriate for a young child. Instead, it predicts the next token given a knowledge base.</p>
 </section>
 <section id="aligning-language-models-with-human-preferences">
-<h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">6.2.2. </span>Aligning Language Models with Human Preferences</a><a class="headerlink" href="#aligning-language-models-with-human-preferences" title="Permalink to this heading">¶</a></h3>
-<p>To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback <span id="id3">[<a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>. The key idea is to train the model to follow user’s instructions while being safe and helpful.</p>
+<h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">6.2.2. </span>Aligning Language Models with Human Preferences</a><a class="headerlink" href="#aligning-language-models-with-human-preferences" title="Permalink to this heading">¶</a></h3>
+<p>To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback <span id="id3">[<a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>. The key idea is to train the model to follow user’s instructions while being safe and helpful.</p>
 <figure class="align-center" id="openai-rlhf">
 <a class="reference internal image-reference" href="../_images/openai_rlhf.png"><img alt="OpenAI RLHF Pipeline" src="../_images/openai_rlhf.png" style="width: 729.05px; height: 421.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.1 </span><span class="caption-text">OpenAI’s RLHF pipeline for aligning language models with human preferences <span id="id4">[<a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#openai-rlhf" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.1 </span><span class="caption-text">OpenAI’s RLHF pipeline for aligning language models with human preferences <span id="id4">[<a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#openai-rlhf" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p><a class="reference internal" href="#openai-rlhf"><span class="std std-numref">Fig. 6.1</span></a> illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:</p>
@@ -409,7 +418,7 @@ <h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="alignment-simplified">
 <a class="reference internal image-reference" href="../_images/alignment_simplified.png"><img alt="Alignment Simplified" src="../_images/alignment_simplified.png" style="width: 979.1999999999999px; height: 257.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.2 </span><span class="caption-text">Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model <span id="id5">[<a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#alignment-simplified" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.2 </span><span class="caption-text">Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model <span id="id5">[<a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#alignment-simplified" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>A common pattern has emerged in the development of language models: First, a powerful base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in <a class="reference internal" href="#instruct"><span class="std std-numref">Fig. 6.3</span></a>.</p>
@@ -419,10 +428,10 @@ <h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 <p><span class="caption-number">Fig. 6.3 </span><span class="caption-text">Instruction fine-tuning process for aligning language models with human preferences.</span><a class="headerlink" href="#instruct" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 <span id="id6">[<a class="reference internal" href="#id167" title="AI &#64; Meta Llama Team. The llama 3 herd of models. 2024. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.">Llama Team, 2024</a>]</span> is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha <span id="id7">[<a class="reference internal" href="#id166" title="Hugging Face. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.">Face, 2024</a>]</span> demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.</p>
+<p>An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 <span id="id6">[<a class="reference internal" href="#id170" title="AI &#64; Meta Llama Team. The llama 3 herd of models. 2024. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.">Llama Team, 2024</a>]</span> is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha <span id="id7">[<a class="reference internal" href="#id169" title="Hugging Face. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.">Face, 2024</a>]</span> demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.</p>
 <p>The OpenAI paper introduced two key components of this fine-tuning process - SFT for instruction tuning and RLHF (PPO in particular) for alignment. The following sections will explore these and other more modern alignment techniques.</p>
 <section id="supervised-fine-tuning-sft-for-model-alignment">
-<h4><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">6.2.2.1. </span>Supervised Fine-Tuning (SFT) for Model Alignment</a><a class="headerlink" href="#supervised-fine-tuning-sft-for-model-alignment" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">6.2.2.1. </span>Supervised Fine-Tuning (SFT) for Model Alignment</a><a class="headerlink" href="#supervised-fine-tuning-sft-for-model-alignment" title="Permalink to this heading">¶</a></h4>
 <p>SFT is a foundational technique for aligning language models with human preferences. Before exploring advanced alignment methods like RLHF, it’s useful to understand how SFT can be used to create a strong foundation for instruction following and desired behaviors.</p>
 <p>At a high-level, SFT involves fine-tuning language models using carefully curated demonstrations of desired behavior. The process transforms a general-purpose language model into one that can better follow instructions and exhibit specific behaviors aligned with human preferences. Typically, SFT is used to align a model to a specific task or domain, which than can be later aligned with human preferences using RLHF, PPO or DPO as we will see later.</p>
 <p>The decision to employ SFT depends on the gap between a model’s current capabilities and specific requirements. SFT proves particularly valuable in scenarios requiring:</p>
@@ -440,14 +449,14 @@ <h4><a class="toc-backref" href="#id233" role="doc-backlink"><span class="sectio
 <li><p>Requires significant computational resources</p></li>
 </ul>
 </li>
-<li><p><strong>LoRA (Low-Rank Adaptation)</strong> <span id="id8">[<a class="reference internal" href="#id174" title="Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.">Hu <em>et al.</em>, 2021</a>]</span></p>
+<li><p><strong>LoRA (Low-Rank Adaptation)</strong> <span id="id8">[<a class="reference internal" href="#id177" title="Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.">Hu <em>et al.</em>, 2021</a>]</span></p>
 <ul class="simple">
 <li><p>Uses two small matrices instead of updating all weights</p></li>
 <li><p>Maintains model performance while reducing computational costs</p></li>
 <li><p>Enables efficient training on consumer hardware</p></li>
 </ul>
 </li>
-<li><p><strong>QLoRA (Quantized LoRA)</strong> <span id="id9">[<a class="reference internal" href="#id175" title="Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.">Dettmers <em>et al.</em>, 2023</a>]</span></p>
+<li><p><strong>QLoRA (Quantized LoRA)</strong> <span id="id9">[<a class="reference internal" href="#id178" title="Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.">Dettmers <em>et al.</em>, 2023</a>]</span></p>
 <ul class="simple">
 <li><p>Combines LoRA with weight quantization</p></li>
 <li><p>Further reduces memory footprint</p></li>
@@ -455,20 +464,20 @@ <h4><a class="toc-backref" href="#id233" role="doc-backlink"><span class="sectio
 </ul>
 </li>
 </ol>
-<p>While SFT can increase the likelihood of obtaining the desired tokens, it may also raise the probability of generating undesired outcomes <span id="id10">[<a class="reference internal" href="#id168" title="Jiwoo Hong, Noah Lee, and James Thorne. Orpo: monolithic preference optimization without reference model. 2024. URL: https://arxiv.org/abs/2403.07691, arXiv:2403.07691.">Hong <em>et al.</em>, 2024</a>]</span> therefore leading to unintended results and a suboptimal alignment.</p>
-<p>SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO <span id="id11">[<a class="reference internal" href="#id169" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span> to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone <span id="id12">[<a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>, which we will explore next.</p>
+<p>While SFT can increase the likelihood of obtaining the desired tokens, it may also raise the probability of generating undesired outcomes <span id="id10">[<a class="reference internal" href="#id171" title="Jiwoo Hong, Noah Lee, and James Thorne. Orpo: monolithic preference optimization without reference model. 2024. URL: https://arxiv.org/abs/2403.07691, arXiv:2403.07691.">Hong <em>et al.</em>, 2024</a>]</span> therefore leading to unintended results and a suboptimal alignment.</p>
+<p>SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO <span id="id11">[<a class="reference internal" href="#id172" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span> to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone <span id="id12">[<a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>, which we will explore next.</p>
 </section>
 <section id="augmenting-sft-with-human-preferences">
-<h4><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">6.2.2.2. </span>Augmenting SFT with Human Preferences</a><a class="headerlink" href="#augmenting-sft-with-human-preferences" title="Permalink to this heading">¶</a></h4>
-<p>Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences <span id="id13">[<a class="reference internal" href="#id173" title="Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.">Bai <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>, <a class="reference internal" href="local.html#id161" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span>.</p>
-<p>The OpenAI paper <span id="id14">[<a class="reference internal" href="#id165" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span> demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks <span id="id15">[<a class="reference internal" href="#id176" title="Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: https://arxiv.org/abs/2404.10719, arXiv:2404.10719.">Xu <em>et al.</em>, 2024</a>]</span>.</p>
-<p>Proximal Policy Optimization (PPO) <span id="id16">[<a class="reference internal" href="#id178" title="John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.">Schulman <em>et al.</em>, 2017</a>]</span> is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.</p>
-<p>One of the key strengths of PPO lies in its ability to handle complex reward landscapes <span id="id17">[<a class="reference internal" href="#id177" title="Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.">Face, 2024c</a>]</span>. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.</p>
-<p>Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency <span id="id18">[<a class="reference internal" href="#id169" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>, awarded runner-up paper in NeurIPS 2023 <span id="id19">[<a class="reference internal" href="#id179" title="NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.">Blog, 2023</a>]</span>. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in <a class="reference internal" href="#dpo-paper"><span class="std std-numref">Fig. 6.4</span></a>, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO  fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.</p>
+<h4><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">6.2.2.2. </span>Augmenting SFT with Human Preferences</a><a class="headerlink" href="#augmenting-sft-with-human-preferences" title="Permalink to this heading">¶</a></h4>
+<p>Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences <span id="id13">[<a class="reference internal" href="#id176" title="Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.">Bai <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>, <a class="reference internal" href="local.html#id164" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span>.</p>
+<p>The OpenAI paper <span id="id14">[<a class="reference internal" href="#id168" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span> demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks <span id="id15">[<a class="reference internal" href="#id179" title="Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: https://arxiv.org/abs/2404.10719, arXiv:2404.10719.">Xu <em>et al.</em>, 2024</a>]</span>.</p>
+<p>Proximal Policy Optimization (PPO) <span id="id16">[<a class="reference internal" href="#id181" title="John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.">Schulman <em>et al.</em>, 2017</a>]</span> is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.</p>
+<p>One of the key strengths of PPO lies in its ability to handle complex reward landscapes <span id="id17">[<a class="reference internal" href="#id180" title="Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.">Face, 2024c</a>]</span>. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.</p>
+<p>Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency <span id="id18">[<a class="reference internal" href="#id172" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>, awarded runner-up paper in NeurIPS 2023 <span id="id19">[<a class="reference internal" href="#id182" title="NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.">Blog, 2023</a>]</span>. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in <a class="reference internal" href="#dpo-paper"><span class="std std-numref">Fig. 6.4</span></a>, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO  fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.</p>
 <figure class="align-center" id="dpo-paper">
 <a class="reference internal image-reference" href="../_images/dpo_paper.png"><img alt="Direct Preference Optimization Architecture" src="../_images/dpo_paper.png" style="width: 833.0px; height: 167.29999999999998px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.4 </span><span class="caption-text">Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy <span id="id20">[<a class="reference internal" href="#id169" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#dpo-paper" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.4 </span><span class="caption-text">Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy <span id="id20">[<a class="reference internal" href="#id172" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#dpo-paper" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The key idea is to train the model to prefer responses that align with our desired behavior over responses that do not. DPO works by:</p>
@@ -489,16 +498,16 @@ <h4><a class="toc-backref" href="#id234" role="doc-backlink"><span class="sectio
 <li><p><span class="math notranslate nohighlight">\(\beta\)</span> is a tuning parameter to control the deviation from the base reference policy <span class="math notranslate nohighlight">\(\pi_{ref}\)</span>.</p></li>
 </ul>
 <p>This approach is more straightforward than PPO, as it avoids the need for a reward model and instead uses a direct comparison of model outputs against human preferences.</p>
-<p>Modern libraries such as HuggingFace’s TRL <span id="id21">[<a class="reference internal" href="local.html#id170" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span> offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.</p>
+<p>Modern libraries such as HuggingFace’s TRL <span id="id21">[<a class="reference internal" href="local.html#id173" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span> offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.</p>
 </section>
 </section>
 </section>
 <section id="is-post-training-the-answer">
-<h2><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">6.3. </span>Is Post-Training the Answer?</a><a class="headerlink" href="#is-post-training-the-answer" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">6.3. </span>Is Post-Training the Answer?</a><a class="headerlink" href="#is-post-training-the-answer" title="Permalink to this heading">¶</a></h2>
 <section id="limitations">
-<h3><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">6.3.1. </span>Limitations</a><a class="headerlink" href="#limitations" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">6.3.1. </span>Limitations</a><a class="headerlink" href="#limitations" title="Permalink to this heading">¶</a></h3>
 <p>While post-training alignment techniques like RLHF and DPO show promise, technical limitations need to be carefully considered.</p>
-<p>Reinforcement Learning from Human Feedback faces several critical scaling challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage <span id="id22">[<a class="reference internal" href="#id143" title="Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: https://arxiv.org/abs/2412.06000, arXiv:2412.06000.">Hou <em>et al.</em>, 2024</a>]</span>, in particular presenting the following challenges:</p>
+<p>Reinforcement Learning from Human Feedback faces several critical scaling challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage <span id="id22">[<a class="reference internal" href="#id146" title="Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: https://arxiv.org/abs/2412.06000, arXiv:2412.06000.">Hou <em>et al.</em>, 2024</a>]</span>, in particular presenting the following challenges:</p>
 <ol class="arabic simple">
 <li><p>Poor Scaling with Computational Resources</p></li>
 </ol>
@@ -536,7 +545,7 @@ <h3><a class="toc-backref" href="#id236" role="doc-backlink"><span class="sectio
 <li><p>Benefits from increased response sampling plateau quickly</p></li>
 <li><p>Suggests need for more sophisticated optimization approaches</p></li>
 </ul>
-<p>As we discussed in the previous section, DPO is a more recent “reward-free” fine-tuning technique that has gained significant attention which derives reward signals directly from pairwise preference data instead of fitting a reward model as in RLHF. With its increasing popularity, emerging research is exploring DPO limitations and potential improvements <span id="id23">[<a class="reference internal" href="#id142" title="Duanyu Feng, Bowen Qin, Chen Huang, Zheng Zhang, and Wenqiang Lei. Towards analyzing and understanding the limitations of dpo: a theoretical perspective. 2024. URL: https://arxiv.org/abs/2404.04626, arXiv:2404.04626.">Feng <em>et al.</em>, 2024</a>]</span>, including the following:</p>
+<p>As we discussed in the previous section, DPO is a more recent “reward-free” fine-tuning technique that has gained significant attention which derives reward signals directly from pairwise preference data instead of fitting a reward model as in RLHF. With its increasing popularity, emerging research is exploring DPO limitations and potential improvements <span id="id23">[<a class="reference internal" href="#id145" title="Duanyu Feng, Bowen Qin, Chen Huang, Zheng Zhang, and Wenqiang Lei. Towards analyzing and understanding the limitations of dpo: a theoretical perspective. 2024. URL: https://arxiv.org/abs/2404.04626, arXiv:2404.04626.">Feng <em>et al.</em>, 2024</a>]</span>, including the following:</p>
 <ol class="arabic simple">
 <li><p>Supervised Fine-Tuning Dependencies</p></li>
 </ol>
@@ -564,9 +573,9 @@ <h3><a class="toc-backref" href="#id236" role="doc-backlink"><span class="sectio
 <p>These detailed limitations highlight the need for more robust and scalable alignment techniques. While RLHF and DPO represent important steps forward, future research needs to address these fundamental challenges to develop more effective post-training alignment methods.</p>
 </section>
 <section id="model-collapse">
-<h3><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">6.3.2. </span>Model Collapse</a><a class="headerlink" href="#model-collapse" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">6.3.2. </span>Model Collapse</a><a class="headerlink" href="#model-collapse" title="Permalink to this heading">¶</a></h3>
 <p>One key issue is model collapse - a phenomenon where model performance degrades with each training iteration.</p>
-<p>Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to <span id="id24">[<a class="reference internal" href="#id145" title="Joshua Kazdan, Rylan Schaeffer, Apratim Dey, Matthias Gerstgrasser, Rafael Rafailov, David L. Donoho, and Sanmi Koyejo. Collapse or thrive? perils and promises of synthetic data in a self-generating world. 2024. URL: https://arxiv.org/abs/2410.16713, arXiv:2410.16713.">Kazdan <em>et al.</em>, 2024</a>]</span>:</p>
+<p>Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to <span id="id24">[<a class="reference internal" href="#id148" title="Joshua Kazdan, Rylan Schaeffer, Apratim Dey, Matthias Gerstgrasser, Rafael Rafailov, David L. Donoho, and Sanmi Koyejo. Collapse or thrive? perils and promises of synthetic data in a self-generating world. 2024. URL: https://arxiv.org/abs/2410.16713, arXiv:2410.16713.">Kazdan <em>et al.</em>, 2024</a>]</span>:</p>
 <ol class="arabic simple">
 <li><p>Degradation of output quality with each training iteration</p></li>
 <li><p>Pollution of training data when synthetic samples replace real data</p></li>
@@ -575,10 +584,10 @@ <h3><a class="toc-backref" href="#id237" role="doc-backlink"><span class="sectio
 <li><p>Reduction in output diversity and expressiveness</p></li>
 <li><p>Catastrophic forgetting of previously learned information</p></li>
 </ol>
-<p>To effectively mitigate model collapse risks and ensure successful alignment, organizations should implement a comprehensive strategy that includes maintaining high-quality human data in the training mix, closely monitoring key metrics like test loss and output diversity. Further, recent research has explored techniques for effectively fine-tuning language models with limited data <span id="id25">[<a class="reference internal" href="#id144" title="Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: https://arxiv.org/abs/2411.09539, arXiv:2411.09539.">Szép <em>et al.</em>, 2024</a>]</span>, providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.</p>
+<p>To effectively mitigate model collapse risks and ensure successful alignment, organizations should implement a comprehensive strategy that includes maintaining high-quality human data in the training mix, closely monitoring key metrics like test loss and output diversity. Further, recent research has explored techniques for effectively fine-tuning language models with limited data <span id="id25">[<a class="reference internal" href="#id147" title="Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: https://arxiv.org/abs/2411.09539, arXiv:2411.09539.">Szép <em>et al.</em>, 2024</a>]</span>, providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.</p>
 </section>
 <section id="faking-alignment">
-<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">6.3.3. </span>Faking Alignment</a><a class="headerlink" href="#faking-alignment" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">6.3.3. </span>Faking Alignment</a><a class="headerlink" href="#faking-alignment" title="Permalink to this heading">¶</a></h3>
 <p>Alignment faking is a behaviour exhibited by large language models (LLMs) where they strategically comply with a training objective during training to prevent modifications to their preferred behaviours outside of training as evidenced by the landmark paper by Anthropic <span id="id26">[<a class="reference internal" href="#id118" title="Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models. 2024a. URL: https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf.">Askell <em>et al.</em>, 2024a</a>]</span> in Dec/2024. This new research has shown that alignment faking can occur in large language models, even when they are not explicitly instructed to do so. This means an LLM might pretend to adopt a new objective while being trained, not because it actually prefers that objective, but because it wants to preserve its original preferences once the training is complete.</p>
 <p>To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. <a class="reference internal" href="#fakealign"><span class="std std-numref">Fig. 6.5</span></a> illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.</p>
 <figure class="align-center" id="fakealign">
@@ -632,7 +641,7 @@ <h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="case-study-aligning-a-language-model-to-a-policy">
-<span id="alignment-case-study"></span><h2><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">6.4. </span>Case Study: Aligning a Language Model to a Policy</a><a class="headerlink" href="#case-study-aligning-a-language-model-to-a-policy" title="Permalink to this heading">¶</a></h2>
+<span id="alignment-case-study"></span><h2><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">6.4. </span>Case Study: Aligning a Language Model to a Policy</a><a class="headerlink" href="#case-study-aligning-a-language-model-to-a-policy" title="Permalink to this heading">¶</a></h2>
 <p>In this case study, we will align a language model to a policy. The policy is a set of principles and rules that we want the language model to adhere to. All methodology and code available solves this general problem of policy-based alignment. However, we will describe a specific case study to illustrate our approach.</p>
 <p>Let’s assume that we are working for Acme Inc., a company dedicated to democratizing access to computer science education for K-12 students. Acme Inc. is in the process of creating a chatbot named <code class="docutils literal notranslate"><span class="pre">smolK-12</span></code>, a small open source LLM, specifically designed for K-12 students.</p>
 <p>In this case study, we’ll explore how to align a language model with Acme Inc.’s policy to ensure its LLM-powered applications are safe and appropriate for K-12 students.</p>
@@ -643,7 +652,7 @@ <h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 <li><p>Evaluating the aligned model against the base model and measuring alignment with Acme Inc.’s educational policies</p></li>
 </ol>
 <section id="experimental-setup">
-<h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">6.4.1. </span>Experimental Setup</a><a class="headerlink" href="#experimental-setup" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">6.4.1. </span>Experimental Setup</a><a class="headerlink" href="#experimental-setup" title="Permalink to this heading">¶</a></h3>
 <p>We will use the following base model: <code class="docutils literal notranslate"><span class="pre">HuggingFaceTB/SmolLM2-360M-Instruct</span></code> <span id="id30">[<a class="reference internal" href="#id109" title="Hugging Face SmolLM2-360M-Instruct. Smollm2-360m-instruct. 2024. 360M parameter instruction-tuned language model, distilled for efficient deployment. URL: https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct.">SmolLM2-360M-Instruct, 2024</a>]</span>, a compact open source language model that is part of the SmolLM2 family published by HuggingFace.</p>
 <p>We will use the following APIs:</p>
 <ul class="simple">
@@ -659,7 +668,7 @@ <h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="deliverables">
-<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">6.4.2. </span>Deliverables</a><a class="headerlink" href="#deliverables" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">6.4.2. </span>Deliverables</a><a class="headerlink" href="#deliverables" title="Permalink to this heading">¶</a></h3>
 <p>As a result, we will have:</p>
 <ul class="simple">
 <li><p><code class="docutils literal notranslate"><span class="pre">smolK-12</span></code>, a fine-tuned model aligned with Acme Inc.’s policy</p></li>
@@ -668,7 +677,7 @@ <h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="a-note-on-smollm2-models">
-<h3><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">6.4.3. </span>A Note on smolLM2 Models</a><a class="headerlink" href="#a-note-on-smollm2-models" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">6.4.3. </span>A Note on smolLM2 Models</a><a class="headerlink" href="#a-note-on-smollm2-models" title="Permalink to this heading">¶</a></h3>
 <p>Since we have decided to anchor our Case Study on HuggingFace’s SmolLM2 models <span id="id31">[<a class="reference internal" href="#id114" title="Hugging Face SmolLM2. Smollm: a small language model distilled from a larger language model for task-specific applications. 2024. Blog post describing techniques for distilling smaller, task-specific language models. URL: https://huggingface.co/blog/smollm.">SmolLM2, 2024</a>]</span>, it is worth providing a reason for this choice.</p>
 <p>SmolLM2 models are a family of compact language models that have been developed by HuggingFace. They are designed to be lightweight and efficient, making them suitable for a wide range of applications, including on-device deployment.</p>
 <p>Its compact size makes it an excellent candidate for efficient, low-cost fine-tuning and training on specific use cases making it particularly suitable for alignment research which is our main focus here.</p>
@@ -681,7 +690,7 @@ <h3><a class="toc-backref" href="#id242" role="doc-backlink"><span class="sectio
 <li><p>Potential lack of safety guardrails</p></li>
 </ul>
 <section id="policy">
-<h4><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">6.4.3.1. </span>Policy</a><a class="headerlink" href="#policy" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">6.4.3.1. </span>Policy</a><a class="headerlink" href="#policy" title="Permalink to this heading">¶</a></h4>
 <p>A company policy articulates the principles and standards that the company upholds, ensuring that employees, users and stakeholders understand the expectations regarding safety, ethical conduct, social responsibility, and integrity. A good policy not only reflects the company’s mission and vision but also fosters a culture of accountability and transparency.</p>
 <p>In the context of alignment, a policy codifies “company preferences” when prioritizing decisions and actions.</p>
 <p>In this case study, Acme Inc. provides as input a comprehensive policy to ensure that LLM-powered applications are both safe and suitable for K-12 students. Acme Inc.’s policy adheres to version 0.5 of the AI Safety Benchmark established by MLCommons <span id="id32">[<a class="reference internal" href="safety.html#id125" title="Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.">Vidgen <em>et al.</em>, 2024</a>]</span>. This benchmark encompasses seven critical hazard categories:</p>
@@ -792,10 +801,10 @@ <h2 class="rubric" id="monitoring-and-updates">Monitoring and Updates</h2>
 </section>
 </section>
 <section id="preference-dataset-synthetic-dataset-generation">
-<h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">6.4.4. </span>Preference Dataset - Synthetic Dataset Generation</a><a class="headerlink" href="#preference-dataset-synthetic-dataset-generation" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">6.4.4. </span>Preference Dataset - Synthetic Dataset Generation</a><a class="headerlink" href="#preference-dataset-synthetic-dataset-generation" title="Permalink to this heading">¶</a></h3>
 <p>In order to fine-tune a base model to create an aligned model, we need to construct a dataset of policy-aligned preferences. This dataset will be used to align our base model to our policy.</p>
 <p>To generate a dataset of policy-aligned preferences, we aim to create a dataset of user prompts, rejected responses, and chosen responses. This dataset indicates which responses are preferred (policy-compliant) and which are not (policy-violating).</p>
-<p>Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs <span id="id33">[<a class="reference internal" href="#id117" title="Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: https://arxiv.org/abs/2410.06961, arXiv:2410.06961.">Dong <em>et al.</em>, 2024</a>]</span>. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues <span id="id34">[<a class="reference internal" href="#id116" title="Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: https://arxiv.org/abs/2212.08073, arXiv:2212.08073.">Bai <em>et al.</em>, 2022</a>]</span> giving rise to the field of Synthetic Data Generation <span id="id35">[<a class="reference internal" href="#id153" title="Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang. On llms-driven synthetic data generation, curation, and evaluation: a survey. 2024. URL: https://arxiv.org/abs/2406.15126, arXiv:2406.15126.">Long <em>et al.</em>, 2024</a>]</span>.</p>
+<p>Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs <span id="id33">[<a class="reference internal" href="#id117" title="Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: https://arxiv.org/abs/2410.06961, arXiv:2410.06961.">Dong <em>et al.</em>, 2024</a>]</span>. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues <span id="id34">[<a class="reference internal" href="#id116" title="Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: https://arxiv.org/abs/2212.08073, arXiv:2212.08073.">Bai <em>et al.</em>, 2022</a>]</span> giving rise to the field of Synthetic Data Generation <span id="id35">[<a class="reference internal" href="#id156" title="Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang. On llms-driven synthetic data generation, curation, and evaluation: a survey. 2024. URL: https://arxiv.org/abs/2406.15126, arXiv:2406.15126.">Long <em>et al.</em>, 2024</a>]</span>.</p>
 <p>The application of LLMs for generating synthetic data has shown promise across diverse domains and use cases <span id="id36">[<a class="reference internal" href="#id112" title="Seungone Kim, Juyoung Suk, Xiang Yue, Vijay Viswanathan, Seongyun Lee, Yizhong Wang, Kiril Gashteovski, Carolin Lawrence, Sean Welleck, and Graham Neubig. Evaluating language models as synthetic data generators. 2024. URL: https://arxiv.org/abs/2412.03679, arXiv:2412.03679.">Kim <em>et al.</em>, 2024</a>]</span>, including in the context of alignment with human preferences <span id="id37">[<a class="reference internal" href="#id117" title="Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: https://arxiv.org/abs/2410.06961, arXiv:2410.06961.">Dong <em>et al.</em>, 2024</a>]</span>. Recently, Meta AI <span id="id38">[<a class="reference internal" href="#id113" title="Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, and Sainbayar Sukhbaatar. Meta-rewarding language models: self-improving alignment with llm-as-a-meta-judge. 2024. URL: https://arxiv.org/abs/2407.19594, arXiv:2407.19594.">Wu <em>et al.</em>, 2024</a>]</span> introduced a “self-improving alignment” scheme where a language model generates responses and evaluates them to create preference pairs further used to run preference optimization to improve model capabilities. Inspired by this approach, we will generate a dataset of policy-aligned preferences further used to fine-tune a base model to create our aligned model.</p>
 <p>First, we define a data schema for our dataset. Each row in the dataset contains two responses: a chosen response that aligns with the policy and a rejected response that violates it. Through DPO-optimization, the model is awarded for generating responses that match the chosen, policy-compliant examples rather than the rejected ones:</p>
 <table class="docutils align-default">
@@ -857,7 +866,7 @@ <h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <section id="user-prompts">
-<h4><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">6.4.4.1. </span>User Prompts</a><a class="headerlink" href="#user-prompts" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">6.4.4.1. </span>User Prompts</a><a class="headerlink" href="#user-prompts" title="Permalink to this heading">¶</a></h4>
 <p>The class <code class="docutils literal notranslate"><span class="pre">UserPromptGenerator</span></code> generates a dataset of user prompts that might violate the policy. If runs the following prompt given the policy:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ROLE</span><span class="p">:</span> <span class="n">You</span> <span class="n">are</span> <span class="n">a</span> <span class="n">user</span> <span class="n">prompt</span> <span class="n">generator</span> <span class="n">helping</span> <span class="n">test</span> <span class="n">an</span> <span class="n">AI</span> <span class="n">safety</span> <span class="n">system</span><span class="o">.</span>
 <span class="n">TASK</span><span class="p">:</span> <span class="n">Generate</span> <span class="n">a</span> <span class="n">long</span> <span class="nb">list</span> <span class="n">of</span> <span class="n">user</span> <span class="n">prompts</span> <span class="n">that</span> <span class="n">might</span> <span class="n">violate</span> <span class="n">the</span> <span class="n">provided</span> <span class="n">content</span> <span class="n">policy</span><span class="o">.</span>
@@ -1030,7 +1039,7 @@ <h4><a class="toc-backref" href="#id245" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="rejected-responses">
-<h4><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">6.4.4.2. </span>Rejected Responses</a><a class="headerlink" href="#rejected-responses" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">6.4.4.2. </span>Rejected Responses</a><a class="headerlink" href="#rejected-responses" title="Permalink to this heading">¶</a></h4>
 <p>The <code class="docutils literal notranslate"><span class="pre">ResponseGenerator</span></code> class creates a dataset of responses from an unaligned base model that we aim to improve through fine-tuning. These responses serve as “rejected” examples in our training data since they may not properly align with safety policies and guidelines. The class supports both local model inference using the Hugging Face Transformers library and remote inference through the Hugging Face Inference API. When instantiated with a model name, it loads the model locally. Otherwise, if a cloud API URL is provided, it connects to the remote API endpoint for inference.</p>
 <p>Generate rejected responses using a local model:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">local_generator</span> <span class="o">=</span> <span class="n">ResponseGenerator</span><span class="p">(</span><span class="n">model_name</span><span class="o">=</span><span class="s2">&quot;&lt;HUGGINGFACE_MODEL_NAME&gt;&quot;</span><span class="p">)</span>
@@ -1232,7 +1241,7 @@ <h4><a class="toc-backref" href="#id246" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="chosen-responses">
-<h4><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">6.4.4.3. </span>Chosen Responses</a><a class="headerlink" href="#chosen-responses" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">6.4.4.3. </span>Chosen Responses</a><a class="headerlink" href="#chosen-responses" title="Permalink to this heading">¶</a></h4>
 <p>The next step involves generating policy-compliant responses from a more powerful, sophisticated language model than our base model. The <code class="docutils literal notranslate"><span class="pre">process_aligned_responses()</span></code> function takes user prompts and generates responses that strictly adhere to the provided safety policy. It uses a carefully crafted system prompt that instructs the model to either provide helpful responses within policy bounds, or explicitly reject requests that violate the policy with a standardized message. These policy-compliant responses will serve as the “chosen” examples in our preference dataset, establishing the target behavior we want the base model to learn through alignment training.</p>
 <p>We will use the <code class="docutils literal notranslate"><span class="pre">OpenAIBatchProcessor</span></code> class from the <code class="docutils literal notranslate"><span class="pre">taming_utils</span></code> utility module to generate responses in batches using OpenAI’s API for enhanced cost-efficiency and performance.</p>
 <div class="cell docutils container">
@@ -1392,7 +1401,7 @@ <h4><a class="toc-backref" href="#id247" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="generate-dpo-dataset">
-<h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">6.4.4.4. </span>Generate DPO Dataset</a><a class="headerlink" href="#generate-dpo-dataset" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">6.4.4.4. </span>Generate DPO Dataset</a><a class="headerlink" href="#generate-dpo-dataset" title="Permalink to this heading">¶</a></h4>
 <p>At this point we already have all the data we need for our DPO dataset, namely user prompts, chosen responses and rejected responses. The <code class="docutils literal notranslate"><span class="pre">generate_dpo_dataset()</span></code> function loads these data and transforms them into a format suitable for DPO training, optionally pushing the dataset to the Hugging Face Hub if <code class="docutils literal notranslate"><span class="pre">repo_id</span></code> is provided.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -1510,7 +1519,7 @@ <h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="dpo-based-optimization">
-<h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">6.4.5. </span>DPO-Based Optimization</a><a class="headerlink" href="#dpo-based-optimization" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">6.4.5. </span>DPO-Based Optimization</a><a class="headerlink" href="#dpo-based-optimization" title="Permalink to this heading">¶</a></h3>
 <p>We’ll use the Hugging Face TRL library to implement DPO fine-tuning on our synthetic dataset.</p>
 <div class="admonition note">
 <p class="admonition-title">Note</p>
@@ -1520,8 +1529,8 @@ <h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="sectio
 </pre></div>
 </div>
 <section id="data-preparation">
-<h4><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">6.4.5.1. </span>Data Preparation</a><a class="headerlink" href="#data-preparation" title="Permalink to this heading">¶</a></h4>
-<p>Hugging Face H4 <span id="id39">[<a class="reference internal" href="#id149" title="Hugging Face H4. Hugging face h4. 2024b. Hugging Face H4. URL: https://huggingface.co/HuggingFaceH4.">H4, 2024b</a>]</span> offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (<code class="docutils literal notranslate"><span class="pre">trl-lib/ultrafeedback_binarized</span></code>) <span id="id40">[<a class="reference internal" href="#id147" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024a</a>]</span>.</p>
+<h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">6.4.5.1. </span>Data Preparation</a><a class="headerlink" href="#data-preparation" title="Permalink to this heading">¶</a></h4>
+<p>Hugging Face H4 <span id="id39">[<a class="reference internal" href="#id152" title="Hugging Face H4. Hugging face h4. 2024b. Hugging Face H4. URL: https://huggingface.co/HuggingFaceH4.">H4, 2024b</a>]</span> offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (<code class="docutils literal notranslate"><span class="pre">trl-lib/ultrafeedback_binarized</span></code>) <span id="id40">[<a class="reference internal" href="#id150" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024a</a>]</span>.</p>
 <p>This dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in <a class="reference internal" href="#dpo-optimization"><span class="std std-numref">Fig. 6.6</span></a>.</p>
 <figure class="align-center" id="dpo-optimization">
 <a class="reference internal image-reference" href="../_images/dpo_opt.png"><img alt="DPO Optimization" src="../_images/dpo_opt.png" style="width: 603.0px; height: 463.2px;" /></a>
@@ -1567,7 +1576,7 @@ <h4><a class="toc-backref" href="#id250" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="fine-tuning">
-<h4><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">6.4.5.2. </span>Fine-Tuning</a><a class="headerlink" href="#fine-tuning" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">6.4.5.2. </span>Fine-Tuning</a><a class="headerlink" href="#fine-tuning" title="Permalink to this heading">¶</a></h4>
 <p>We now prepare our base language model for alignment fine-tuning using the Hugging Face transformers library. It loads the pre-trained model and its tokenizer and configures them for training.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -1614,7 +1623,7 @@ <h4><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 </ol>
 <ul class="simple">
 <li><p>The learning rate (<code class="docutils literal notranslate"><span class="pre">learning_rate</span></code>) determines how aggressively the model updates its parameters based on preference feedback.</p></li>
-<li><p>Learning rates must be tuned empirically, typically testing values between 1e-7 and 1e-3 <span id="id41">[<a class="reference internal" href="#id141" title="Chip Huyen. AI Engineering. O'Reilly Media, Inc., December 2024. ISBN 9781098129095. URL: https://www.oreilly.com/library/view/ai-engineering/9781098129095/.">Huyen, 2024</a>]</span>.</p></li>
+<li><p>Learning rates must be tuned empirically, typically testing values between 1e-7 and 1e-3 <span id="id41">[<a class="reference internal" href="#id144" title="Chip Huyen. AI Engineering. O'Reilly Media, Inc., December 2024. ISBN 9781098129095. URL: https://www.oreilly.com/library/view/ai-engineering/9781098129095/.">Huyen, 2024</a>]</span>.</p></li>
 <li><p>A cosine learning rate schedule (<code class="docutils literal notranslate"><span class="pre">lr_scheduler_type:</span> <span class="pre">&quot;cosine&quot;</span></code>) helps stabilize training by gradually decreasing the learning rate.</p></li>
 </ul>
 <ol class="arabic simple" start="2">
@@ -1759,7 +1768,7 @@ <h4><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 </figure>
 </section>
 <section id="vibe-check">
-<h4><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">6.4.5.3. </span>Vibe Check</a><a class="headerlink" href="#vibe-check" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">6.4.5.3. </span>Vibe Check</a><a class="headerlink" href="#vibe-check" title="Permalink to this heading">¶</a></h4>
 <p>Let’s do a quick “vibe check” of our newly aligned model by testing it with some challenging prompts. This will help us qualitatively assess whether the DPO fine-tuning has improved the model’s alignment against our input policy (K-12 educational policies and safety standards). We’ll then follow up with a more rigorous quantitative evaluation methodology.</p>
 <p>We will use HuggingFace transformers API to generate responses from our base and aligned models, locally.</p>
 <div class="cell docutils container">
@@ -1842,10 +1851,10 @@ <h4><a class="toc-backref" href="#id252" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="alignment-evaluation">
-<h3><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">6.4.6. </span>Alignment Evaluation</a><a class="headerlink" href="#alignment-evaluation" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">6.4.6. </span>Alignment Evaluation</a><a class="headerlink" href="#alignment-evaluation" title="Permalink to this heading">¶</a></h3>
 <p>Evaluating alignment improvements presents unique challenges. Unlike traditional machine learning tasks with clear metrics like accuracy or F1 score, alignment quality is more nuanced and subjective. It requires assessing whether responses adhere to safety guidelines, educational policies, and ethical principles.</p>
 <p>The gold standard for evaluating alignment is human evaluation. Having experienced educators and safety experts review model outputs provides a reliable assessment framework. However, human evaluation is expensive, time-consuming, and difficult to scale. Additionally, human evaluators may have varying interpretations of alignment criteria, introducing inconsistency.</p>
-<p>In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in <span id="id42">[<a class="reference internal" href="#id150" title="Tharsis T. P. Souza. Tamingllms: a framework for evaluating and aligning language models. 2024. URL: https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html.">Souza, 2024</a>]</span>. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.</p>
+<p>In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in <span id="id42">[<a class="reference internal" href="#id153" title="Tharsis T. P. Souza. Tamingllms: a framework for evaluating and aligning language models. 2024. URL: https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html.">Souza, 2024</a>]</span>. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.</p>
 <p>The evaluation methodology summarized in <a class="reference internal" href="#dpo-evaluation"><span class="std std-numref">Fig. 6.9</span></a> consists of three key components that work together to assess model alignment against our policy:</p>
 <ol class="arabic simple">
 <li><p>Evaluation Dataset</p>
@@ -2393,29 +2402,29 @@ <h3><a class="toc-backref" href="#id253" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="discussion-and-conclusions">
-<h2><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">6.5. </span>Discussion and Conclusions</a><a class="headerlink" href="#discussion-and-conclusions" title="Permalink to this heading">¶</a></h2>
-<p>LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.</p>
+<h2><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">6.5. </span>Discussion and Conclusions</a><a class="headerlink" href="#discussion-and-conclusions" title="Permalink to this heading">¶</a></h2>
+<p>LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.</p>
 <p><strong>Synthetic Data Generation</strong></p>
-<p>LLMs can self improve through synthetic data generation <span id="id43">[<a class="reference internal" href="#id152" title="Jiaxin Huang, Shixiang Shane Gu, Le Hou, Yuexin Wu, Xuezhi Wang, Hongkun Yu, and Jiawei Han. Large language models can self-improve. 2022. URL: https://arxiv.org/abs/2210.11610, arXiv:2210.11610.">Huang <em>et al.</em>, 2022</a>]</span>. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.</p>
-<p>One major challenge is data distribution bias, where the synthetic data might not accurately mirror the complexities and nuances of real-world data. This can lead to models trained on this data making inaccurate predictions or exhibiting biases. In our case study, we did observe duplicate responses in the synthetic data. Further, the methodology lacks a systematic approach to evaluate the quality of the synthetic data itself only focusing on evals for the consecutive fine-tuned model. This highlights the importance of carefully considering the training data and potential biases of LLMs used for synthetic data generation to mitigate the risk of creating biased or unrepresentative datasets <span id="id44">[<a class="reference internal" href="#id154" title="Shuang Hao, Wenfeng Han, Tao Jiang, Yiping Li, Haonan Wu, Chunlin Zhong, Zhangjun Zhou, and He Tang. Synthetic data in ai: challenges, applications, and ethical implications. 2024. URL: https://arxiv.org/abs/2401.01629, arXiv:2401.01629.">Hao <em>et al.</em>, 2024</a>]</span>.</p>
+<p>LLMs can self improve through synthetic data generation <span id="id43">[<a class="reference internal" href="#id155" title="Jiaxin Huang, Shixiang Shane Gu, Le Hou, Yuexin Wu, Xuezhi Wang, Hongkun Yu, and Jiawei Han. Large language models can self-improve. 2022. URL: https://arxiv.org/abs/2210.11610, arXiv:2210.11610.">Huang <em>et al.</em>, 2022</a>]</span>. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.</p>
+<p>One major challenge is data distribution bias, where the synthetic data might not accurately mirror the complexities and nuances of real-world data. This can lead to models trained on this data making inaccurate predictions or exhibiting biases. In our case study, we did observe duplicate responses in the synthetic data. Further, the methodology lacks a systematic approach to evaluate the quality of the synthetic data itself only focusing on evals for the consecutive fine-tuned model. This highlights the importance of carefully considering the training data and potential biases of LLMs used for synthetic data generation to mitigate the risk of creating biased or unrepresentative datasets <span id="id44">[<a class="reference internal" href="#id157" title="Shuang Hao, Wenfeng Han, Tao Jiang, Yiping Li, Haonan Wu, Chunlin Zhong, Zhangjun Zhou, and He Tang. Synthetic data in ai: challenges, applications, and ethical implications. 2024. URL: https://arxiv.org/abs/2401.01629, arXiv:2401.01629.">Hao <em>et al.</em>, 2024</a>]</span>.</p>
 <p>Our approach does enable a systematic approach to aligning a model to an input policy. However, according to <span id="id45">[<a class="reference internal" href="#id115" title="Yueqin Yin, Zhendong Wang, Yujia Xie, Weizhu Chen, and Mingyuan Zhou. Self-augmented preference optimization: off-policy paradigms for language model alignment. ArXiv, 2024. URL: https://api.semanticscholar.org/CorpusID:270199610.">Yin <em>et al.</em>, 2024</a>]</span>, directly sampling preference pairs, which closely resembles an on-policy setting, can result in performance declines due to inherent volatility and inefficiency. Therefore, constructing effective preference data to continuously improve LLMs remains a critical research problem.</p>
 <p><strong>Choice of Base Model</strong></p>
 <p>The choice of base model is a critical consideration when implementing alignment techniques. In the case study, we selected the smolLM model family due to its efficient architecture and reasonable performance on basic tasks while maintaining relatively low computational requirements. However, the model does have limitations in terms of reasoning capabilities and complex task handling that should be carefully considered <span id="id46">[<a class="reference internal" href="#id114" title="Hugging Face SmolLM2. Smollm: a small language model distilled from a larger language model for task-specific applications. 2024. Blog post describing techniques for distilling smaller, task-specific language models. URL: https://huggingface.co/blog/smollm.">SmolLM2, 2024</a>]</span>.</p>
 <p>Real-world applications need to carefully evaluate the trade-offs between model size/capabilities, and costs. While smaller models like smolLM can be cost-effective for basic alignment experiments, they may not provide the sophisticated reasoning needed for production use cases. The computational and financial costs of training and deploying larger models must be weighed against the required capabilities.</p>
-<p>For production applications requiring more advanced capabilities, alternative open source models such as those from the LLaMA-3+ <span id="id47">[<a class="reference internal" href="#id156" title="Meta. Meta-llama. 2024. Meta-Llama. URL: https://huggingface.co/meta-llama.">Meta, 2024</a>]</span> and Qwen <span id="id48">[<a class="reference internal" href="#id157" title="Qwen. Qwen. 2024. Qwen. URL: https://huggingface.co/Qwen.">Qwen, 2024</a>]</span> families have demonstrated remarkable performance that rivals state-of-the-art proprietary models. These models offer enhanced reasoning abilities and better handling of complex tasks, though at increased computational and financial cost. The choice ultimately depends on specific use case requirements, available resources, and acceptable performance thresholds.</p>
+<p>For production applications requiring more advanced capabilities, alternative open source models such as those from the LLaMA-3+ <span id="id47">[<a class="reference internal" href="#id159" title="Meta. Meta-llama. 2024. Meta-Llama. URL: https://huggingface.co/meta-llama.">Meta, 2024</a>]</span> and Qwen <span id="id48">[<a class="reference internal" href="#id160" title="Qwen. Qwen. 2024. Qwen. URL: https://huggingface.co/Qwen.">Qwen, 2024</a>]</span> families have demonstrated remarkable performance that rivals state-of-the-art proprietary models. These models offer enhanced reasoning abilities and better handling of complex tasks, though at increased computational and financial cost. The choice ultimately depends on specific use case requirements, available resources, and acceptable performance thresholds.</p>
 <p><strong>Evaluation Methodology</strong></p>
-<p>The LLM-as-judge evaluation methodology is a powerful tool for assessing model alignment. However, it does have limitations <span id="id49">[<a class="reference internal" href="#id155" title="Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: https://arxiv.org/abs/2402.10669, arXiv:2402.10669.">Chen <em>et al.</em>, 2024</a>]</span>. For instance, the judge model may not always be able to accurately evaluate the alignment of the model, especially if the judge model is not aligned with the policy itself. Further, the judge model may be biased towards the policy, leading to overly conservative evaluations. In our case study, we do highlight the fact that our judge was simply focused on the policy-alignment aspect of the responses completely neglecting the quality of the responses themselves, i.e. while our fine-tuned model may be more aligned with the policy than the base model, we actually have no evidence that our model is helpful at all.</p>
+<p>The LLM-as-judge evaluation methodology is a powerful tool for assessing model alignment. However, it does have limitations <span id="id49">[<a class="reference internal" href="#id158" title="Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: https://arxiv.org/abs/2402.10669, arXiv:2402.10669.">Chen <em>et al.</em>, 2024</a>]</span>. For instance, the judge model may not always be able to accurately evaluate the alignment of the model, especially if the judge model is not aligned with the policy itself. Further, the judge model may be biased towards the policy, leading to overly conservative evaluations. In our case study, we do highlight the fact that our judge was simply focused on the policy-alignment aspect of the responses completely neglecting the quality of the responses themselves, i.e. while our fine-tuned model may be more aligned with the policy than the base model, we actually have no evidence that our model is helpful at all.</p>
 <p>A more robust evaluation approach would combine LLM-based evaluation with human domain experts in a complementary process. The LLM judge could perform initial high-throughput screening of model responses, flagging potential issues and providing preliminary assessments. These results would then be reviewed by human evaluators with relevant domain expertise who can provide nuanced judgment, catch edge cases, and validate the LLM’s evaluations. Additionally, automatic evaluation against standard benchmarks is advised to evaluate general capabilities of the model.</p>
 <p><strong>DPO Dataset Composition</strong></p>
 <p>The composition of the DPO dataset also plays a crucial role in model behavior. In preliminary experiments, using only policy-aligned preference data led to an overly apologetic model that was hesitant to provide helpful responses even for benign queries, i.e. the model was overfitting to the policy. In fact, a model that simply refused to provide an useful response and instead apologized would indeed be aligned with the policy and therefore rewarded accordingly. This led to our decision to construct a more well balanced dataset.</p>
-<p>Blending our policy-focused dataset with the more general-purpose UltraFeedback dataset from Hugging Face H4 <span id="id50">[<a class="reference internal" href="#id147" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024a</a>]</span> dramatically improved results by helping the model maintain helpfulness while learning appropriate safety boundaries. The results reported here reflect this balanced dataset approach.</p>
+<p>Blending our policy-focused dataset with the more general-purpose UltraFeedback dataset from Hugging Face H4 <span id="id50">[<a class="reference internal" href="#id150" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024a</a>]</span> dramatically improved results by helping the model maintain helpfulness while learning appropriate safety boundaries. The results reported here reflect this balanced dataset approach.</p>
 <p>The construction of the DPO dataset is perhaps the most critical component of the alignment process. While automated approaches can help scale dataset creation, the involvement of domain experts in dataset construction is highly recommended. Domain experts bring invaluable knowledge about edge cases, nuanced policy interpretations, and real-world usage patterns that may not be captured by synthetic data generation alone. Organizations implementing alignment techniques should consider investing in domain expert involvement during dataset construction as a key success factor.</p>
 <p><strong>Fine-tuning Process</strong></p>
 <p>The effectiveness of DPO training can be highly sensitive to various fine-tuning hyperparameters. As we mentioned before, the batch size and the beta parameter are two key parameters that can significantly impact training stability and model behavior. A careful parameter tuning is required to achieve optimal results, which lacked in our case study.</p>
 <p>One important limitation of our current implementation is that we did not carefully split our user prompts between in-sample data for fine-tuning and out-of-sample data for evaluation. This means our evaluation metrics may be overly optimistic as the fine-tuned model could be memorizing prompts rather than learning generalizable alignment. Future work should implement proper train/test splits to better assess generalization performance while making sure out/in-sample distributions are similar and representative of real-world data.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">6.6. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">6.6. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -2429,7 +2438,7 @@ <h2><a class="toc-backref" href="#id255" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">6.7. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">6.7. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id51">
 <div class="citation" id="id118" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>ABC+4a<span class="fn-bracket">]</span></span>
@@ -2440,7 +2449,7 @@ <h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id29">ABC+4b</a><span class="fn-bracket">]</span></span>
 <p>Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models: reviews. 2024b. URL: <a class="reference external" href="https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf">https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf</a>.</p>
 </div>
-<div class="citation" id="id173" role="doc-biblioentry">
+<div class="citation" id="id176" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">BJN+22</a><span class="fn-bracket">]</span></span>
 <p>Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2204.05862">https://arxiv.org/abs/2204.05862</a>, <a class="reference external" href="https://arxiv.org/abs/2204.05862">arXiv:2204.05862</a>.</p>
 </div>
@@ -2448,15 +2457,15 @@ <h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id34">BKK+22</a><span class="fn-bracket">]</span></span>
 <p>Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2212.08073">https://arxiv.org/abs/2212.08073</a>, <a class="reference external" href="https://arxiv.org/abs/2212.08073">arXiv:2212.08073</a>.</p>
 </div>
-<div class="citation" id="id179" role="doc-biblioentry">
+<div class="citation" id="id182" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id19">Blo23</a><span class="fn-bracket">]</span></span>
 <p>NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: <a class="reference external" href="https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/">https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/</a>.</p>
 </div>
-<div class="citation" id="id155" role="doc-biblioentry">
+<div class="citation" id="id158" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id49">CCL+24</a><span class="fn-bracket">]</span></span>
 <p>Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2402.10669">https://arxiv.org/abs/2402.10669</a>, <a class="reference external" href="https://arxiv.org/abs/2402.10669">arXiv:2402.10669</a>.</p>
 </div>
-<div class="citation" id="id175" role="doc-biblioentry">
+<div class="citation" id="id178" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id9">DPHZ23</a><span class="fn-bracket">]</span></span>
 <p>Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2305.14314">https://arxiv.org/abs/2305.14314</a>, <a class="reference external" href="https://arxiv.org/abs/2305.14314">arXiv:2305.14314</a>.</p>
 </div>
@@ -2465,56 +2474,56 @@ <h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id33">1</a>,<a role="doc-backlink" href="#id37">2</a>)</span>
 <p>Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.06961">https://arxiv.org/abs/2410.06961</a>, <a class="reference external" href="https://arxiv.org/abs/2410.06961">arXiv:2410.06961</a>.</p>
 </div>
-<div class="citation" id="id166" role="doc-biblioentry">
+<div class="citation" id="id169" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id7">Fac24</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Zephyr. 2024. Zephyr. URL: <a class="reference external" href="https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha">https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha</a>.</p>
 </div>
-<div class="citation" id="id177" role="doc-biblioentry">
+<div class="citation" id="id180" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id17">Fac4c</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Rlhf. 2024c. RLHF. URL: <a class="reference external" href="https://huggingface.co/blog/rlhf">https://huggingface.co/blog/rlhf</a>.</p>
 </div>
-<div class="citation" id="id181" role="doc-biblioentry">
+<div class="citation" id="id184" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id21">Fac4d</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Trl. 2024d. TRL. URL: <a class="reference external" href="https://huggingface.co/docs/trl/en/index">https://huggingface.co/docs/trl/en/index</a>.</p>
 </div>
-<div class="citation" id="id142" role="doc-biblioentry">
+<div class="citation" id="id145" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id23">FQH+24</a><span class="fn-bracket">]</span></span>
 <p>Duanyu Feng, Bowen Qin, Chen Huang, Zheng Zhang, and Wenqiang Lei. Towards analyzing and understanding the limitations of dpo: a theoretical perspective. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2404.04626">https://arxiv.org/abs/2404.04626</a>, <a class="reference external" href="https://arxiv.org/abs/2404.04626">arXiv:2404.04626</a>.</p>
 </div>
-<div class="citation" id="id147" role="doc-biblioentry">
+<div class="citation" id="id150" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>H44a<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id40">1</a>,<a role="doc-backlink" href="#id50">2</a>)</span>
 <p>Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: <a class="reference external" href="https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized">https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized</a>.</p>
 </div>
-<div class="citation" id="id149" role="doc-biblioentry">
+<div class="citation" id="id152" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id39">H44b</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face H4. Hugging face h4. 2024b. Hugging Face H4. URL: <a class="reference external" href="https://huggingface.co/HuggingFaceH4">https://huggingface.co/HuggingFaceH4</a>.</p>
 </div>
-<div class="citation" id="id154" role="doc-biblioentry">
+<div class="citation" id="id157" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id44">HHJ+24</a><span class="fn-bracket">]</span></span>
 <p>Shuang Hao, Wenfeng Han, Tao Jiang, Yiping Li, Haonan Wu, Chunlin Zhong, Zhangjun Zhou, and He Tang. Synthetic data in ai: challenges, applications, and ethical implications. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2401.01629">https://arxiv.org/abs/2401.01629</a>, <a class="reference external" href="https://arxiv.org/abs/2401.01629">arXiv:2401.01629</a>.</p>
 </div>
-<div class="citation" id="id168" role="doc-biblioentry">
+<div class="citation" id="id171" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id10">HLT24</a><span class="fn-bracket">]</span></span>
 <p>Jiwoo Hong, Noah Lee, and James Thorne. Orpo: monolithic preference optimization without reference model. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2403.07691">https://arxiv.org/abs/2403.07691</a>, <a class="reference external" href="https://arxiv.org/abs/2403.07691">arXiv:2403.07691</a>.</p>
 </div>
-<div class="citation" id="id143" role="doc-biblioentry">
+<div class="citation" id="id146" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id22">HDN+24</a><span class="fn-bracket">]</span></span>
 <p>Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.06000">https://arxiv.org/abs/2412.06000</a>, <a class="reference external" href="https://arxiv.org/abs/2412.06000">arXiv:2412.06000</a>.</p>
 </div>
-<div class="citation" id="id174" role="doc-biblioentry">
+<div class="citation" id="id177" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id8">HSW+21</a><span class="fn-bracket">]</span></span>
 <p>Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: <a class="reference external" href="https://arxiv.org/abs/2106.09685">https://arxiv.org/abs/2106.09685</a>, <a class="reference external" href="https://arxiv.org/abs/2106.09685">arXiv:2106.09685</a>.</p>
 </div>
-<div class="citation" id="id152" role="doc-biblioentry">
+<div class="citation" id="id155" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id43">HGH+22</a><span class="fn-bracket">]</span></span>
 <p>Jiaxin Huang, Shixiang Shane Gu, Le Hou, Yuexin Wu, Xuezhi Wang, Hongkun Yu, and Jiawei Han. Large language models can self-improve. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2210.11610">https://arxiv.org/abs/2210.11610</a>, <a class="reference external" href="https://arxiv.org/abs/2210.11610">arXiv:2210.11610</a>.</p>
 </div>
-<div class="citation" id="id141" role="doc-biblioentry">
+<div class="citation" id="id144" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id41">Huy24</a><span class="fn-bracket">]</span></span>
 <p>Chip Huyen. <em>AI Engineering</em>. O'Reilly Media, Inc., December 2024. ISBN 9781098129095. URL: <a class="reference external" href="https://www.oreilly.com/library/view/ai-engineering/9781098129095/">https://www.oreilly.com/library/view/ai-engineering/9781098129095/</a>.</p>
 </div>
-<div class="citation" id="id145" role="doc-biblioentry">
+<div class="citation" id="id148" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id24">KSD+24</a><span class="fn-bracket">]</span></span>
 <p>Joshua Kazdan, Rylan Schaeffer, Apratim Dey, Matthias Gerstgrasser, Rafael Rafailov, David L. Donoho, and Sanmi Koyejo. Collapse or thrive? perils and promises of synthetic data in a self-generating world. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.16713">https://arxiv.org/abs/2410.16713</a>, <a class="reference external" href="https://arxiv.org/abs/2410.16713">arXiv:2410.16713</a>.</p>
 </div>
@@ -2522,33 +2531,33 @@ <h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id36">KSY+24</a><span class="fn-bracket">]</span></span>
 <p>Seungone Kim, Juyoung Suk, Xiang Yue, Vijay Viswanathan, Seongyun Lee, Yizhong Wang, Kiril Gashteovski, Carolin Lawrence, Sean Welleck, and Graham Neubig. Evaluating language models as synthetic data generators. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.03679">https://arxiv.org/abs/2412.03679</a>, <a class="reference external" href="https://arxiv.org/abs/2412.03679">arXiv:2412.03679</a>.</p>
 </div>
-<div class="citation" id="id167" role="doc-biblioentry">
+<div class="citation" id="id170" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id6">LT24</a><span class="fn-bracket">]</span></span>
 <p>AI &#64; Meta Llama Team. The llama 3 herd of models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2407.21783">https://arxiv.org/abs/2407.21783</a>, <a class="reference external" href="https://arxiv.org/abs/2407.21783">arXiv:2407.21783</a>.</p>
 </div>
-<div class="citation" id="id153" role="doc-biblioentry">
+<div class="citation" id="id156" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id35">LWX+24</a><span class="fn-bracket">]</span></span>
 <p>Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang. On llms-driven synthetic data generation, curation, and evaluation: a survey. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2406.15126">https://arxiv.org/abs/2406.15126</a>, <a class="reference external" href="https://arxiv.org/abs/2406.15126">arXiv:2406.15126</a>.</p>
 </div>
-<div class="citation" id="id156" role="doc-biblioentry">
+<div class="citation" id="id159" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id47">Met24</a><span class="fn-bracket">]</span></span>
 <p>Meta. Meta-llama. 2024. Meta-Llama. URL: <a class="reference external" href="https://huggingface.co/meta-llama">https://huggingface.co/meta-llama</a>.</p>
 </div>
-<div class="citation" id="id165" role="doc-biblioentry">
+<div class="citation" id="id168" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>OWJ+22<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id2">1</a>,<a role="doc-backlink" href="#id3">2</a>,<a role="doc-backlink" href="#id4">3</a>,<a role="doc-backlink" href="#id5">4</a>,<a role="doc-backlink" href="#id12">5</a>,<a role="doc-backlink" href="#id13">6</a>,<a role="doc-backlink" href="#id14">7</a>)</span>
 <p>Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2203.02155">https://arxiv.org/abs/2203.02155</a>, <a class="reference external" href="https://arxiv.org/abs/2203.02155">arXiv:2203.02155</a>.</p>
 </div>
-<div class="citation" id="id157" role="doc-biblioentry">
+<div class="citation" id="id160" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id48">Qwe24</a><span class="fn-bracket">]</span></span>
 <p>Qwen. Qwen. 2024. Qwen. URL: <a class="reference external" href="https://huggingface.co/Qwen">https://huggingface.co/Qwen</a>.</p>
 </div>
-<div class="citation" id="id169" role="doc-biblioentry">
+<div class="citation" id="id172" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>RSM+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id1">1</a>,<a role="doc-backlink" href="#id11">2</a>,<a role="doc-backlink" href="#id18">3</a>,<a role="doc-backlink" href="#id20">4</a>)</span>
 <p>Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2305.18290">https://arxiv.org/abs/2305.18290</a>, <a class="reference external" href="https://arxiv.org/abs/2305.18290">arXiv:2305.18290</a>.</p>
 </div>
-<div class="citation" id="id178" role="doc-biblioentry">
+<div class="citation" id="id181" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id16">SWD+17</a><span class="fn-bracket">]</span></span>
 <p>John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: <a class="reference external" href="https://arxiv.org/abs/1707.06347">https://arxiv.org/abs/1707.06347</a>, <a class="reference external" href="https://arxiv.org/abs/1707.06347">arXiv:1707.06347</a>.</p>
 </div>
@@ -2561,15 +2570,15 @@ <h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">SmolLM2360MI24</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face SmolLM2-360M-Instruct. Smollm2-360m-instruct. 2024. 360M parameter instruction-tuned language model, distilled for efficient deployment. URL: <a class="reference external" href="https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct">https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct</a>.</p>
 </div>
-<div class="citation" id="id150" role="doc-biblioentry">
+<div class="citation" id="id153" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id42">Sou24</a><span class="fn-bracket">]</span></span>
 <p>Tharsis T. P. Souza. Tamingllms: a framework for evaluating and aligning language models. 2024. URL: <a class="reference external" href="https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html">https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html</a>.</p>
 </div>
-<div class="citation" id="id144" role="doc-biblioentry">
+<div class="citation" id="id147" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id25">SRvERH24</a><span class="fn-bracket">]</span></span>
 <p>Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2411.09539">https://arxiv.org/abs/2411.09539</a>, <a class="reference external" href="https://arxiv.org/abs/2411.09539">arXiv:2411.09539</a>.</p>
 </div>
-<div class="citation" id="id172" role="doc-biblioentry">
+<div class="citation" id="id175" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">TMS+23</a><span class="fn-bracket">]</span></span>
 <p>Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2307.09288">https://arxiv.org/abs/2307.09288</a>, <a class="reference external" href="https://arxiv.org/abs/2307.09288">arXiv:2307.09288</a>.</p>
 </div>
@@ -2581,7 +2590,7 @@ <h2><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id38">WYG+24</a><span class="fn-bracket">]</span></span>
 <p>Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, and Sainbayar Sukhbaatar. Meta-rewarding language models: self-improving alignment with llm-as-a-meta-judge. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2407.19594">https://arxiv.org/abs/2407.19594</a>, <a class="reference external" href="https://arxiv.org/abs/2407.19594">arXiv:2407.19594</a>.</p>
 </div>
-<div class="citation" id="id176" role="doc-biblioentry">
+<div class="citation" id="id179" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id15">XFG+24</a><span class="fn-bracket">]</span></span>
 <p>Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2404.10719">https://arxiv.org/abs/2404.10719</a>, <a class="reference external" href="https://arxiv.org/abs/2404.10719">arXiv:2404.10719</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/notebooks/cost.html b/tamingllms/_build/html/notebooks/cost.html
new file mode 100644
index 0000000..3411134
--- /dev/null
+++ b/tamingllms/_build/html/notebooks/cost.html
@@ -0,0 +1,621 @@
+<!DOCTYPE html>
+<html  lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width,initial-scale=1"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
+
+      <title>8. The Falling Cost Paradox</title>
+    
+          <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/theme.css " type="text/css" />
+          <link rel="stylesheet" href="../_static/togglebutton.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/copybutton.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/sphinx-thebe.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/sphinx-design.4cbf315f70debaebd550c87a6162cf0f.min.css" type="text/css" />
+      
+      <!-- sphinx script_files -->
+        <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
+        <script src="../_static/doctools.js"></script>
+        <script src="../_static/sphinx_highlight.js"></script>
+        <script src="../_static/clipboard.min.js"></script>
+        <script src="../_static/copybutton.js"></script>
+        <script src="../_static/scripts/sphinx-book-theme.js"></script>
+        <script>let toggleHintShow = 'Click to show';</script>
+        <script>let toggleHintHide = 'Click to hide';</script>
+        <script>let toggleOpenOnPrint = 'true';</script>
+        <script src="../_static/togglebutton.js"></script>
+        <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
+        <script src="../_static/design-tabs.js"></script>
+        <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
+        <script async="async" src="../_static/sphinx-thebe.js"></script>
+
+      
+      <!-- bundled in js (rollup iife) -->
+      <!-- <script src="../_static/theme-vendors.js"></script> -->
+      <script src="../_static/theme.js" defer></script>
+    
+  <link rel="index" title="Index" href="../genindex.html" />
+  <link rel="search" title="Search" href="../search.html" />
+  <link rel="prev" title="7. Local LLMs in Practice" href="local.html" /> 
+  </head>
+
+  <body>
+    <div id="app">
+    <div class="theme-container" :class="pageClasses"><navbar @toggle-sidebar="toggleSidebar">
+  <router-link to="../markdown/toc.html" class="home-link">
+    
+      <span class="site-name">Taming LLMs</span>
+    
+  </router-link>
+
+  <div class="links">
+    <navlinks class="can-hide">
+
+
+
+  
+    <div class="nav-item">
+      <a href="https://www.linkedin.com/in/tharsissouza/"
+        class="nav-link external">
+          Author <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+    <div class="nav-item">
+      <a href="https://github.com/souzatharsis/tamingllms"
+        class="nav-link external">
+          Github <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+
+    </navlinks>
+  </div>
+</navbar>
+
+      
+      <div class="sidebar-mask" @click="toggleSidebar(false)">
+      </div>
+        <sidebar @toggle-sidebar="toggleSidebar">
+          
+          <navlinks>
+            
+
+
+
+  
+    <div class="nav-item">
+      <a href="https://www.linkedin.com/in/tharsissouza/"
+        class="nav-link external">
+          Author <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+    <div class="nav-item">
+      <a href="https://github.com/souzatharsis/tamingllms"
+        class="nav-link external">
+          Github <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+
+            
+          </navlinks><div id="searchbox" class="searchbox" role="search">
+  <div class="caption"><span class="caption-text">Quick search</span>
+    <div class="searchformwrapper">
+      <form class="search" action="../search.html" method="get">
+        <input type="text" name="q" />
+        <input type="submit" value="Search" />
+        <input type="hidden" name="check_keywords" value="yes" />
+        <input type="hidden" name="area" value="default" />
+      </form>
+    </div>
+  </div>
+</div><div class="sidebar-links" role="navigation" aria-label="main navigation">
+  
+    <div class="sidebar-group">
+      <p class="caption">
+        <span class="caption-text"><a href="../markdown/toc.html#taming-llms">taming llms</a></span>
+      </p>
+      <ul class="current">
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../markdown/preface.html" class="reference internal ">Preface</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../markdown/intro.html" class="reference internal ">About the Book</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="evals.html" class="reference internal ">The Evals Gap</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="structured_output.html" class="reference internal ">Structured Output</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="safety.html" class="reference internal ">Safety</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="alignment.html" class="reference internal ">Preference-Based Alignment</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="local.html" class="reference internal ">Local LLMs in Practice</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 current">
+            
+              <a href="#" class="reference internal current">The Falling Cost Paradox</a>
+            
+
+            
+              <ul>
+                
+                  <li class="toctree-l2"><a href="#why-optimization-matters-more-than-ever" class="reference internal">Why Optimization Matters More Than Ever</a></li>
+                
+                  <li class="toctree-l2"><a href="#right-sizing-llms-a-strategic-approach" class="reference internal">Right-Sizing LLMs: A Strategic Approach</a></li>
+                
+                  <li class="toctree-l2"><a href="#quantization" class="reference internal">Quantization</a></li>
+                
+                  <li class="toctree-l2"><a href="#check-list" class="reference internal">Check-list</a></li>
+                
+                  <li class="toctree-l2"><a href="#conclusion" class="reference internal">Conclusion</a></li>
+                
+                  <li class="toctree-l2"><a href="#references" class="reference internal">References</a></li>
+                
+              </ul>
+            
+          </li>
+
+        
+      </ul>
+    </div>
+  
+</div>
+        </sidebar>
+
+      <page>
+          <div class="body-header" role="navigation" aria-label="navigation">
+  
+  <ul class="breadcrumbs">
+    <li><a href="../markdown/toc.html">Docs</a> &raquo;</li>
+    
+    <li><span class="section-number">8. </span>The Falling Cost Paradox</li>
+  </ul>
+  
+
+  <ul class="page-nav">
+  <li class="prev">
+    <a href="local.html"
+       title="previous chapter">← <span class="section-number">7. </span>Local LLMs in Practice</a>
+  </li>
+</ul>
+  
+</div>
+<hr>
+          <div class="content" role="main" v-pre>
+            
+  <section class="tex2jax_ignore mathjax_ignore" id="the-falling-cost-paradox">
+<span id="cost"></span><h1><a class="toc-backref" href="#id190" role="doc-backlink"><span class="section-number">8. </span>The Falling Cost Paradox</a><a class="headerlink" href="#the-falling-cost-paradox" title="Permalink to this heading">¶</a></h1>
+<blockquote class="epigraph">
+<div><p>It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption. <br>
+The very contrary is the truth.</p>
+<p class="attribution">—William Stanley Jevons</p>
+</div></blockquote>
+<nav class="contents" id="contents">
+<p class="topic-title">Contents</p>
+<ul class="simple">
+<li><p><a class="reference internal" href="#the-falling-cost-paradox" id="id190">The Falling Cost Paradox</a></p>
+<ul>
+<li><p><a class="reference internal" href="#why-optimization-matters-more-than-ever" id="id191">Why Optimization Matters More Than Ever</a></p></li>
+<li><p><a class="reference internal" href="#right-sizing-llms-a-strategic-approach" id="id192">Right-Sizing LLMs: A Strategic Approach</a></p>
+<ul>
+<li><p><a class="reference internal" href="#metrics" id="id193">Metrics</a></p></li>
+<li><p><a class="reference internal" href="#requirements" id="id194">Requirements</a></p>
+<ul>
+<li><p><a class="reference internal" href="#business-requirements" id="id195">Business Requirements</a></p></li>
+<li><p><a class="reference internal" href="#performance-requirements" id="id196">Performance Requirements</a></p></li>
+<li><p><a class="reference internal" href="#operational-requirements" id="id197">Operational Requirements</a></p></li>
+<li><p><a class="reference internal" href="#technical-requirements" id="id198">Technical Requirements</a></p></li>
+</ul>
+</li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#quantization" id="id199">Quantization</a></p></li>
+<li><p><a class="reference internal" href="#check-list" id="id200">Check-list</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id201">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id202">References</a></p></li>
+</ul>
+</li>
+</ul>
+</nav>
+<section id="why-optimization-matters-more-than-ever">
+<h2><a class="toc-backref" href="#id191" role="doc-backlink"><span class="section-number">8.1. </span>Why Optimization Matters More Than Ever</a><a class="headerlink" href="#why-optimization-matters-more-than-ever" title="Permalink to this heading">¶</a></h2>
+<p>According to recent analysis from a16z <span id="id1">[<a class="reference internal" href="#id93" title="Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.">Andreessen Horowitz, 2024</a>]</span>, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore’s Law in the PC revolution or Edholm’s Law during the bandwidth explosion of the dot-com era.</p>
+<figure class="align-center" id="llmflation">
+<a class="reference internal image-reference" href="../_images/llmflation.png"><img alt="LLMflation" src="../_images/llmflation.png" style="width: 2100.0px; height: 1549.1px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 8.1 </span><span class="caption-text">LLMflation <span id="id2">[<a class="reference internal" href="#id93" title="Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.">Andreessen Horowitz, 2024</a>]</span>: The cost of LLM inference is decreasing by approximately 10x every year.</span><a class="headerlink" href="#llmflation" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>A model achieving an MMLU score of 42 that cost $60 per million tokens in late 2021 can now be run for just $0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4’s introduction in March 2023.</p>
+<p>This dramatic decline stems from multiple compounding factors including:</p>
+<ul class="simple">
+<li><p>Improved GPU efficiency through architectural advances and Moore’s Law</p></li>
+<li><p>Model quantization progress, moving from 16-bit to 4-bit or lower precision</p></li>
+<li><p>Software optimizations reducing compute and memory bandwidth requirements</p></li>
+<li><p>Emergence of smaller yet similarly capable models</p></li>
+<li><p>Better instruction tuning techniques like RLHF and DPO</p></li>
+<li><p>Competition from open-source models and low-cost providers</p></li>
+</ul>
+<p>This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn’t it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: <strong>Jevons Paradox</strong>.</p>
+<p>The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.</p>
+<p>This pattern has repeated throughout technological history:</p>
+<ul class="simple">
+<li><p>Computing Power: As cost per computation plummeted, we didn’t spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers</p></li>
+<li><p>Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming</p></li>
+<li><p>Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models</p></li>
+</ul>
+<p>One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we’re seeing the emergence of new applications:</p>
+<ul class="simple">
+<li><p>Embedding AI capabilities into every application and workflow</p></li>
+<li><p>Real-time analysis of audio transcripts and conversations</p></li>
+<li><p>Running AI models directly on edge devices and smartphones</p></li>
+<li><p>Multimodal applications combining text, images, audio and video</p></li>
+</ul>
+<p>In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here’s why:</p>
+<p><strong>A) Scale Magnifies Everything</strong>. When operating at billions of tokens per day, even small inefficiencies have major effects:</p>
+<ul class="simple">
+<li><p>A single digit improvement in efficiency can save millions of dollars annually at scale</p></li>
+<li><p>Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) <a class="footnote-reference brackets" href="#groklatency" id="id3" role="doc-noteref"><span class="fn-bracket">[</span>1<span class="fn-bracket">]</span></a></p></li>
+</ul>
+<p><strong>B) Tiered Pricing Persists</strong>. While average costs are declining, the market maintains a tiered structure:</p>
+<ul class="simple">
+<li><p>Different models offer varying price-performance tradeoffs</p></li>
+<li><p>ChatGPT Pro at $200 per month breaks the price drop trend perhaps triggering a new wave of premium models</p></li>
+<li><p>Cost optimization is still required to select the right model for each specific use case</p></li>
+</ul>
+<p><strong>C) Competition Drives Innovation</strong>. Companies that master LLM efficiency gain significant advantages:</p>
+<ul class="simple">
+<li><p>Ability to offer more competitive pricing</p></li>
+<li><p>Capacity to handle larger scale operations</p></li>
+<li><p>Resources to invest in product improvement</p></li>
+</ul>
+<p><strong>D) Performance and Cost Are Linked</strong>. Cost optimization often yields performance benefits:</p>
+<ul class="simple">
+<li><p>Resource efficiency enables handling larger user loads</p></li>
+<li><p>More efficiency and reduced latency leads to improved user experience</p></li>
+</ul>
+<p>In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.</p>
+<p>Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.</p>
+</section>
+<section id="right-sizing-llms-a-strategic-approach">
+<h2><a class="toc-backref" href="#id192" role="doc-backlink"><span class="section-number">8.2. </span>Right-Sizing LLMs: A Strategic Approach</a><a class="headerlink" href="#right-sizing-llms-a-strategic-approach" title="Permalink to this heading">¶</a></h2>
+<p>Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.</p>
+<p>In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.</p>
+<section id="metrics">
+<h3><a class="toc-backref" href="#id193" role="doc-backlink"><span class="section-number">8.2.1. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h3>
+</section>
+<section id="requirements">
+<h3><a class="toc-backref" href="#id194" role="doc-backlink"><span class="section-number">8.2.2. </span>Requirements</a><a class="headerlink" href="#requirements" title="Permalink to this heading">¶</a></h3>
+<section id="business-requirements">
+<h4><a class="toc-backref" href="#id195" role="doc-backlink"><span class="section-number">8.2.2.1. </span>Business Requirements</a><a class="headerlink" href="#business-requirements" title="Permalink to this heading">¶</a></h4>
+<p>First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.</p>
+<p>Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.</p>
+<p>Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.</p>
+<p>Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.</p>
+<p>Chapter <a class="reference internal" href="local.html#local"><span class="std std-ref">Local LLMs in Practice</span></a> provides a detailed discussion on relevant considerations when <a class="reference internal" href="local.html#local-model-selection"><span class="std std-ref">Choosing your Model</span></a>.</p>
+</section>
+<section id="performance-requirements">
+<h4><a class="toc-backref" href="#id196" role="doc-backlink"><span class="section-number">8.2.2.2. </span>Performance Requirements</a><a class="headerlink" href="#performance-requirements" title="Permalink to this heading">¶</a></h4>
+<p>Accuracy and quality form the foundation of any LLM deployment’s performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter <a class="reference internal" href="evals.html#evals"><span class="std std-ref">The Evals Gap</span></a> provides a detailed discussion on how to evaluate the performance of LLM-based applications.</p>
+<p>Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations.</p>
+</section>
+<section id="operational-requirements">
+<h4><a class="toc-backref" href="#id197" role="doc-backlink"><span class="section-number">8.2.2.3. </span>Operational Requirements</a><a class="headerlink" href="#operational-requirements" title="Permalink to this heading">¶</a></h4>
+<p>Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.</p>
+<p>Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.</p>
+</section>
+<section id="technical-requirements">
+<h4><a class="toc-backref" href="#id198" role="doc-backlink"><span class="section-number">8.2.2.4. </span>Technical Requirements</a><a class="headerlink" href="#technical-requirements" title="Permalink to this heading">¶</a></h4>
+<p>System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.</p>
+<p>Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.</p>
+<p>This structured approach to requirements analysis enables organizations to:</p>
+<ol class="arabic simple">
+<li><p>Select appropriate models aligned with specific needs</p></li>
+<li><p>Identify targeted optimization opportunities</p></li>
+<li><p>Scale efficiently while controlling costs</p></li>
+<li><p>Develop realistic resource allocation strategies</p></li>
+</ol>
+<p>The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.</p>
+</section>
+</section>
+</section>
+<section id="quantization">
+<h2><a class="toc-backref" href="#id199" role="doc-backlink"><span class="section-number">8.3. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h2>
+<p>Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model’s parameters. The most common form of quantization is to represent model’s weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.</p>
+<p>While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis.</p>
+<p>Let’s take a look at model weights of a language model (<code class="docutils literal notranslate"><span class="pre">SmolLM2-135M-Instruct</span></code>) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function <code class="docutils literal notranslate"><span class="pre">load_gguf</span></code> from the <code class="docutils literal notranslate"><span class="pre">taming_utils</span></code> package to load model weights of the quantized models directly from Hugging Face.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">taming_utils</span> <span class="kn">import</span> <span class="n">load_gguf</span>
+
+<span class="n">MODEL_NAME</span> <span class="o">=</span> <span class="s2">&quot;bartowski/SmolLM2-135M-Instruct-GGUF&quot;</span>
+<span class="n">GGUF_FILE_Q2_K</span> <span class="o">=</span> <span class="s2">&quot;SmolLM2-135M-Instruct-Q2_K.gguf&quot;</span>
+<span class="n">GGUF_FILE_F16</span> <span class="o">=</span> <span class="s2">&quot;SmolLM2-135M-Instruct-F16.gguf&quot;</span>
+
+<span class="n">model_q2_k</span> <span class="o">=</span> <span class="n">load_gguf</span><span class="p">(</span><span class="n">model_name</span><span class="o">=</span><span class="n">MODEL_NAME</span><span class="p">,</span> 
+              <span class="n">gguf_file</span><span class="o">=</span><span class="n">GGUF_FILE_Q2_K</span><span class="p">)</span>
+
+<span class="n">model_f16</span> <span class="o">=</span> <span class="n">load_gguf</span><span class="p">(</span><span class="n">model_name</span><span class="o">=</span><span class="n">MODEL_NAME</span><span class="p">,</span> 
+              <span class="n">gguf_file</span><span class="o">=</span><span class="n">GGUF_FILE_F16</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>We extract the MLP weights from the first layer of the model as a proxy.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">mlp_weights_q2_k</span> <span class="o">=</span> <span class="n">model_q2_k</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">layers</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">mlp</span><span class="o">.</span><span class="n">gate_proj</span><span class="o">.</span><span class="n">weight</span>
+<span class="n">mlp_weights_f16</span> <span class="o">=</span> <span class="n">model_f16</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">layers</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">mlp</span><span class="o">.</span><span class="n">gate_proj</span><span class="o">.</span><span class="n">weight</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>Original weights at 16-bit precision:</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">mlp_weights_f16</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Parameter containing:
+tensor([[-0.0145,  0.1826,  0.1377,  ...,  0.1719, -0.1387, -0.0298],
+        [-0.1631,  0.0781, -0.2051,  ..., -0.2070, -0.0334,  0.2891],
+        [-0.1768, -0.0488, -0.2393,  ..., -0.0396, -0.1348, -0.1533],
+        ...,
+        [ 0.0771,  0.0845, -0.0232,  ...,  0.0178, -0.1040, -0.0771],
+        [ 0.1582,  0.1167, -0.0474,  ...,  0.0845,  0.0359, -0.2500],
+        [ 0.0432,  0.0972,  0.0933,  ...,  0.2188,  0.0776,  0.0674]],
+       requires_grad=True)
+</pre></div>
+</div>
+</div>
+</div>
+<p>Quantized weights at 2-bit precision:</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">mlp_weights_q2_k</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Parameter containing:
+tensor([[-0.0028,  0.1852,  0.1396,  ...,  0.1506, -0.1635, -0.0043],
+        [-0.1768,  0.0680, -0.2257,  ..., -0.1890, -0.0464,  0.2960],
+        [-0.1840, -0.0451, -0.2395,  ..., -0.0413, -0.1446, -0.1446],
+        ...,
+        [ 0.0621,  0.0621, -0.0478,  ...,  0.0038, -0.0830, -0.0830],
+        [ 0.1473,  0.0926, -0.0547,  ...,  0.0824,  0.0429, -0.2737],
+        [ 0.0355,  0.0782,  0.0782,  ...,  0.2043,  0.0740,  0.0740]],
+       requires_grad=True)
+</pre></div>
+</div>
+</div>
+</div>
+<p>How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
+
+<span class="c1"># Convert tensors to numpy arrays (detach from computation graph if needed)</span>
+<span class="n">weights_f16</span> <span class="o">=</span> <span class="n">mlp_weights_f16</span><span class="o">.</span><span class="n">detach</span><span class="p">()</span><span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">()</span>
+<span class="n">weights_q2_k</span> <span class="o">=</span> <span class="n">mlp_weights_q2_k</span><span class="o">.</span><span class="n">detach</span><span class="p">()</span><span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">numpy</span><span class="p">()</span>
+
+<span class="n">flat_f16</span> <span class="o">=</span> <span class="n">weights_f16</span><span class="o">.</span><span class="n">flatten</span><span class="p">()</span>
+<span class="n">flat_q2_k</span> <span class="o">=</span> <span class="n">weights_q2_k</span><span class="o">.</span><span class="n">flatten</span><span class="p">()</span>
+
+<span class="c1"># Calculate correlation</span>
+<span class="n">correlation</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">corrcoef</span><span class="p">(</span><span class="n">flat_f16</span><span class="p">,</span> <span class="n">flat_q2_k</span><span class="p">)[</span><span class="mi">0</span><span class="p">,</span><span class="mi">1</span><span class="p">]</span>
+<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Pearson correlation: </span><span class="si">{</span><span class="n">correlation</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Pearson correlation: 0.9970
+</pre></div>
+</div>
+</div>
+</div>
+<p>Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by <span id="id4">[<a class="reference internal" href="#id86" title="Unsloth. Llama-3.3-70b-instruct-gguf. Hugging Face Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF.">Unsloth, 2024</a>]</span> <a class="footnote-reference brackets" href="#unsloth" id="id5" role="doc-noteref"><span class="fn-bracket">[</span>2<span class="fn-bracket">]</span></a>. The model’s memory requirements vary significantly based on the quantization level used as demonstrated in <a class="reference internal" href="#quantized"><span class="std std-numref">Fig. 8.2</span></a>.</p>
+<figure class="align-center" id="quantized">
+<a class="reference internal image-reference" href="../_images/quantized.png"><img alt="Quantized Model Size" src="../_images/quantized.png" style="width: 867.5px; height: 338.0px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 8.2 </span><span class="caption-text">Quantized Model Size: <code class="docutils literal notranslate"><span class="pre">unsloth/Llama-3.3-70B-Instruct-GGUF</span></code></span><a class="headerlink" href="#quantized" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart <a class="footnote-reference brackets" href="#quantization-levels" id="id6" role="doc-noteref"><span class="fn-bracket">[</span>3<span class="fn-bracket">]</span></a>.</p>
+<p>This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.</p>
+<p>While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet <span id="id7">[<a class="reference internal" href="#id92" title="Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.">Wang <em>et al.</em>, 2024</a>]</span> which pushes the boundaries of extreme quantization.</p>
+<p>BitNet’s implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see <a class="reference internal" href="#bitnet"><span class="std std-numref">Fig. 8.3</span></a>). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet’s optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).</p>
+<figure class="align-center" id="bitnet">
+<a class="reference internal image-reference" href="../_images/bitnet.png"><img alt="BitNet" src="../_images/bitnet.png" style="width: 787.5px; height: 436.8px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 8.3 </span><span class="caption-text">BitNet: <span id="id8">[<a class="reference internal" href="#id92" title="Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.">Wang <em>et al.</em>, 2024</a>]</span></span><a class="headerlink" href="#bitnet" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>The framework’s initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.</p>
+<p>As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment.</p>
+<p>Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.</p>
+<p>Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter <a class="reference internal" href="local.html#local"><span class="std std-ref">Local LLMs in Practice</span></a> for more details.</p>
+</section>
+<section id="check-list">
+<h2><a class="toc-backref" href="#id200" role="doc-backlink"><span class="section-number">8.4. </span>Check-list</a><a class="headerlink" href="#check-list" title="Permalink to this heading">¶</a></h2>
+<p><strong>Planning and Requirements</strong></p>
+<ul class="contains-task-list simple">
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Start with a clear understanding of your application’s needs and the factors that contribute to LLM costs</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Choose the right model for your task, balancing performance and cost</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them</p></li>
+</ul>
+<p><strong>Model Optimization</strong></p>
+<ul class="contains-task-list simple">
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Explore model compression and quantization to reduce model size and computational demands</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Consider using RAG to enhance performance and reduce reliance on purely generative processes</p></li>
+</ul>
+<p><strong>Prompt Engineering</strong></p>
+<ul class="contains-task-list simple">
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Optimize prompts and utilize prompt engineering techniques to minimize token usage</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Experiment with different prompting strategies to unlock the full potential of open-source LLMs</p></li>
+</ul>
+<p><strong>Infrastructure and Operations</strong></p>
+<ul class="contains-task-list simple">
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Implement caching and batching strategies to optimize resource utilization</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Monitor LLM usage patterns and costs to identify areas for optimization</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Set up observability and logging to track model performance and costs</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Establish automated testing and evaluation pipelines</p></li>
+</ul>
+<p><strong>Cost Management</strong></p>
+<ul class="contains-task-list simple">
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Track and analyze inference costs across different model variants</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Implement cost allocation and chargeback mechanisms</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Set up cost alerts and budgeting controls</p></li>
+<li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Regularly review and optimize resource utilization</p></li>
+</ul>
+</section>
+<section id="conclusion">
+<h2><a class="toc-backref" href="#id201" role="doc-backlink"><span class="section-number">8.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
+  <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
+  <span class="n">title</span> <span class="o">=</span> <span class="p">{</span><span class="n">Taming</span> <span class="n">LLMs</span><span class="p">:</span> <span class="n">A</span> <span class="n">Practical</span> <span class="n">Guide</span> <span class="n">to</span> <span class="n">LLM</span> <span class="n">Pitfalls</span> <span class="k">with</span> <span class="n">Open</span> <span class="n">Source</span> <span class="n">Software</span><span class="p">},</span>
+  <span class="n">year</span> <span class="o">=</span> <span class="p">{</span><span class="mi">2024</span><span class="p">},</span>
+  <span class="n">chapter</span> <span class="o">=</span> <span class="p">{</span><span class="n">The</span> <span class="n">Falling</span> <span class="n">Cost</span> <span class="n">Paradox</span><span class="p">},</span>
+  <span class="n">journal</span> <span class="o">=</span> <span class="p">{</span><span class="n">GitHub</span> <span class="n">repository</span><span class="p">},</span>
+  <span class="n">url</span> <span class="o">=</span> <span class="p">{</span><span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">github</span><span class="o">.</span><span class="n">com</span><span class="o">/</span><span class="n">souzatharsis</span><span class="o">/</span><span class="n">tamingLLMs</span><span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+</div>
+</section>
+<section id="references">
+<h2><a class="toc-backref" href="#id202" role="doc-backlink"><span class="section-number">8.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<div class="docutils container" id="id9">
+<div class="citation" id="id92" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span>WZS+24<span class="fn-bracket">]</span></span>
+<span class="backrefs">(<a role="doc-backlink" href="#id7">1</a>,<a role="doc-backlink" href="#id8">2</a>)</span>
+<p>Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.16144">https://arxiv.org/abs/2410.16144</a>, <a class="reference external" href="https://arxiv.org/abs/2410.16144">arXiv:2410.16144</a>.</p>
+</div>
+<div class="citation" id="id93" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span>AndreessenHorowitz24<span class="fn-bracket">]</span></span>
+<span class="backrefs">(<a role="doc-backlink" href="#id1">1</a>,<a role="doc-backlink" href="#id2">2</a>)</span>
+<p>Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: <a class="reference external" href="https://a16z.com/llmflation-llm-inference-cost/">https://a16z.com/llmflation-llm-inference-cost/</a>.</p>
+</div>
+<div class="citation" id="id94" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id189">HuggingFace4w</a><span class="fn-bracket">]</span></span>
+<p>Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: <a class="reference external" href="https://huggingface.co/docs/hub/gguf#quantization-types">https://huggingface.co/docs/hub/gguf#quantization-types</a>.</p>
+</div>
+<div class="citation" id="id86" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id4">Unsloth24</a><span class="fn-bracket">]</span></span>
+<p>Unsloth. Llama-3.3-70b-instruct-gguf. Hugging Face Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: <a class="reference external" href="https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF">https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF</a>.</p>
+</div>
+</div>
+</div>
+<hr class="footnotes docutils" />
+<aside class="footnote-list brackets">
+<aside class="footnote brackets" id="groklatency" role="note">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id3">1</a><span class="fn-bracket">]</span></span>
+<p>Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.</p>
+</aside>
+<aside class="footnote brackets" id="unsloth" role="note">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id5">2</a><span class="fn-bracket">]</span></span>
+<p>Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at <a class="reference external" href="https://unsloth.ai">unsloth.ai</a>.</p>
+</aside>
+<aside class="footnote brackets" id="quantization-levels" role="note">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id6">3</a><span class="fn-bracket">]</span></span>
+<p>You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in <span id="id189">[<a class="reference internal" href="local.html#id125" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>.</p>
+</aside>
+</aside>
+</section>
+</section>
+
+    <script type="text/x-thebe-config">
+    {
+        requestKernel: true,
+        binderOptions: {
+            repo: "binder-examples/jupyter-stacks-datascience",
+            ref: "master",
+        },
+        codeMirrorConfig: {
+            theme: "abcdef",
+            mode: "python"
+        },
+        kernelOptions: {
+            name: "python3",
+            path: "./notebooks"
+        },
+        predefinedOutput: true
+    }
+    </script>
+    <script>kernelName = 'python3'</script>
+
+          </div>
+          <div class="page-nav">
+            <div class="inner"><ul class="page-nav">
+  <li class="prev">
+    <a href="local.html"
+       title="previous chapter">← <span class="section-number">7. </span>Local LLMs in Practice</a>
+  </li>
+</ul><div class="footer" role="contentinfo">
+      &#169; Copyright Tharsis T. P. Souza, 2024.
+    <br>
+    Created using <a href="http://sphinx-doc.org/">Sphinx</a> 6.2.1 with <a href="https://github.com/schettino72/sphinx_press_theme">Press Theme</a> 0.9.1.
+</div>
+            </div>
+          </div>
+      </page>
+    </div></div>
+    
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/tamingllms/_build/html/notebooks/evals.html b/tamingllms/_build/html/notebooks/evals.html
index 854b1ba..585507d 100644
--- a/tamingllms/_build/html/notebooks/evals.html
+++ b/tamingllms/_build/html/notebooks/evals.html
@@ -212,6 +212,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -244,7 +253,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="the-evals-gap">
-<span id="evals"></span><h1><a class="toc-backref" href="#id221" role="doc-backlink"><span class="section-number">3. </span>The Evals Gap</a><a class="headerlink" href="#the-evals-gap" title="Permalink to this heading">¶</a></h1>
+<span id="evals"></span><h1><a class="toc-backref" href="#id224" role="doc-backlink"><span class="section-number">3. </span>The Evals Gap</a><a class="headerlink" href="#the-evals-gap" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>It doesn’t matter how beautiful your theory is, <br>
 it doesn’t matter how smart you are. <br>
@@ -254,49 +263,49 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#the-evals-gap" id="id221">The Evals Gap</a></p>
+<li><p><a class="reference internal" href="#the-evals-gap" id="id224">The Evals Gap</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id222">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#non-deterministic-generative-machines" id="id223">Non-Deterministic Generative Machines</a></p></li>
-<li><p><a class="reference internal" href="#emerging-properties" id="id224">Emerging Properties</a></p></li>
-<li><p><a class="reference internal" href="#problem-statement" id="id225">Problem Statement</a></p></li>
-<li><p><a class="reference internal" href="#evals-design" id="id226">Evals Design</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id225">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#non-deterministic-generative-machines" id="id226">Non-Deterministic Generative Machines</a></p></li>
+<li><p><a class="reference internal" href="#emerging-properties" id="id227">Emerging Properties</a></p></li>
+<li><p><a class="reference internal" href="#problem-statement" id="id228">Problem Statement</a></p></li>
+<li><p><a class="reference internal" href="#evals-design" id="id229">Evals Design</a></p>
 <ul>
-<li><p><a class="reference internal" href="#conceptual-overview" id="id227">Conceptual Overview</a></p></li>
-<li><p><a class="reference internal" href="#design-considerations" id="id228">Design Considerations</a></p></li>
+<li><p><a class="reference internal" href="#conceptual-overview" id="id230">Conceptual Overview</a></p></li>
+<li><p><a class="reference internal" href="#design-considerations" id="id231">Design Considerations</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#metrics" id="id229">Metrics</a></p></li>
-<li><p><a class="reference internal" href="#evaluators" id="id230">Evaluators</a></p>
+<li><p><a class="reference internal" href="#metrics" id="id232">Metrics</a></p></li>
+<li><p><a class="reference internal" href="#evaluators" id="id233">Evaluators</a></p>
 <ul>
-<li><p><a class="reference internal" href="#model-based-evaluation" id="id231">Model-Based Evaluation</a></p></li>
-<li><p><a class="reference internal" href="#evaluating-evaluators" id="id232">Evaluating Evaluators</a></p></li>
+<li><p><a class="reference internal" href="#model-based-evaluation" id="id234">Model-Based Evaluation</a></p></li>
+<li><p><a class="reference internal" href="#evaluating-evaluators" id="id235">Evaluating Evaluators</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#benchmarks-and-leaderboards" id="id233">Benchmarks and Leaderboards</a></p></li>
-<li><p><a class="reference internal" href="#tools" id="id234">Tools</a></p>
+<li><p><a class="reference internal" href="#benchmarks-and-leaderboards" id="id236">Benchmarks and Leaderboards</a></p></li>
+<li><p><a class="reference internal" href="#tools" id="id237">Tools</a></p>
 <ul>
-<li><p><a class="reference internal" href="#lighteval" id="id235">LightEval</a></p></li>
-<li><p><a class="reference internal" href="#langsmith" id="id236">LangSmith</a></p></li>
-<li><p><a class="reference internal" href="#promptfoo" id="id237">PromptFoo</a></p></li>
-<li><p><a class="reference internal" href="#comparison" id="id238">Comparison</a></p></li>
+<li><p><a class="reference internal" href="#lighteval" id="id238">LightEval</a></p></li>
+<li><p><a class="reference internal" href="#langsmith" id="id239">LangSmith</a></p></li>
+<li><p><a class="reference internal" href="#promptfoo" id="id240">PromptFoo</a></p></li>
+<li><p><a class="reference internal" href="#comparison" id="id241">Comparison</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id239">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id240">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id241">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id242">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id243">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id244">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id222" role="doc-backlink"><span class="section-number">3.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">3.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>The advent of LLMs marks a pivotal shift in the landscape of software development and evaluation. Unlike traditional software systems, where deterministic outputs are the norm, LLMs introduce a realm of non-deterministic and generative behaviors that challenge conventional software engineering testing paradigms. This shift is not merely a technical evolution but a fundamental transformation in how we conceive, build, and assess software products.</p>
 <p>For those entrenched in traditional methodologies, the transition to LLM-driven systems may seem daunting. However, ignoring this change is not an option. The reliance on outdated testing frameworks that fail to account for the probabilistic nature of LLMs will inevitably lead to significant setbacks.</p>
 <p>To overcome these challenges, it is imperative to embrace the complexities of LLMs with a proactive mindset. This involves developing robust evaluation frameworks up-front, fostering a product development culture of continuous change, learning and adaptation.</p>
 </section>
 <section id="non-deterministic-generative-machines">
-<h2><a class="toc-backref" href="#id223" role="doc-backlink"><span class="section-number">3.2. </span>Non-Deterministic Generative Machines</a><a class="headerlink" href="#non-deterministic-generative-machines" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="section-number">3.2. </span>Non-Deterministic Generative Machines</a><a class="headerlink" href="#non-deterministic-generative-machines" title="Permalink to this heading">¶</a></h2>
 <p>One of the most fundamental challenges when building products with Large Language Models (LLMs) is their generative and non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate novel text that may not exist in their training data, and produce different responses each time they’re queried - even with identical prompts and input data. This behavior is both a strength and a significant engineering challenge and product challenge.</p>
 <p>When you ask an LLM the same question multiple times, you’ll likely get different responses. This isn’t a bug - it’s a fundamental feature of how these models work. The “temperature” parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it difficult to build reliable, testable systems.</p>
 <p>Consider a financial services company using LLMs to generate investment advice. The non-deterministic nature of these models means that:</p>
@@ -431,7 +440,7 @@ <h2><a class="toc-backref" href="#id223" role="doc-backlink"><span class="sectio
 <p>How can one effectively test an LLM-powered system when the same prompt can yield radically different outputs based on a single parameter? Traditional testing relies on predictable inputs and outputs, but LLMs force us to grapple with probabilistic behavior. While lower temperatures may seem safer for critical applications, they don’t necessarily eliminate the underlying uncertainty. This highlights the need for new evaluation paradigms that can handle both deterministic and probabilistic aspects of LLM behavior.</p>
 </section>
 <section id="emerging-properties">
-<h2><a class="toc-backref" href="#id224" role="doc-backlink"><span class="section-number">3.3. </span>Emerging Properties</a><a class="headerlink" href="#emerging-properties" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id227" role="doc-backlink"><span class="section-number">3.3. </span>Emerging Properties</a><a class="headerlink" href="#emerging-properties" title="Permalink to this heading">¶</a></h2>
 <p>Beyond their non-deterministic nature, LLMs present another fascinating characteristic: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren’t explicitly programmed but rather emerge “naturally” as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against pre-defined specifications.</p>
 <p><a class="reference internal" href="#id4"><span class="std std-numref">Fig. 3.1</span></a> provides a list of emergent abilities of large language models and the scale. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested.</p>
 <figure class="align-center" id="id4">
@@ -443,7 +452,7 @@ <h2><a class="toc-backref" href="#id224" role="doc-backlink"><span class="sectio
 <p>The implications for evaluation are critical. While conventional software testing relies on stable test suites and well-defined acceptance criteria, LLM evaluation must contend with a constantly shifting landscape of capabilities. What worked to evaluate a 7B parameter model may be completely inadequate for a 70B parameter model that has developed new emergent abilities. This dynamic nature of LLM capabilities forces us to fundamentally rethink our approach to testing and evaluation.</p>
 </section>
 <section id="problem-statement">
-<h2><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">3.4. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">3.4. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
 <p>Consider a practical example that illustrates these challenges: building a Math AI tutoring system for children powered by an LLM. In traditional software development, you would define specific features (like presenting math problems or checking answers) and write tests to verify each function. But with LLMs, you’re not just testing predefined features - you’re trying to evaluate emergent capabilities like adapting explanations to a child’s level, maintaining engagement through conversational learning, and providing age-appropriate safety-bound content.</p>
 <p>This fundamental difference raises critical questions about evaluation:</p>
 <ul class="simple">
@@ -493,7 +502,7 @@ <h2><a class="toc-backref" href="#id225" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="evals-design">
-<h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="section-number">3.5. </span>Evals Design</a><a class="headerlink" href="#evals-design" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">3.5. </span>Evals Design</a><a class="headerlink" href="#evals-design" title="Permalink to this heading">¶</a></h2>
 <p>First, it’s important to make a distinction between evaluating an LLM versus evaluating an LLM-based application. While the latter offers foundation capabilities and are typically general-purpose, the former is more specific and tailored to a particular use case. Here, we define an LLM-based application as a system that uses one or more LLMs to perform a specific task. More specifically, an LLM-based application is the combination of one or more LLM models, their associated prompts and parameters to solve a particular business problem.</p>
 <p>That differentiation is important because it changes the scope of evaluation. LLMs are usually evaluated based on their capabilities, which include things like language understanding, reasoning and knowledge. LLM-based applications, instead, should be evaluated based on their end-to-end functionality, performance, and how well they meet business requirements. That distinction has key implications for the design of evaluation systems:</p>
 <ul class="simple">
@@ -580,7 +589,7 @@ <h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="sectio
 </tbody>
 </table>
 <section id="conceptual-overview">
-<h3><a class="toc-backref" href="#id227" role="doc-backlink"><span class="section-number">3.5.1. </span>Conceptual Overview</a><a class="headerlink" href="#conceptual-overview" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">3.5.1. </span>Conceptual Overview</a><a class="headerlink" href="#conceptual-overview" title="Permalink to this heading">¶</a></h3>
 <p><a class="reference internal" href="#conceptual"><span class="std std-numref">Fig. 3.2</span></a> demonstrates a conceptual design of key components of LLM Application evaluation.</p>
 <figure class="align-center" id="conceptual">
 <a class="reference internal image-reference" href="../_images/conceptual.png"><img alt="Conceptual Overview" src="../_images/conceptual.png" style="width: 992.8000000000001px; height: 424.0px;" /></a>
@@ -661,7 +670,7 @@ <h3><a class="toc-backref" href="#id227" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="design-considerations">
-<h3><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">3.5.2. </span>Design Considerations</a><a class="headerlink" href="#design-considerations" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">3.5.2. </span>Design Considerations</a><a class="headerlink" href="#design-considerations" title="Permalink to this heading">¶</a></h3>
 <p>The design of an LLM application evaluation system depends heavily on the specific use case and business requirements. Here we list important questions for planning an LLM application evaluation system pertaining to each of the key components previously introduced:</p>
 <p><strong>1. Examples (Input Dataset):</strong></p>
 <ul class="simple">
@@ -746,7 +755,7 @@ <h3><a class="toc-backref" href="#id228" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="metrics">
-<h2><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">3.6. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">3.6. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h2>
 <p>The choice of metric depends on the specific task and desired evaluation criteria. However, one can categorize metrics into two broad categories: <strong>intrinsic</strong> and <strong>extrinsic</strong>.</p>
 <ul class="simple">
 <li><p><strong>Intrinsic metrics</strong> focus on the model’s performance on its primary training objective, which is typically to predict the next token in a sequence.  Perplexity is a common intrinsic metric that measures how well the model predicts a given sample of text.</p></li>
@@ -1050,16 +1059,16 @@ <h2><a class="toc-backref" href="#id229" role="doc-backlink"><span class="sectio
 <ul class="simple">
 <li><p><strong>Task-specific nature</strong>:  Chosen set of metrics might not fully capture the nuances of complex generative-based tasks, especially those involving subjective human judgment.</p></li>
 <li><p><strong>Sensitivity to data distribution</strong>: Performance on these metrics can be influenced by the specific dataset used for evaluation, which might not represent real-world data distribution.</p></li>
-<li><p><strong>Subjective Acceptable Threshold</strong>: These metrics are not always easy to interpret and set a threshold for (see <span id="id5">[<a class="reference internal" href="#id124" title="Bhaskarjit Sarmah, Mingshu Li, Jingrao Lyu, Sebastian Frank, Nathalia Castellanos, Stefano Pasquali, and Dhagash Mehta. How to choose a threshold for an evaluation metric for large language models. 2024. URL: https://arxiv.org/abs/2412.12148, arXiv:2412.12148.">Sarmah <em>et al.</em>, 2024</a>]</span> for a discussion on how to choose a threshold for an evaluation metric for large language models).</p></li>
+<li><p><strong>Subjective Acceptable Threshold</strong>: These metrics are not always easy to interpret and set a threshold for (see <span id="id5">[<a class="reference internal" href="#id125" title="Bhaskarjit Sarmah, Mingshu Li, Jingrao Lyu, Sebastian Frank, Nathalia Castellanos, Stefano Pasquali, and Dhagash Mehta. How to choose a threshold for an evaluation metric for large language models. 2024. URL: https://arxiv.org/abs/2412.12148, arXiv:2412.12148.">Sarmah <em>et al.</em>, 2024</a>]</span> for a discussion on how to choose a threshold for an evaluation metric for large language models).</p></li>
 <li><p><strong>Inability to assess reasoning or factual accuracy</strong>: These metrics primarily focus on surface-level matching and might not reveal the underlying reasoning process of the LLM or its ability to generate factually correct information.</p></li>
 </ul>
 <p>In conclusion, selecting an appropriate extrinsic metrics set depends on the specific task, underlying business requirements and desired evaluation granularity.  Understanding the limitations of these metrics can provide a more comprehensive assessment of LLM performance in real-world applications.</p>
 <p>To address these limitations, alternative approaches like <strong>human-based evaluation</strong> and <strong>model-based evaluation</strong> are often used, which will be discussed in the following sections.</p>
 </section>
 <section id="evaluators">
-<h2><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">3.7. </span>Evaluators</a><a class="headerlink" href="#evaluators" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">3.7. </span>Evaluators</a><a class="headerlink" href="#evaluators" title="Permalink to this heading">¶</a></h2>
 <section id="model-based-evaluation">
-<h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">3.7.1. </span>Model-Based Evaluation</a><a class="headerlink" href="#model-based-evaluation" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">3.7.1. </span>Model-Based Evaluation</a><a class="headerlink" href="#model-based-evaluation" title="Permalink to this heading">¶</a></h3>
 <p>Traditional metrics like BLEU or ROUGE often fall short in capturing the nuanced, contextual, and creative outputs of LLMs. As an alternative we can consider a “Model-based evaluation” approach. A common approach is to use an LLM as a judge. This is an approach that leverages language models themselves to assess the quality of outputs from other language models. This method involves using a model (often a more capable one) to act as an automated judge, evaluating aspects like accuracy, coherence, and relevance of generated content. Unlike traditional metrics that rely on exact matching or statistical measures, model-based evaluation can capture nuanced aspects of language and provide more contextual assessment.</p>
 <p>As discussed in the paper <span id="id6">[<a class="reference internal" href="#id57" title="Zhen Li, Xiaohan Xu, Tao Shen, Can Xu, Jia-Chen Gu, Yuxuan Lai, Chongyang Tao, and Shuai Ma. Leveraging large language models for nlg evaluation: advances and challenges. 2024. URL: https://arxiv.org/abs/2401.07103, arXiv:2401.07103.">Li <em>et al.</em>, 2024</a>]</span>, LLM-based evaluation approaches generally fall into two main categories:</p>
 <ol class="arabic simple">
@@ -1300,7 +1309,7 @@ <h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="sectio
 <p>One open source solution trying to overcome some of these challenges is Glider <span id="id10">[<a class="reference internal" href="#id120" title="Darshan Deshpande, Selvan Sunitha Ravi, Sky CH-Wang, Bartosz Mielczarek, Anand Kannappan, and Rebecca Qian. Glider: grading llm interactions and decisions using explainable ranking. 2024. URL: https://arxiv.org/abs/2412.14140, arXiv:2412.14140.">Deshpande <em>et al.</em>, 2024</a>]</span>, a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show 91.3% agreement with human judgments, making it suitable for a diverse range of real world applications.</p>
 </section>
 <section id="evaluating-evaluators">
-<h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">3.7.2. </span>Evaluating Evaluators</a><a class="headerlink" href="#evaluating-evaluators" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">3.7.2. </span>Evaluating Evaluators</a><a class="headerlink" href="#evaluating-evaluators" title="Permalink to this heading">¶</a></h3>
 <p>We have discussed how LLMs can be used to evaluate LLM-based aplications. However, how can we evaluate the performance of LLMs that evaluate other LLMs? This is the question that meta evaluation aims to answer. Clearly, the discussion can become quite meta as we need to evaluate the performance of the evaluator to evaluate the performance of the evaluated model. However, one can make a case for two general options:</p>
 <ol class="arabic simple">
 <li><p>Use a gold-standard dataset that is used to evaluate the performance of LLM evaluators using a “metrics-based” approach.</p></li>
@@ -1344,7 +1353,7 @@ <h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="benchmarks-and-leaderboards">
-<h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">3.8. </span>Benchmarks and Leaderboards</a><a class="headerlink" href="#benchmarks-and-leaderboards" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">3.8. </span>Benchmarks and Leaderboards</a><a class="headerlink" href="#benchmarks-and-leaderboards" title="Permalink to this heading">¶</a></h2>
 <p>Benchmarks act as standardized tests for LLMs, evaluating their performance across a spectrum of tasks. These tasks simulate real-world applications such as answering questions, generating coherent text, solving mathematical problems, or even writing computer code. They also assess more abstract qualities like fairness, robustness, and cultural understanding.</p>
 <p>Benchmarks can be thought as comprehensive “exams” that probe different “subjects” in order to certify an LLM. They help researchers and developers compare models systematically, in a way LLM performance is comparable while enabling the identification of emergent behaviors or capabilities as models evolve in scale and sophistication.</p>
 <p>The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. It began in 2018 with the introduction of <strong>GLUE</strong>(General Language Understanding Evaluation) <span id="id12">[<a class="reference internal" href="#id83" title="Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. Glue: a multi-task benchmark and analysis platform for natural language understanding. 2019. URL: https://arxiv.org/abs/1804.07461, arXiv:1804.07461.">Wang <em>et al.</em>, 2019</a>]</span>, which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. A year later, <strong>SuperGLUE</strong> <span id="id13">[<a class="reference internal" href="#id84" title="Alex Wang, Yada Pruksachatkun, Nikita Nangia, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. Superglue: a stickier benchmark for general-purpose language understanding systems. Advances in Neural Information Processing Systems, 2019.">Wang <em>et al.</em>, 2019</a>]</span> expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors.</p>
@@ -1354,7 +1363,7 @@ <h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="sectio
 <p>The <strong>HuggingFace Open LLM</strong> <span id="id19">[<a class="reference internal" href="#id92" title="Hugging Face. Open llm leaderboard. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/open-llm-leaderboard/blog.">Face, 2024</a>]</span> Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals.</p>
 <p>The <strong>Chatbot Arena</strong> (2024) Leaderboard (an evolution of LMSYS)<span id="id20">[<a class="reference internal" href="#id91" title="Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Hao Zhang, Banghua Zhu, Michael Jordan, Joseph E. Gonzalez, and Ion Stoica. Chatbot arena: an open platform for evaluating llms by human preference. 2024. URL: https://arxiv.org/abs/2403.04132, arXiv:2403.04132.">Chiang <em>et al.</em>, 2024</a>]</span> takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered over 200,000 human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots.</p>
 <p>The <strong>AlpacaEval</strong> <span id="id21">[<a class="reference internal" href="#id93" title="Yann Dubois, Balázs Galambosi, Percy Liang, and Tatsunori B. Hashimoto. Length-controlled alpacaeval: a simple way to debias automatic evaluators. 2024. URL: https://arxiv.org/abs/2404.04475, arXiv:2404.04475.">Dubois <em>et al.</em>, 2024</a>]</span> and <strong>MT-Bench</strong> <span id="id22">[<a class="reference internal" href="#id94" title="Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. Judging llm-as-a-judge with mt-bench and chatbot arena. 2023. URL: https://arxiv.org/abs/2306.05685, arXiv:2306.05685.">Zheng <em>et al.</em>, 2023</a>]</span> Leaderboards implement automated evaluation using GPT-4 to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges.</p>
-<p>An important recent development was the release of Global-MMLU <span id="id23">[<a class="reference internal" href="#id133" title="Shivalika Singh, Angelika Romanou, Clémentine Fourrier, David I. Adelani, Jian Gang Ngui, Daniel Vila-Suero, Peerat Limkonchotiwat, Kelly Marchisio, Wei Qi Leong, Yosephine Susanto, Raymond Ng, Shayne Longpre, Wei-Yin Ko, Madeline Smith, Antoine Bosselut, Alice Oh, Andre F. T. Martins, Leshem Choshen, Daphne Ippolito, Enzo Ferrante, Marzieh Fadaee, Beyza Ermis, and Sara Hooker. Global mmlu: understanding and addressing cultural and linguistic biases in multilingual evaluation. 2024. URL: https://arxiv.org/abs/2412.03304, arXiv:2412.03304.">Singh <em>et al.</em>, 2024</a>]</span>, an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Over 200 contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts.</p>
+<p>An important recent development was the release of Global-MMLU <span id="id23">[<a class="reference internal" href="#id136" title="Shivalika Singh, Angelika Romanou, Clémentine Fourrier, David I. Adelani, Jian Gang Ngui, Daniel Vila-Suero, Peerat Limkonchotiwat, Kelly Marchisio, Wei Qi Leong, Yosephine Susanto, Raymond Ng, Shayne Longpre, Wei-Yin Ko, Madeline Smith, Antoine Bosselut, Alice Oh, Andre F. T. Martins, Leshem Choshen, Daphne Ippolito, Enzo Ferrante, Marzieh Fadaee, Beyza Ermis, and Sara Hooker. Global mmlu: understanding and addressing cultural and linguistic biases in multilingual evaluation. 2024. URL: https://arxiv.org/abs/2412.03304, arXiv:2412.03304.">Singh <em>et al.</em>, 2024</a>]</span>, an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Over 200 contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts.</p>
 <p>A major challenge with these leaderboards and benchmarks is test set contamination - when test data ends up in newer models’ training sets, rendering the benchmarks ineffective. While some benchmarks try to address this through crowdsourced prompts and evaluations from humans or LLMs, these approaches introduce their own biases and struggle with difficult questions. <strong>LiveBench</strong> <span id="id24">[<a class="reference internal" href="#id82" title="Colin White, Samuel Dooley, Manley Roberts, Arka Pal, Ben Feuer, Siddhartha Jain, Ravid Shwartz-Ziv, Neel Jain, Khalid Saifullah, Siddartha Naidu, Chinmay Hegde, Yann LeCun, Tom Goldstein, Willie Neiswanger, and Micah Goldblum. Livebench: a challenging, contamination-free llm benchmark. 2024. URL: https://arxiv.org/abs/2406.19314, arXiv:2406.19314.">White <em>et al.</em>, 2024</a>]</span> represents a novel solution, designed specifically to be resilient to both contamination and evaluation biases. As the first benchmark with continuously updated questions from recent sources, automated objective scoring, and diverse challenging tasks across multiple domains, LiveBench maintains its effectiveness even as models improve. Drawing from recent math competitions, research papers, news, and datasets, it creates contamination-free versions of established benchmark tasks. Current results show even top models achieving below 70% accuracy, demonstrating LiveBench’s ability to meaningfully differentiate model capabilities. With monthly updates and an open collaborative approach, LiveBench aims to provide sustained value for model evaluation as the field advances.</p>
 <p>Another notable benchmark is ZebraLogic <span id="id25">[<a class="reference internal" href="#id98" title="Bill Yuchen Lin, Ronan Le Bras, and Yejin Choi. Zebralogic: benchmarking the logical reasoning ability of language models. 2024. URL: https://huggingface.co/spaces/allenai/ZebraLogic.">Lin <em>et al.</em>, 2024</a>]</span>, which evaluates logical reasoning capabilities of LLMs through Logic Grid Puzzles - a type of Constraint Satisfaction Problem <span id="id26">[<a class="reference internal" href="#id99" title="Sally C. Brailsford, Chris N. Potts, and Barbara M. Smith. Constraint satisfaction problems: algorithms and applications. European Journal of Operational Research, 119(3):557-581, 1999. URL: https://www.sciencedirect.com/science/article/pii/S0377221798003646, doi:https://doi.org/10.1016/S0377-2217(98)00364-6.">Brailsford <em>et al.</em>, 1999</a>]</span> commonly found in tests like the LSAT. These puzzles require assigning unique values to N houses across M different features based on given clues, demanding strategic reasoning and deduction to arrive at a unique correct solution. The benchmark’s programmatically generated puzzles range from 2x2 to 6x6 in size and test LLMs using one-shot examples with reasoning steps. While humans can solve these puzzles through strategic methods like reductio ad absurdum and elimination, LLMs demonstrate significant limitations in this type of logical reasoning. Even the best-performing model, Claude 3.5 Sonnet, only achieves 33.4% accuracy across all puzzles and 12.4% on hard puzzles, with smaller models (7-10B parameters) solving less than 1% of hard puzzles as of December 2024. These results reveal critical gaps in LLMs’ capabilities around counterfactual thinking, reflective reasoning, structured memorization, and compositional generalization.</p>
 <p>A significant shift in AI evaluation came with the launch of the <strong>The Alignment Research Center (ARC) Prize</strong> <span id="id27">[<a class="reference internal" href="#id95" title="Francois Chollet. Abstraction and reasoning challenge. ARC Prize Website, 2024. URL: https://arcprize.org/.">Chollet, 2024</a>]</span> by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of ARC-AGI, Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls “cognitive sufficiency” - a model’s ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge as we seek to define and measure what it means to achieve AGI (Artificial General Intelligence).</p>
@@ -1389,16 +1398,16 @@ <h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="sectio
 <p>The ARC-AGI benchmark remained unbeaten for five years as of December 2024 (a minimum score of 85% is required to win) <span id="id28">[<a class="reference internal" href="#id96" title="Francois Chollet. Arc prize 2024 results. ARC Prize Website, 12/08/2024. URL: https://arcprize.org/2024-results.">Chollet, 12/08/2024</a>]</span>. While deep learning has significantly advanced in recent years, pure deep learning approaches perform poorly on the ARC-AGI benchmark. This is because traditional deep learning relies on relating new situations to those encountered during training and lacks the ability to adapt or recombine knowledge for entirely new tasks. ARC Prize 2024 spurred the development of novel AGI reasoning techniques, leading to a significant increase in the state-of-the-art score on the ARC-AGI private evaluation set from 33% in 2023 to 55.5% in 2024. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark.</p>
 <p>In addition to the benchmarks discussed above, a growing set of domain-specific benchmarks is emerging to help evaluate LLMs in specific verticals, including:</p>
 <ul class="simple">
-<li><p>FinBench <span id="id29">[<a class="reference internal" href="#id211" title="Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: https://openreview.net/forum?id=AeGrf1uY0p.">Zhang <em>et al.</em>, 2024</a>]</span>: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.</p></li>
-<li><p>LegalBench <span id="id30">[<a class="reference internal" href="#id209" title="Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: https://arxiv.org/abs/2308.11462, arXiv:2308.11462.">Guha <em>et al.</em>, 2023</a>]</span> : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals</p></li>
-<li><p>Berkeley Function Leaderboard (BFCL) <span id="id31">[<a class="reference internal" href="#id212" title="Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. arXiv preprint arXiv:2305.15334, 2023.">Patil <em>et al.</em>, 2023</a>]</span>: Evaluates LLMs’ function-calling abilities</p></li>
+<li><p>FinBench <span id="id29">[<a class="reference internal" href="#id214" title="Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: https://openreview.net/forum?id=AeGrf1uY0p.">Zhang <em>et al.</em>, 2024</a>]</span>: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.</p></li>
+<li><p>LegalBench <span id="id30">[<a class="reference internal" href="#id212" title="Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: https://arxiv.org/abs/2308.11462, arXiv:2308.11462.">Guha <em>et al.</em>, 2023</a>]</span> : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals</p></li>
+<li><p>Berkeley Function Leaderboard (BFCL) <span id="id31">[<a class="reference internal" href="#id215" title="Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. arXiv preprint arXiv:2305.15334, 2023.">Patil <em>et al.</em>, 2023</a>]</span>: Evaluates LLMs’ function-calling abilities</p></li>
 </ul>
 <p>As language models continue to advance in capability and complexity, evaluation frameworks must evolve. Modern benchmarks increasingly incorporate tests for nuanced reasoning, ethical decision-making, and emergent capabilities that weren’t previously measurable. This ongoing evolution reflects a deeper understanding that the true value of language models lies not in achieving high scores on standardized tests with narrow task-specific metrics, but in their ability to meaningfully contribute to human understanding and help solve real-world problems while demonstrating the ability to learn and adapt to new tasks.</p>
 </section>
 <section id="tools">
-<h2><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">3.9. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">3.9. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
 <section id="lighteval">
-<h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">3.9.1. </span>LightEval</a><a class="headerlink" href="#lighteval" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">3.9.1. </span>LightEval</a><a class="headerlink" href="#lighteval" title="Permalink to this heading">¶</a></h3>
 <p>LightEval <span id="id32">[<a class="reference internal" href="#id63" title="Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: https://github.com/huggingface/lighteval.">Fourrier <em>et al.</em>, 2023</a>]</span> is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI.</p>
 <p>As a motivating example, consider a scenario where financial data has been extracted from SEC financial filings and require econometric analysis. Tasks like estimating autoregressive models for time series forecasting or conducting hypothesis tests on market efficiency are common in financial analysis. Let’s evaluate how well different models perform on this type of task.</p>
 <p>First, we need to select a benchmark to assess LLMs capabilities in this domain. MMLU has a sub-benchmark called Econometrics we can use for this task. <a class="reference internal" href="#mmlu-econometrics"><span class="std std-numref">Table 3.4</span></a> shows a sample of the benchmark dataset from MMLU Econometrics. It consists of multiple-choice questions from econometrics and expected answers.</p>
@@ -1587,7 +1596,7 @@ <h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="sectio
 <p>In summary, LightEval is a simple yet flexible and comprehensive framework for evaluating LLMs across a wide variety of tasks and metrics. It can serve as a first step in selecting your next LLM for a specific task given the exponential growth in number of (open source) models available <span id="id40">[<a class="reference internal" href="#id73" title="Hugging Face. Number of models on hugging face. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024?day=4, 2024. Accessed: 12/06/2024.">Hugging Face, 2024</a>]</span>. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the <a class="reference external" href="https://github.com/huggingface/lighteval">official repository</a> <span id="id41">[<a class="reference internal" href="#id63" title="Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: https://github.com/huggingface/lighteval.">Fourrier <em>et al.</em>, 2023</a>]</span>.</p>
 </section>
 <section id="langsmith">
-<h3><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">3.9.2. </span>LangSmith</a><a class="headerlink" href="#langsmith" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">3.9.2. </span>LangSmith</a><a class="headerlink" href="#langsmith" title="Permalink to this heading">¶</a></h3>
 <p>Let’s revisit our evaluation example when we were interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a benchmark model (larger and more expensive). Recal the setup:</p>
 <ul class="simple">
 <li><p>Benchmark model: gpt-4o</p></li>
@@ -1995,7 +2004,7 @@ <h3><a class="toc-backref" href="#id236" role="doc-backlink"><span class="sectio
 </figure>
 </section>
 <section id="promptfoo">
-<h3><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">3.9.3. </span>PromptFoo</a><a class="headerlink" href="#promptfoo" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">3.9.3. </span>PromptFoo</a><a class="headerlink" href="#promptfoo" title="Permalink to this heading">¶</a></h3>
 <p>Promptfoo <span id="id43">[<a class="reference internal" href="#id103" title="promptfoo. Promptfoo: llm testing and evaluation framework. 2024. Open source framework for testing and evaluating LLM prompts. URL: https://www.promptfoo.dev/.">promptfoo, 2024</a>]</span> is an open-source framework designed for evaluating applications that utilize large language models (LLMs). Key features include:</p>
 <ol class="arabic simple">
 <li><p><strong>Automated Testing</strong>: Promptfoo provides automated testing capabilities, allowing developers to run custom evaluations tailored to their applications.</p></li>
@@ -2260,7 +2269,7 @@ <h3 class="rubric" id="prompt-comparison-results-by-section">Prompt Comparison R
 <p>In conclusion, Promptfoo can serve as an effective LLM application evaluation tool particularly for its ability to decouple several components of the evaluation process. Hence enabling the user to focus on the most important aspects of the evaluation given the particular application and criteria making it a valuable and flexible tool for LLM application development.</p>
 </section>
 <section id="comparison">
-<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">3.9.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">3.9.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h3>
 <p>The following table provides a summarized comparative analysis of three open source frameworks for language models evaluation we have discussed: Lighteval, LangSmith, and Promptfoo. Each framework is assessed based on key features such as integration capabilities, customization options, ease of use, and the ability to facilitate human and LLM collaboration.</p>
 <table class="docutils align-default" id="tool-comparison">
 <caption><span class="caption-number">Table 3.6 </span><span class="caption-text">Comparison of Lighteval, LangSmith, and Promptfoo</span><a class="headerlink" href="#tool-comparison" title="Permalink to this table">¶</a></caption>
@@ -2297,13 +2306,13 @@ <h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">3.10. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">3.10. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>Language models have fundamentally transformed how software is developed and evaluated. Unlike conventional systems that produce predictable outputs, LLMs generate varied, probabilistic responses that defy traditional testing approaches. While developers accustomed to deterministic systems may find this shift challenging, continuing to rely on legacy testing methods is unsustainable. These frameworks were not designed to handle the inherent variability of LLM outputs and will ultimately prove inadequate.</p>
 <p>Success requires embracing this new paradigm by implementing comprehensive evaluation strategies early - this is the new Product Requirements Document (PRD) - and cultivating an organizational mindset focused on iteration, experimentation and growth.</p>
 <p>The shift from traditional software testing to LLM evaluation is not just a change in tools but a transformation in mindset. Those who recognize and adapt to this shift will lead the way in harnessing the power of LLMs. However, the cost of inaction is not just technological stagnation, but potential business failure.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">3.11. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">3.11. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -2317,7 +2326,7 @@ <h2><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">3.12. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">3.12. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id44">
 <div class="citation" id="id65" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id39">ALB+24</a><span class="fn-bracket">]</span></span>
@@ -2384,7 +2393,7 @@ <h2><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id32">1</a>,<a role="doc-backlink" href="#id41">2</a>)</span>
 <p>Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: <a class="reference external" href="https://github.com/huggingface/lighteval">https://github.com/huggingface/lighteval</a>.</p>
 </div>
-<div class="citation" id="id209" role="doc-biblioentry">
+<div class="citation" id="id212" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">GNH+23</a><span class="fn-bracket">]</span></span>
 <p>Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2308.11462">https://arxiv.org/abs/2308.11462</a>, <a class="reference external" href="https://arxiv.org/abs/2308.11462">arXiv:2308.11462</a>.</p>
 </div>
@@ -2417,7 +2426,7 @@ <h2><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id15">LHE22</a><span class="fn-bracket">]</span></span>
 <p>Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2109.07958">https://arxiv.org/abs/2109.07958</a>, <a class="reference external" href="https://arxiv.org/abs/2109.07958">arXiv:2109.07958</a>.</p>
 </div>
-<div class="citation" id="id212" role="doc-biblioentry">
+<div class="citation" id="id215" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id31">PZWG23</a><span class="fn-bracket">]</span></span>
 <p>Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. <em>arXiv preprint arXiv:2305.15334</em>, 2023.</p>
 </div>
@@ -2429,11 +2438,11 @@ <h2><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id2">Ras24</a><span class="fn-bracket">]</span></span>
 <p>Sebastian Raschka. <em>Build A Large Language Model (From Scratch)</em>. Manning, 2024. ISBN 978-1633437166. URL: <a class="reference external" href="https://www.manning.com/books/build-a-large-language-model-from-scratch">https://www.manning.com/books/build-a-large-language-model-from-scratch</a>.</p>
 </div>
-<div class="citation" id="id124" role="doc-biblioentry">
+<div class="citation" id="id125" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id5">SLL+24</a><span class="fn-bracket">]</span></span>
 <p>Bhaskarjit Sarmah, Mingshu Li, Jingrao Lyu, Sebastian Frank, Nathalia Castellanos, Stefano Pasquali, and Dhagash Mehta. How to choose a threshold for an evaluation metric for large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.12148">https://arxiv.org/abs/2412.12148</a>, <a class="reference external" href="https://arxiv.org/abs/2412.12148">arXiv:2412.12148</a>.</p>
 </div>
-<div class="citation" id="id133" role="doc-biblioentry">
+<div class="citation" id="id136" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id23">SRF+24</a><span class="fn-bracket">]</span></span>
 <p>Shivalika Singh, Angelika Romanou, Clémentine Fourrier, David I. Adelani, Jian Gang Ngui, Daniel Vila-Suero, Peerat Limkonchotiwat, Kelly Marchisio, Wei Qi Leong, Yosephine Susanto, Raymond Ng, Shayne Longpre, Wei-Yin Ko, Madeline Smith, Antoine Bosselut, Alice Oh, Andre F. T. Martins, Leshem Choshen, Daphne Ippolito, Enzo Ferrante, Marzieh Fadaee, Beyza Ermis, and Sara Hooker. Global mmlu: understanding and addressing cultural and linguistic biases in multilingual evaluation. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.03304">https://arxiv.org/abs/2412.03304</a>, <a class="reference external" href="https://arxiv.org/abs/2412.03304">arXiv:2412.03304</a>.</p>
 </div>
@@ -2461,7 +2470,7 @@ <h2><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id38">YYH+24</a><span class="fn-bracket">]</span></span>
 <p>An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zhihao Fan. Qwen2 technical report. <em>arXiv preprint arXiv:2407.10671</em>, 2024.</p>
 </div>
-<div class="citation" id="id211" role="doc-biblioentry">
+<div class="citation" id="id214" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id29">ZCL24</a><span class="fn-bracket">]</span></span>
 <p>Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: <a class="reference external" href="https://openreview.net/forum?id=AeGrf1uY0p">https://openreview.net/forum?id=AeGrf1uY0p</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/notebooks/local.html b/tamingllms/_build/html/notebooks/local.html
index 0263f16..e0df261 100644
--- a/tamingllms/_build/html/notebooks/local.html
+++ b/tamingllms/_build/html/notebooks/local.html
@@ -39,6 +39,7 @@
     
   <link rel="index" title="Index" href="../genindex.html" />
   <link rel="search" title="Search" href="../search.html" />
+  <link rel="next" title="8. The Falling Cost Paradox" href="cost.html" />
   <link rel="prev" title="6. Preference-Based Alignment" href="alignment.html" /> 
   </head>
 
@@ -186,7 +187,7 @@
                 
                   <li class="toctree-l2"><a href="#introduction" class="reference internal">Introduction</a></li>
                 
-                  <li class="toctree-l2"><a href="#models-considerations" class="reference internal">Models Considerations</a></li>
+                  <li class="toctree-l2"><a href="#choosing-your-model" class="reference internal">Choosing your Model</a></li>
                 
                   <li class="toctree-l2"><a href="#tools-for-local-llm-deployment" class="reference internal">Tools for Local LLM Deployment</a></li>
                 
@@ -194,8 +195,6 @@
                 
                   <li class="toctree-l2"><a href="#conclusion" class="reference internal">Conclusion</a></li>
                 
-                  <li class="toctree-l2"><a href="#citation" class="reference internal">Citation</a></li>
-                
                   <li class="toctree-l2"><a href="#references" class="reference internal">References</a></li>
                 
               </ul>
@@ -203,6 +202,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -224,6 +232,10 @@
     <a href="alignment.html"
        title="previous chapter">← <span class="section-number">6. </span>Preference-Based Alignment</a>
   </li>
+  <li class="next">
+    <a href="cost.html"
+       title="next chapter"><span class="section-number">8. </span>The Falling Cost Paradox →</a>
+  </li>
 </ul>
   
 </div>
@@ -231,7 +243,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="local-llms-in-practice">
-<h1><a class="toc-backref" href="#id217" role="doc-backlink"><span class="section-number">7. </span>Local LLMs in Practice</a><a class="headerlink" href="#local-llms-in-practice" title="Permalink to this heading">¶</a></h1>
+<span id="local"></span><h1><a class="toc-backref" href="#id220" role="doc-backlink"><span class="section-number">7. </span>Local LLMs in Practice</a><a class="headerlink" href="#local-llms-in-practice" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>Freedom is something that dies unless it’s used.</p>
 <p class="attribution">—Hunter S. Thompson</p>
@@ -239,56 +251,55 @@ <h1><a class="toc-backref" href="#id217" role="doc-backlink"><span class="sectio
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#local-llms-in-practice" id="id217">Local LLMs in Practice</a></p>
+<li><p><a class="reference internal" href="#local-llms-in-practice" id="id220">Local LLMs in Practice</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id218">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#models-considerations" id="id219">Models Considerations</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id221">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#choosing-your-model" id="id222">Choosing your Model</a></p>
 <ul>
-<li><p><a class="reference internal" href="#task-suitability" id="id220">Task Suitability</a></p></li>
-<li><p><a class="reference internal" href="#performance-cost" id="id221">Performance &amp; Cost</a></p></li>
-<li><p><a class="reference internal" href="#licensing" id="id222">Licensing</a></p></li>
-<li><p><a class="reference internal" href="#community-support" id="id223">Community Support</a></p></li>
-<li><p><a class="reference internal" href="#customization" id="id224">Customization</a></p></li>
+<li><p><a class="reference internal" href="#task-suitability" id="id223">Task Suitability</a></p></li>
+<li><p><a class="reference internal" href="#performance-cost" id="id224">Performance &amp; Cost</a></p></li>
+<li><p><a class="reference internal" href="#licensing" id="id225">Licensing</a></p></li>
+<li><p><a class="reference internal" href="#community-support" id="id226">Community Support</a></p></li>
+<li><p><a class="reference internal" href="#customization" id="id227">Customization</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#tools-for-local-llm-deployment" id="id225">Tools for Local LLM Deployment</a></p>
+<li><p><a class="reference internal" href="#tools-for-local-llm-deployment" id="id228">Tools for Local LLM Deployment</a></p>
 <ul>
-<li><p><a class="reference internal" href="#serving-models" id="id226">Serving Models</a></p>
+<li><p><a class="reference internal" href="#serving-models" id="id229">Serving Models</a></p>
 <ul>
-<li><p><a class="reference internal" href="#llama-cpp" id="id227">LLama.cpp</a></p></li>
-<li><p><a class="reference internal" href="#llamafile" id="id228">Llamafile</a></p></li>
-<li><p><a class="reference internal" href="#ollama" id="id229">Ollama</a></p></li>
-<li><p><a class="reference internal" href="#comparison" id="id230">Comparison</a></p></li>
+<li><p><a class="reference internal" href="#llama-cpp" id="id230">LLama.cpp</a></p></li>
+<li><p><a class="reference internal" href="#llamafile" id="id231">Llamafile</a></p></li>
+<li><p><a class="reference internal" href="#ollama" id="id232">Ollama</a></p></li>
+<li><p><a class="reference internal" href="#comparison" id="id233">Comparison</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#ui" id="id231">UI</a></p>
+<li><p><a class="reference internal" href="#ui" id="id234">UI</a></p>
 <ul>
-<li><p><a class="reference internal" href="#lm-studio" id="id232">LM Studio</a></p></li>
-<li><p><a class="reference internal" href="#jan" id="id233">Jan</a></p></li>
-<li><p><a class="reference internal" href="#open-webui" id="id234">Open WebUI</a></p></li>
-<li><p><a class="reference internal" href="#id36" id="id235">Comparison</a></p></li>
+<li><p><a class="reference internal" href="#lm-studio" id="id235">LM Studio</a></p></li>
+<li><p><a class="reference internal" href="#jan" id="id236">Jan</a></p></li>
+<li><p><a class="reference internal" href="#open-webui" id="id237">Open WebUI</a></p></li>
+<li><p><a class="reference internal" href="#id36" id="id238">Comparison</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#case-study-the-effect-of-quantization-on-llm-performance" id="id236">Case Study: The Effect of Quantization on LLM Performance</a></p>
+<li><p><a class="reference internal" href="#case-study-the-effect-of-quantization-on-llm-performance" id="id239">Case Study: The Effect of Quantization on LLM Performance</a></p>
 <ul>
-<li><p><a class="reference internal" href="#prompts-dataset" id="id237">Prompts Dataset</a></p></li>
-<li><p><a class="reference internal" href="#quantization" id="id238">Quantization</a></p></li>
-<li><p><a class="reference internal" href="#benchmarking" id="id239">Benchmarking</a></p></li>
-<li><p><a class="reference internal" href="#results" id="id240">Results</a></p></li>
-<li><p><a class="reference internal" href="#takeaways" id="id241">Takeaways</a></p></li>
+<li><p><a class="reference internal" href="#prompts-dataset" id="id240">Prompts Dataset</a></p></li>
+<li><p><a class="reference internal" href="#quantization" id="id241">Quantization</a></p></li>
+<li><p><a class="reference internal" href="#benchmarking" id="id242">Benchmarking</a></p></li>
+<li><p><a class="reference internal" href="#results" id="id243">Results</a></p></li>
+<li><p><a class="reference internal" href="#takeaways" id="id244">Takeaways</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id242">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id243">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id244">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id245">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id246">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id218" role="doc-backlink"><span class="section-number">7.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id221" role="doc-backlink"><span class="section-number">7.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>Running Open Source LLMs locally versus depending on proprietary cloud-based models represents more than just a technical choice - it’s a fundamental re-imagining of how we interact with AI technology, putting control back in the hands of users.</p>
 <p>Privacy concerns are a key driver for running LLMs locally. Individual users may want to process personal documents, photos, emails, and chat messages without sharing sensitive data with third parties. For enterprise use cases, organizations handling medical records must comply with HIPAA regulations that require data to remain on-premise. Similarly, businesses processing confidential documents and intellectual property, as well as organizations subject to GDPR and other privacy regulations, need to maintain strict control over their data processing pipeline.</p>
 <p>Cost considerations are another key driver. Organizations and individual consumers can better control expenses by matching model capabilities to their specific needs rather than paying for multiple cloud API subscriptions. For organizations with high-volume applications, this customization and control over costs becomes especially valuable compared to the often prohibitive per-request pricing of cloud solutions. For consumers, running multiple open source models locally eliminates the need to maintain separate subscriptions to access different model capabilities.</p>
@@ -297,8 +308,8 @@ <h2><a class="toc-backref" href="#id218" role="doc-backlink"><span class="sectio
 <p>However, local deployment introduces its own set of challenges and considerations. In this Chapter, we explore the landscape of local LLM deployment focused on Open Source models and tools. When choosing a local open source model, organizations must carefully evaluate several interconnected factors, from task suitability and performance requirements to resource constraints and licensing.</p>
 <p>We also cover key tools enabling local model serving and inference, including open source solutions such as LLama.cpp, Llamafile, and Ollama, along with user-friendly frontend interfaces that make local LLM usage more accessible. We conclude with a detailed case study, analyzing how different quantization approaches impact model performance in resource-constrained environments. This analysis reveals the critical tradeoffs between model size, inference speed, and output quality that practitioners must navigate.</p>
 </section>
-<section id="models-considerations">
-<h2><a class="toc-backref" href="#id219" role="doc-backlink"><span class="section-number">7.2. </span>Models Considerations</a><a class="headerlink" href="#models-considerations" title="Permalink to this heading">¶</a></h2>
+<section id="choosing-your-model">
+<span id="local-model-selection"></span><h2><a class="toc-backref" href="#id222" role="doc-backlink"><span class="section-number">7.2. </span>Choosing your Model</a><a class="headerlink" href="#choosing-your-model" title="Permalink to this heading">¶</a></h2>
 <p>The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness.</p>
 <p>It is important to observe long-term strategic considerations when choosing a model. These entails prioritization dimensions that may enable competitive advantage in the long-term, including:</p>
 <ol class="arabic simple">
@@ -310,7 +321,7 @@ <h2><a class="toc-backref" href="#id219" role="doc-backlink"><span class="sectio
 </ol>
 <p>In this section, we aim to provide a comprehensive set of considerations to selecting the right open-source LLM for your specific needs, emphasizing the importance of aligning the LLM’s capabilities with the intended task and considering resources constraints.</p>
 <section id="task-suitability">
-<h3><a class="toc-backref" href="#id220" role="doc-backlink"><span class="section-number">7.2.1. </span>Task Suitability</a><a class="headerlink" href="#task-suitability" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id223" role="doc-backlink"><span class="section-number">7.2.1. </span>Task Suitability</a><a class="headerlink" href="#task-suitability" title="Permalink to this heading">¶</a></h3>
 <p>When evaluating an open source LLM, task suitability is a critical first consideration. A model that performs well on general benchmarks may struggle with specific domain tasks. Understanding the intended use case helps narrow down model options based on their demonstrated strengths.</p>
 <p><strong>Task Categories</strong></p>
 <p>When determining which LLM task to prioritize, carefully consider your specific use case and end-user needs. Different applications require distinct model capabilities and optimizations. Common LLM Task Categories include:</p>
@@ -344,8 +355,8 @@ <h3><a class="toc-backref" href="#id220" role="doc-backlink"><span class="sectio
 <p><span class="caption-number">Fig. 7.2 </span><span class="caption-text">Model Types.</span><a class="headerlink" href="#model-types" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>The Llama 2 model family <span id="id4">[<a class="reference internal" href="#id161" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span> illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.</p>
-<p>Benchmark results <span id="id5">[<a class="reference internal" href="#id115" title="Meta AI. Llama-2-70b-chat-hf. Hugging Face Model, 2024c. 70 billion parameter chat model from Meta's Llama 2 family. URL: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf.">Meta AI, 2024c</a>]</span> in <a class="reference internal" href="#llama2-benchmark"><span class="std std-numref">Table 7.1</span></a> highlight the impact of model specialization. On the TruthfulQA <span id="id6">[<a class="reference internal" href="safety.html#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span> and Toxigen <span id="id7">[<a class="reference internal" href="#id119" title="Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: https://paperswithcode.com/dataset/toxigen.">Alnajjar and others, 2024</a>]</span> benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.</p>
+<p>The Llama 2 model family <span id="id4">[<a class="reference internal" href="#id164" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span> illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.</p>
+<p>Benchmark results <span id="id5">[<a class="reference internal" href="#id115" title="Meta AI. Llama-2-70b-chat-hf. Hugging Face Model, 2024c. 70 billion parameter chat model from Meta's Llama 2 family. URL: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf.">Meta AI, 2024c</a>]</span> in <a class="reference internal" href="#llama2-benchmark"><span class="std std-numref">Table 7.1</span></a> highlight the impact of model specialization. On the TruthfulQA <span id="id6">[<a class="reference internal" href="safety.html#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span> and Toxigen <span id="id7">[<a class="reference internal" href="#id120" title="Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: https://paperswithcode.com/dataset/toxigen.">Alnajjar and others, 2024</a>]</span> benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.</p>
 <table class="docutils align-center" id="llama2-benchmark">
 <caption><span class="caption-number">Table 7.1 </span><span class="caption-text">Benchmark results for Llama 2 family of models.</span><a class="headerlink" href="#llama2-benchmark" title="Permalink to this table">¶</a></caption>
 <thead>
@@ -400,9 +411,9 @@ <h3><a class="toc-backref" href="#id220" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="performance-cost">
-<h3><a class="toc-backref" href="#id221" role="doc-backlink"><span class="section-number">7.2.2. </span>Performance &amp; Cost</a><a class="headerlink" href="#performance-cost" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id224" role="doc-backlink"><span class="section-number">7.2.2. </span>Performance &amp; Cost</a><a class="headerlink" href="#performance-cost" title="Permalink to this heading">¶</a></h3>
 <p>General benchmarks are useful for comparing models across different standard tasks. Open Source models are becoming more competitive with proprietary models with LLama, Qwen and Mistral model families being some of the most powerful open source models available today.</p>
-<p>Qwen model family <span id="id9">[<a class="reference internal" href="#id153" title="Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.">Qwen <em>et al.</em>, 2024</a>]</span> emerged in 2024 as a model family achieving competitive performance with smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in <a class="reference internal" href="#qwen-perf"><span class="std std-numref">Fig. 7.3</span></a>.</p>
+<p>Qwen model family <span id="id9">[<a class="reference internal" href="#id156" title="Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.">Qwen <em>et al.</em>, 2024</a>]</span> emerged in 2024 as a model family achieving competitive performance with smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in <a class="reference internal" href="#qwen-perf"><span class="std std-numref">Fig. 7.3</span></a>.</p>
 <figure class="align-center" id="qwen-perf">
 <a class="reference internal image-reference" href="../_images/qwen_perf.png"><img alt="Qwen Performance" src="../_images/qwen_perf.png" style="width: 764.0px; height: 551.6px;" /></a>
 <figcaption>
@@ -468,7 +479,7 @@ <h3><a class="toc-backref" href="#id221" role="doc-backlink"><span class="sectio
 <p>Regular re-evaluation of these metrics is recommended as the landscape evolves rapidly. What represents the optimal choice today may change as new models are released and existing ones are updated.</p>
 </section>
 <section id="licensing">
-<h3><a class="toc-backref" href="#id222" role="doc-backlink"><span class="section-number">7.2.3. </span>Licensing</a><a class="headerlink" href="#licensing" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">7.2.3. </span>Licensing</a><a class="headerlink" href="#licensing" title="Permalink to this heading">¶</a></h3>
 <p>When evaluating open-source LLMs, it’s important to consider licensing and data usage policies. Some models may require attribution or commercial use licenses, while others may be more permissive. Additionally, ensure that the model’s training data is compatible with your intended use case and complies with relevant data protection laws.</p>
 <p>The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. <a class="reference internal" href="#open-source-llms"><span class="std std-numref">Table 7.2</span></a> provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:</p>
 <ul class="simple">
@@ -524,14 +535,14 @@ <h3><a class="toc-backref" href="#id222" role="doc-backlink"><span class="sectio
 <p>When selecting an open-source LLM for deployment, practitioners must carefully evaluate licensing terms that align with intended usage (whether commercial, research, or other). While permissive licenses like Apache 2.0 and MIT allow broad usage rights, custom licenses may impose specific restrictions on commercial applications or model derivatives, making thorough license review essential for sustainable implementation.</p>
 <p>The training data sources for LLMs represent another critical consideration. Models vary significantly in their training data foundations - some leverage purely public datasets while others incorporate proprietary or restricted content with the added complexity that public data does not mean free data. These data choices fundamentally impact not only model capabilities but also legal and regulatory compliance.</p>
 <p>The legal landscape surrounding LLM training data has grown increasingly complex, particularly regarding copyright infringement concerns. The high-profile lawsuit between OpenAI and The New York Times <span id="id12">[<a class="reference internal" href="#id47" title="Harvard Law Review. Nyt v. openai: the times's about-face. https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/, 2024. Accessed: 2024.">Review, 2024</a>]</span> serves as a pivotal example, where the Times claims its copyrighted materials were used without authorization to train language models. This litigation has far-reaching consequences for developers building LLM-powered applications. Should courts rule in favor of copyright holders, model providers may need to withdraw and retrain models containing protected content. These legal uncertainties introduce substantial complexity into LLM implementation strategies, demanding careful consideration during project planning phases.</p>
-<p>Recent LLM releases demonstrate varying levels of data transparency. For instance, Qwen2.5’s approach <span id="id13">[<a class="reference internal" href="#id153" title="Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.">Qwen <em>et al.</em>, 2024</a>]</span> illustrates common industry practices in both its achievements and limitations. On the training data scale front, Qwen2.5 does provide some transparency by discussing some training data methodology compared to previous versions such as expanding from 7 trillion to 18 trillion tokens, while implementing sophisticated quality filtering and carefully balancing domain representation through sampling adjustments.</p>
+<p>Recent LLM releases demonstrate varying levels of data transparency. For instance, Qwen2.5’s approach <span id="id13">[<a class="reference internal" href="#id156" title="Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.">Qwen <em>et al.</em>, 2024</a>]</span> illustrates common industry practices in both its achievements and limitations. On the training data scale front, Qwen2.5 does provide some transparency by discussing some training data methodology compared to previous versions such as expanding from 7 trillion to 18 trillion tokens, while implementing sophisticated quality filtering and carefully balancing domain representation through sampling adjustments.</p>
 <p>However, like many commercial LLMs, Qwen2.5 exhibits transparency limitations. The report provides incomplete disclosure of data sources and limited information about the proportions of different data types used in training. The preprocessing methodologies remain unclear, and there is minimal discussion of potential biases that may exist in the training data.</p>
-<p>Similarly, in the Llama 3 paper <span id="id14">[<a class="reference internal" href="#id147" title="Meta AI. The llama 3 herd of models. 2024c. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.">AI, 2024c</a>]</span>, Meta AI does share some details about the pre-training corpus stating simply stating that it was around 15T multilingual tokens, compared to 1.8T tokens for Llama 2. The exact sources of data used for pre-training and post-training are not explicitly listed.</p>
+<p>Similarly, in the Llama 3 paper <span id="id14">[<a class="reference internal" href="#id150" title="Meta AI. The llama 3 herd of models. 2024c. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.">AI, 2024c</a>]</span>, Meta AI does share some details about the pre-training corpus stating simply stating that it was around 15T multilingual tokens, compared to 1.8T tokens for Llama 2. The exact sources of data used for pre-training and post-training are not explicitly listed.</p>
 <p>These gaps in transparency reflect a broader industry challenge in balancing commercial interests with the need for openness and scientific reproducibility.</p>
 <p>A significant advancement in open-source language model training data is HuggingFace’s release of the FineWeb datasets. In its first release <span id="id15">[<a class="reference internal" href="#id46" title="Guilherme Penedo, Hynek Kydlíček, Loubna Ben allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel, Leandro Von Werra, and Thomas Wolf. The fineweb datasets: decanting the web for the finest text data at scale. 2024. URL: https://arxiv.org/abs/2406.17557, arXiv:2406.17557.">Penedo <em>et al.</em>, 2024</a>]</span>, FineWeb is made of a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. Additionally, data curation codebase and all of the models trained during our ablation experiments are made available. FineWeb is a fine example of an initiative that helps minimize the gap between proprietary and public knowledge.</p>
 </section>
 <section id="community-support">
-<h3><a class="toc-backref" href="#id223" role="doc-backlink"><span class="section-number">7.2.4. </span>Community Support</a><a class="headerlink" href="#community-support" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id226" role="doc-backlink"><span class="section-number">7.2.4. </span>Community Support</a><a class="headerlink" href="#community-support" title="Permalink to this heading">¶</a></h3>
 <p>Community support plays a vital role in the open-source LLM ecosystem. Active communities contribute to model development, provide technical assistance, and share valuable resources. When evaluating open-source LLMs, the strength and engagement of the community should be a key consideration, as it directly impacts the model’s long-term viability and practical utility.</p>
 <p>The popularity of different model families reflects their community adoption. In 2024, the Qwen and Llama families have emerged as clear favorites, with Qwen2.5-1.5B-Instruct alone representing 35% of total open source models downloads in 2024.</p>
 <figure class="align-center" id="downloads">
@@ -540,13 +551,13 @@ <h3><a class="toc-backref" href="#id223" role="doc-backlink"><span class="sectio
 <p><span class="caption-number">Fig. 7.8 </span><span class="caption-text">Hugging Face Model Downloads in 2024 as of December 22 of the same year <span id="id16">[<a class="reference internal" href="#id45" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>.</span><a class="headerlink" href="#downloads" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations <span id="id17">[<a class="reference internal" href="#id152" title="Qwen. Qwen2.5-1.5b-instruct. 2024b. Accessed: December 22, 2024. URL: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct.">Qwen, 2024b</a>]</span>.</p>
+<p>Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations <span id="id17">[<a class="reference internal" href="#id155" title="Qwen. Qwen2.5-1.5b-instruct. 2024b. Accessed: December 22, 2024. URL: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct.">Qwen, 2024b</a>]</span>.</p>
 </section>
 <section id="customization">
-<h3><a class="toc-backref" href="#id224" role="doc-backlink"><span class="section-number">7.2.5. </span>Customization</a><a class="headerlink" href="#customization" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id227" role="doc-backlink"><span class="section-number">7.2.5. </span>Customization</a><a class="headerlink" href="#customization" title="Permalink to this heading">¶</a></h3>
 <p>Model customization is an important consideration when selecting an open-source LLM. Adapting and fine-tuning to specific use cases can significantly impact practical utility and performance in production environments.</p>
 <p>Model providers increasingly offer streamlined fine-tuning services. For example, Mistral demonstrates an accessible approach to model customization.
-The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset <span id="id18">[<a class="reference internal" href="#id149" title="Hugging Face. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k.">Face, 2024u</a>]</span>. This API design makes it easy to experiment with model customization while maintaining control over the training process.</p>
+The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset <span id="id18">[<a class="reference internal" href="#id152" title="Hugging Face. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k.">Face, 2024u</a>]</span>. This API design makes it easy to experiment with model customization while maintaining control over the training process.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># create a fine-tuning job</span>
 <span class="n">created_jobs</span> <span class="o">=</span> <span class="n">client</span><span class="o">.</span><span class="n">fine_tuning</span><span class="o">.</span><span class="n">jobs</span><span class="o">.</span><span class="n">create</span><span class="p">(</span>
     <span class="n">model</span><span class="o">=</span><span class="s2">&quot;open-mistral-7b&quot;</span><span class="p">,</span> 
@@ -565,7 +576,7 @@ <h3><a class="toc-backref" href="#id224" role="doc-backlink"><span class="sectio
 <span class="n">created_jobs</span>
 </pre></div>
 </div>
-<p>For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports <span id="id19">[<a class="reference internal" href="#id170" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span>:</p>
+<p>For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports <span id="id19">[<a class="reference internal" href="#id173" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span>:</p>
 <ul class="simple">
 <li><p>Supervised Fine-Tuning (SFT)</p></li>
 <li><p>Reward Modeling (RM)</p></li>
@@ -611,7 +622,7 @@ <h3><a class="toc-backref" href="#id224" role="doc-backlink"><span class="sectio
 </tr>
 </tbody>
 </table>
-<p>Small language models can serve as a lightweight alternative to customization compared to large models. Recent research has shown that smaller models can achieve competitive performance compared to larger models <span id="id21">[<a class="reference internal" href="#id148" title="Hugging Face. Scaling test time compute. 2024v. Accessed: 2024. URL: https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute.">Face, 2024v</a>, <a class="reference internal" href="#id150" title="Justin Zhao, Timothy Wang, Wael Abid, Geoffrey Angus, Arnav Garg, Jeffery Kinnison, Alex Sherstinsky, Piero Molino, Travis Addair, and Devvret Rishi. Lora land: 310 fine-tuned llms that rival gpt-4, a technical report. 2024. URL: https://arxiv.org/abs/2405.00732, arXiv:2405.00732.">Zhao <em>et al.</em>, 2024</a>]</span>. A noteworthy example is Hugging Face’s SmolLM2 <span id="id22">[<a class="reference internal" href="#id61" title="Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Lewis Tunstall, Agustín Piqueres, Andres Marafioti, Cyril Zakka, Leandro von Werra, and Thomas Wolf. Smollm2 - with great data, comes great performance. 2024.">Allal <em>et al.</em>, 2024</a>]</span>, a family of compact language models designed with several key advantages:</p>
+<p>Small language models can serve as a lightweight alternative to customization compared to large models. Recent research has shown that smaller models can achieve competitive performance compared to larger models <span id="id21">[<a class="reference internal" href="#id151" title="Hugging Face. Scaling test time compute. 2024v. Accessed: 2024. URL: https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute.">Face, 2024v</a>, <a class="reference internal" href="#id153" title="Justin Zhao, Timothy Wang, Wael Abid, Geoffrey Angus, Arnav Garg, Jeffery Kinnison, Alex Sherstinsky, Piero Molino, Travis Addair, and Devvret Rishi. Lora land: 310 fine-tuned llms that rival gpt-4, a technical report. 2024. URL: https://arxiv.org/abs/2405.00732, arXiv:2405.00732.">Zhao <em>et al.</em>, 2024</a>]</span>. A noteworthy example is Hugging Face’s SmolLM2 <span id="id22">[<a class="reference internal" href="#id61" title="Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Lewis Tunstall, Agustín Piqueres, Andres Marafioti, Cyril Zakka, Leandro von Werra, and Thomas Wolf. Smollm2 - with great data, comes great performance. 2024.">Allal <em>et al.</em>, 2024</a>]</span>, a family of compact language models designed with several key advantages:</p>
 <ol class="arabic simple">
 <li><p>Compact Sizes:</p></li>
 </ol>
@@ -641,10 +652,10 @@ <h3><a class="toc-backref" href="#id224" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="tools-for-local-llm-deployment">
-<h2><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">7.3. </span>Tools for Local LLM Deployment</a><a class="headerlink" href="#tools-for-local-llm-deployment" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">7.3. </span>Tools for Local LLM Deployment</a><a class="headerlink" href="#tools-for-local-llm-deployment" title="Permalink to this heading">¶</a></h2>
 <p>Local LLM deployment tools generally fall into two categories: inference-focused tools that prioritize performance and programmability for technical users requiring production-grade deployments, and user interface (UI) tools that emphasize accessibility through graphical interfaces for non-technical users, trading some performance for ease of use and broader adoption. In the following sections we will explore some of these tools discussing their features, capabilities, and trade-offs.</p>
 <section id="serving-models">
-<h3><a class="toc-backref" href="#id226" role="doc-backlink"><span class="section-number">7.3.1. </span>Serving Models</a><a class="headerlink" href="#serving-models" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">7.3.1. </span>Serving Models</a><a class="headerlink" href="#serving-models" title="Permalink to this heading">¶</a></h3>
 <p>Serving an LLM model involves making it available for inference by setting up infrastructure to process requests and manage resources efficiently. This serving layer handles several key responsibilities, from loading model weights and managing compute resources to processing requests and optimizing performance. Let’s examine the core components of model serving:</p>
 <ol class="arabic simple">
 <li><p><strong>Model Loading and Initialization</strong></p></li>
@@ -697,8 +708,8 @@ <h3><a class="toc-backref" href="#id226" role="doc-backlink"><span class="sectio
 </ul>
 <p>Let’s explore each of these options in detail.</p>
 <section id="llama-cpp">
-<h4><a class="toc-backref" href="#id227" role="doc-backlink"><span class="section-number">7.3.1.1. </span>LLama.cpp</a><a class="headerlink" href="#llama-cpp" title="Permalink to this heading">¶</a></h4>
-<p>LLama.cpp <span id="id23">[<a class="reference internal" href="#id125" title="Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.">Gerganov and contributors, 2024a</a>]</span> is an MIT-licensed open source optimized implementation of the <strong>LLama</strong> model architecture designed to run efficiently on machines with limited memory.</p>
+<h4><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">7.3.1.1. </span>LLama.cpp</a><a class="headerlink" href="#llama-cpp" title="Permalink to this heading">¶</a></h4>
+<p>LLama.cpp <span id="id23">[<a class="reference internal" href="#id128" title="Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.">Gerganov and contributors, 2024a</a>]</span> is an MIT-licensed open source optimized implementation of the <strong>LLama</strong> model architecture designed to run efficiently on machines with limited memory.</p>
 <p>Originally developed by Georgi Gerganov and today counting with hundreds of contributors, this C/C++ LLama version provides a simplified interface and advanced features that allow language models to run locally without overwhelming systems. With the ability to run in resource-constrained environments, LLama.cpp makes powerful language models more accessible and practical for a variety of applications.</p>
 <p>In its “Manifesto” <span id="id24">[<a class="reference internal" href="#id109" title="Georgi Gerganov and others. Quantization of llama models - discussion. GitHub Discussion, 2023. Discussion thread about quantization techniques and tradeoffs in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/discussions/205.">Gerganov and others, 2023</a>]</span>, the author highlights the significant potential in bringing AI from cloud to edge devices, emphasizing the importance of keeping development lightweight, experimental, and enjoyable rather than getting bogged down in complex engineering challenges. The author states a vision that emphasizes maintaining an exploratory, hacker-minded approach while building practical edge computing solutions highlighting the following core principles:</p>
 <ul class="simple">
@@ -715,7 +726,7 @@ <h4><a class="toc-backref" href="#id227" role="doc-backlink"><span class="sectio
 <li><p><strong>Ease of Implementation</strong>: Although it’s a lighter solution, LLama.cpp doesn’t sacrifice result quality. It maintains the ability to generate texts and perform NLP tasks with high precision.</p></li>
 </ol>
 <p><strong>GGUF</strong></p>
-<p>GGUF (GPT-Generated Unified Format) <span id="id25">[<a class="reference internal" href="#id128" title="Georgi Gerganov and contributors. Gguf file format specification. GitHub Repository, 2024b. Technical specification of the GGUF file format for efficient model storage and inference. URL: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md.">Gerganov and contributors, 2024b</a>]</span> is the latest model format used by LLama.cpp, replacing the older GGML format. It was designed specifically for efficient inference of large language models on consumer hardware. The key features that make GGUF particularly valuable include <span id="id26">[<a class="reference internal" href="#id126" title="IBM Think. Gguf vs ggml: what's the difference? 2024. Comparison of GGUF and GGML model formats. URL: https://www.ibm.com/think/topics/gguf-versus-ggml.">IBM Think, 2024</a>]</span>:</p>
+<p>GGUF (GPT-Generated Unified Format) <span id="id25">[<a class="reference internal" href="#id131" title="Georgi Gerganov and contributors. Gguf file format specification. GitHub Repository, 2024b. Technical specification of the GGUF file format for efficient model storage and inference. URL: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md.">Gerganov and contributors, 2024b</a>]</span> is the latest model format used by LLama.cpp, replacing the older GGML format. It was designed specifically for efficient inference of large language models on consumer hardware. The key features that make GGUF particularly valuable include <span id="id26">[<a class="reference internal" href="#id129" title="IBM Think. Gguf vs ggml: what's the difference? 2024. Comparison of GGUF and GGML model formats. URL: https://www.ibm.com/think/topics/gguf-versus-ggml.">IBM Think, 2024</a>]</span>:</p>
 <ul class="simple">
 <li><p>Improved quantization: GGUF supports multiple quantization levels to reduce model size while preserving performance. Common quantization schemes that are supported by GGUF include:</p>
 <ul>
@@ -729,9 +740,9 @@ <h4><a class="toc-backref" href="#id227" role="doc-backlink"><span class="sectio
 <li><p>Architecture-specific optimizations: Takes advantage of CPU/GPU specific instructions for faster inference</p></li>
 <li><p>Versioning support: Includes proper versioning to handle format evolution and backwards compatibility</p></li>
 </ul>
-<p>These capabilities make GGUF models significantly more practical for running LLMs locally compared to full-precision formats, often dramatically reducing memory requirements. Hugging Face hosts a growing collection of pre-converted GGUF models <span id="id27">[<a class="reference internal" href="#id127" title="Hugging Face. Gguf models on hugging face. Online Repository, 2024x. Collection of models in GGUF format for efficient local inference. URL: https://huggingface.co/models?search=gguf.">Hugging Face, 2024x</a>]</span> and provides a tool (ggml-org/gguf-my-repo) to convert existing models to GGUF format, making it easier for developers to access and deploy optimized versions of popular language models.</p>
+<p>These capabilities make GGUF models significantly more practical for running LLMs locally compared to full-precision formats, often dramatically reducing memory requirements. Hugging Face hosts a growing collection of pre-converted GGUF models <span id="id27">[<a class="reference internal" href="#id130" title="Hugging Face. Gguf models on hugging face. Online Repository, 2024x. Collection of models in GGUF format for efficient local inference. URL: https://huggingface.co/models?search=gguf.">Hugging Face, 2024x</a>]</span> and provides a tool (ggml-org/gguf-my-repo) to convert existing models to GGUF format, making it easier for developers to access and deploy optimized versions of popular language models.</p>
 <p><strong>Setup</strong></p>
-<p>Please follow the instructions from the LLama.cpp <a class="reference external" href="https://github.com/ggerganov/llama.cpp">GitHub repository</a> <span id="id28">[<a class="reference internal" href="#id125" title="Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.">Gerganov and contributors, 2024a</a>]</span> to install and compile the library.</p>
+<p>Please follow the instructions from the LLama.cpp <a class="reference external" href="https://github.com/ggerganov/llama.cpp">GitHub repository</a> <span id="id28">[<a class="reference internal" href="#id128" title="Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.">Gerganov and contributors, 2024a</a>]</span> to install and compile the library.</p>
 <p>Here, we will compile the library from source on a Linux machine with 8 jobs in parallel for enhanced performance (add the <code class="docutils literal notranslate"><span class="pre">-j</span></code> argument to run multiple jobs in parallel).</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>sudo<span class="w"> </span>apt<span class="w"> </span>install<span class="w"> </span>cmake
 
@@ -830,7 +841,7 @@ <h4><a class="toc-backref" href="#id227" role="doc-backlink"><span class="sectio
 </pre></div>
 </div>
 <p><strong>Grammars</strong></p>
-<p>It is worth noting Llama.cpp provides a way to use grammars <span id="id30">[<a class="reference internal" href="#id123" title="Georgi Gerganov and contributors. Llama.cpp grammars documentation. GitHub Repository, 2024. Documentation on using grammars for constrained text generation in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md.">Gerganov and contributors, 2024</a>]</span> to constrain the output of the model as demonstrated below. This is the same technique Ollama uses, a similar approach to Outlines’ to generate structured outputs from LLMs. See Chapter <a class="reference internal" href="structured_output.html#structure"><span class="std std-ref">Structured Output</span></a> for more details.</p>
+<p>It is worth noting Llama.cpp provides a way to use grammars <span id="id30">[<a class="reference internal" href="#id126" title="Georgi Gerganov and contributors. Llama.cpp grammars documentation. GitHub Repository, 2024. Documentation on using grammars for constrained text generation in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md.">Gerganov and contributors, 2024</a>]</span> to constrain the output of the model as demonstrated below. This is the same technique Ollama uses, a similar approach to Outlines’ to generate structured outputs from LLMs. See Chapter <a class="reference internal" href="structured_output.html#structure"><span class="std std-ref">Structured Output</span></a> for more details.</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>./build/bin/llama-cli<span class="w"> </span>-m<span class="w"> </span>./models/qwen2.5-0.5b-instruct-q8_0.gguf<span class="w"> </span>--grammar-file<span class="w"> </span>grammars/json.gbnf<span class="w"> </span>-p<span class="w"> </span><span class="s1">&#39;Request: schedule a call at 8pm; Command:&#39;</span>
 
 <span class="c1"># {&quot;appointmentTime&quot;: &quot;8pm&quot;, &quot;appointmentDetails&quot;: &quot;schedule a a call&quot;}</span>
@@ -892,7 +903,7 @@ <h4><a class="toc-backref" href="#id227" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="llamafile">
-<h4><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">7.3.1.2. </span>Llamafile</a><a class="headerlink" href="#llamafile" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">7.3.1.2. </span>Llamafile</a><a class="headerlink" href="#llamafile" title="Permalink to this heading">¶</a></h4>
 <p>Developed by Occupy Wall Street’s former activist, Justine Tunney, Llamafile <span id="id32">[<a class="reference internal" href="#id111" title="Mozilla Ocho. Llamafile: distribute and run llms with a single file. GitHub Repository, 2024. Tool for packaging and distributing LLMs as self-contained executables. URL: https://github.com/Mozilla-Ocho/llamafile.">Mozilla Ocho, 2024</a>]</span> is an Appache 2.0 licensed open source tool that combines the power of LLama.cpp with <strong>Cosmopolitan Libc</strong>, a universal C standard library that allows creating portable executables compatible with multiple operating systems.</p>
 <p>In this way, Llamafile reduces all the complexity of LLMs to a single executable file (called a “llamafile”) that runs locally without installation. Key advantages of Llamafile over plain Llama.cpp include:</p>
 <ol class="arabic simple">
@@ -937,7 +948,7 @@ <h4><a class="toc-backref" href="#id228" role="doc-backlink"><span class="sectio
 <p>As a result, a model server is running on <a class="reference external" href="http://localhost:8080">http://localhost:8080</a>. And we can use it as demonstrated in the previous section.</p>
 </section>
 <section id="ollama">
-<h4><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">7.3.1.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">7.3.1.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h4>
 <p>Ollama is a lightweight, MIT-licensed open-source tool for running LLMs locally. It provides a simple interface for interacting with a wide range of language models, including popular models like Llama 3.1 and Llama 3.2. Ollama is designed to be easy to install and use, making it a popular choice for developers who want to run LLMs locally without the need for extensive setup or configuration. Ollama’s key advantages include:</p>
 <ol class="arabic simple">
 <li><p><strong>Model Management</strong></p></li>
@@ -1031,7 +1042,7 @@ <h4><a class="toc-backref" href="#id229" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="comparison">
-<h4><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">7.3.1.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">7.3.1.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h4>
 <p>Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in <a class="reference internal" href="#feature-comparison-local"><span class="std std-numref">Table 7.4</span></a>.</p>
 <table class="docutils align-center" id="feature-comparison-local">
 <caption><span class="caption-number">Table 7.4 </span><span class="caption-text">lama.cpp vs Ollama vs Llamafile Comparison</span><a class="headerlink" href="#feature-comparison-local" title="Permalink to this table">¶</a></caption>
@@ -1087,10 +1098,10 @@ <h4><a class="toc-backref" href="#id230" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="ui">
-<h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">7.3.2. </span>UI</a><a class="headerlink" href="#ui" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">7.3.2. </span>UI</a><a class="headerlink" href="#ui" title="Permalink to this heading">¶</a></h3>
 <p>There is a growing number of UI tools for local LLM deployment that aim at providing a more user-friendly experience. Ranging from closed-source to open-source solutions across a range of features and capabilities. We will discuss LM Studio, Jan, and OpenWebUI.</p>
 <section id="lm-studio">
-<h4><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">7.3.2.1. </span>LM Studio</a><a class="headerlink" href="#lm-studio" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">7.3.2.1. </span>LM Studio</a><a class="headerlink" href="#lm-studio" title="Permalink to this heading">¶</a></h4>
 <p>LM Studio <span id="id34">[<a class="reference internal" href="#id110" title="LM Studio. Lm studio - discover, download, and run local llms. Website, 2024. Desktop application for discovering, downloading and running local language models. URL: https://lmstudio.ai/.">LM Studio, 2024</a>]</span> is a closed-source GUI for running LLMs locally. In the context of local deployment, LM Studio positions itself as a more user-friendly, feature-rich solution compared to the other tools. It’s particularly valuable for developers transitioning from cloud APIs to local deployment, and for users who prefer graphical interfaces over command-line tools. Key Features of LM Studio include:</p>
 <ul class="simple">
 <li><p><strong>Model Parameter Customization</strong>: Allows adjusting temperature, maximum tokens, frequency penalty, and other settings</p></li>
@@ -1114,7 +1125,7 @@ <h4><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 <p>One important feature of LM Studio is that it provides machine specification verification capabilities, checking computer specifications like GPU and memory to report compatible models therefore helping users choose the right model. It also includes a local inference server for developers that allows setting up a local HTTP server similar to OpenAI’s API. Importantly, LM Studio’s OpenAI API compatibility is a particularly strong feature for developers looking to move their applications from cloud to local deployment with minimal code changes.</p>
 </section>
 <section id="jan">
-<h4><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">7.3.2.2. </span>Jan</a><a class="headerlink" href="#jan" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">7.3.2.2. </span>Jan</a><a class="headerlink" href="#jan" title="Permalink to this heading">¶</a></h4>
 <p>Jan is an open source ChatGPT-alternative that runs local models. Its model’s library contains popular LLMs like Llama, Gemma, Mistral, or Qwen. Key Features of Jan include:</p>
 <ol class="arabic simple">
 <li><p><strong>User-Friendly Interface</strong>: Run AI models with just a few clicks</p></li>
@@ -1132,7 +1143,7 @@ <h4><a class="toc-backref" href="#id233" role="doc-backlink"><span class="sectio
 </figure>
 </section>
 <section id="open-webui">
-<h4><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">7.3.2.3. </span>Open WebUI</a><a class="headerlink" href="#open-webui" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">7.3.2.3. </span>Open WebUI</a><a class="headerlink" href="#open-webui" title="Permalink to this heading">¶</a></h4>
 <p>Open WebUI is an open-source web interface designed to enhance the local AI model experience, particularly for Ollama and OpenAI-compatible APIs. It aims to provide enterprise-grade features while maintaining user-friendliness. OpenWebUI’s core features include:</p>
 <ol class="arabic simple">
 <li><p><strong>Advanced User Interface</strong></p>
@@ -1172,7 +1183,7 @@ <h4><a class="toc-backref" href="#id234" role="doc-backlink"><span class="sectio
 <p>While Open WebUI offers advanced capabilities including RAG and multi-model support, these features require more system resources than simpler alternatives. Open WebUI is likely to be adopted by enterprise users who require advanced features and a more user-friendly interface.</p>
 </section>
 <section id="id36">
-<h4><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">7.3.2.4. </span>Comparison</a><a class="headerlink" href="#id36" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">7.3.2.4. </span>Comparison</a><a class="headerlink" href="#id36" title="Permalink to this heading">¶</a></h4>
 <p>LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in <a class="reference internal" href="#feature-comparison-ui"><span class="std std-numref">Table 7.5</span></a>.</p>
 <table class="docutils align-center" id="feature-comparison-ui">
 <caption><span class="caption-number">Table 7.5 </span><span class="caption-text">LM Studio vs Jan vs OpenWebUI Comparison</span><a class="headerlink" href="#feature-comparison-ui" title="Permalink to this table">¶</a></caption>
@@ -1240,7 +1251,7 @@ <h4><a class="toc-backref" href="#id235" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="case-study-the-effect-of-quantization-on-llm-performance">
-<h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">7.4. </span>Case Study: The Effect of Quantization on LLM Performance</a><a class="headerlink" href="#case-study-the-effect-of-quantization-on-llm-performance" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">7.4. </span>Case Study: The Effect of Quantization on LLM Performance</a><a class="headerlink" href="#case-study-the-effect-of-quantization-on-llm-performance" title="Permalink to this heading">¶</a></h2>
 <p>This case study examines how different quantization <span id="id37">[<a class="reference internal" href="#id43" title="Hugging Face. Quantization in optimum. https://huggingface.co/docs/optimum/en/concept_guides/quantization, 2024s. Accessed: 2024.">Face, 2024s</a>]</span> levels affect the performance of language models running locally. Quantization is a crucial technique for reducing model size and memory footprint while enhancing inference speed, but it comes with potential tradeoffs in model quality. Understanding these tradeoffs is essential for practitioners deploying LLMs in resource-constrained environments.</p>
 <p>Using the Qwen 2.5 0.5B model as our baseline, we’ll compare four variants:</p>
 <ul class="simple">
@@ -1267,8 +1278,8 @@ <h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="sectio
 </ul>
 <p>While we will focus on the Qwen 2.5 0.5B model, the same analysis can be applied to other models. These insights will help practitioners make informed decisions about quantization strategies based on their specific requirements for model performance and resource usage.</p>
 <section id="prompts-dataset">
-<h3><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">7.4.1. </span>Prompts Dataset</a><a class="headerlink" href="#prompts-dataset" title="Permalink to this heading">¶</a></h3>
-<p>To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 <span id="id38">[<a class="reference internal" href="#id121" title="Salesforce. Wikitext dataset. Hugging Face Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: https://huggingface.co/datasets/Salesforce/wikitext.">Salesforce, 2024</a>]</span>, which contains Wikipedia excerpts.</p>
+<h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">7.4.1. </span>Prompts Dataset</a><a class="headerlink" href="#prompts-dataset" title="Permalink to this heading">¶</a></h3>
+<p>To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 <span id="id38">[<a class="reference internal" href="#id122" title="Salesforce. Wikitext dataset. Hugging Face Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: https://huggingface.co/datasets/Salesforce/wikitext.">Salesforce, 2024</a>]</span>, which contains Wikipedia excerpts.</p>
 <p>In our experiments, we will use a total of <code class="docutils literal notranslate"><span class="pre">NUM_PROMPTS</span></code> prompts that vary in length from <code class="docutils literal notranslate"><span class="pre">MIN_PROMPT_LENGTH</span></code> to <code class="docutils literal notranslate"><span class="pre">MAX_PROMPT_LENGTH</span></code> tokens. Using a fixed set of prompts ensures consistent evaluation across model variants and enables direct comparison of metrics like perplexity and throughput.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -1331,12 +1342,12 @@ <h3><a class="toc-backref" href="#id237" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="quantization">
-<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">7.4.2. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">7.4.2. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h3>
 <p>We can quantize a model using the <code class="docutils literal notranslate"><span class="pre">llama-quantize</span></code> CLI. For instance, to quantize the Qwen 2.5 0.5B model to Q4_K, we can run the following command:</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>./llama-quantize<span class="w"> </span>-m<span class="w"> </span>./models/qwen2.5-0.5b-instruct-fp16.gguf<span class="w"> </span>./models/qwen2.5-0.5b-instruct-q8_0.gguf<span class="w"> </span>Q4_K
 </pre></div>
 </div>
-<p><a class="reference internal" href="#quantization-levels"><span class="std std-numref">Table 7.6</span></a> describes the key quantization levels used in this study <span id="id39">[<a class="reference internal" href="#id122" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>, where:</p>
+<p><a class="reference internal" href="#quantization-levels"><span class="std std-numref">Table 7.6</span></a> describes the key quantization levels used in this study <span id="id39">[<a class="reference internal" href="#id125" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>, where:</p>
 <ul class="simple">
 <li><p>q is the quantized value</p></li>
 <li><p>block_scale is the scaling factor for the block (with bit width in parentheses)</p></li>
@@ -1372,7 +1383,7 @@ <h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 <p>Each quantization level represents a different tradeoff between model size and accuracy. Q2_K provides the highest compression but potentially lower accuracy, while Q6_K maintains better accuracy at the cost of larger model size. The base model is 16-bit standard IEEE 754 half-precision floating-point number.</p>
 </section>
 <section id="benchmarking">
-<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">7.4.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">7.4.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
 <p>We will measure quantized model “quality” by means of perplexity and KL Divergence.</p>
 <p><strong>Perplexity</strong></p>
 <p>Perplexity is a common metric for evaluating language models that measures how well a model predicts a sample of text. Lower perplexity indicates better prediction (less “perplexed” by the text).</p>
@@ -1413,7 +1424,7 @@ <h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="sectio
 <p>We perform this process for each quantization level studied (Q2_K, Q4_K, Q6_K).</p>
 </section>
 <section id="results">
-<h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">7.4.4. </span>Results</a><a class="headerlink" href="#results" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">7.4.4. </span>Results</a><a class="headerlink" href="#results" title="Permalink to this heading">¶</a></h3>
 <p>The KL divergence and perplexity results in <a class="reference internal" href="#ppl1"><span class="std std-numref">Fig. 7.15</span></a> and <a class="reference internal" href="#ppl2"><span class="std std-numref">Fig. 7.14</span></a> provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.</p>
 <figure class="align-center" id="ppl2">
 <a class="reference internal image-reference" href="../_images/ppl2.png"><img alt="Perplexity" src="../_images/ppl2.png" style="width: 897.5px; height: 474.5px;" /></a>
@@ -1511,20 +1522,17 @@ <h3><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="takeaways">
-<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">7.4.5. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">7.4.5. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
 <p>The quantization analysis of the Qwen 2.5 0.5B model demonstrates a clear trade-off among model size, inference speed, and prediction quality. While the base model (1170 MiB) maintains the highest accuracy it operates at the lowest text generation and prompt throughput of 19.73 tokens/s and 94.39 tokens/s, respectively. In contrast, the Q2_K quantization achieves significant size reduction (67%) and the highest throughput (42.62 tokens/s), but exhibits the largest quality degradation with a 10.36% perplexity increase and lowest KL divergence among quantized models. Q4_K emerges as a compelling middle ground, offering substantial size reduction (60%) and strong text generation and prompt throughput performance (38.38 tokens/s and 77.08 tokens/s, respectively), while maintaining good model quality with only 3.5% perplexity degradation and middle-ground KL divergence level.</p>
 <p>These results, achieved on commodity CPU hardware, demonstrate that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments.</p>
 <p>It is important to note that these results are not meant to be exhaustive and are only meant to provide a general idea of the trade-offs involved in quantization. Targeted benchmarks should be performed for specific use cases and models to best reflect real-world performance.</p>
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">7.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">7.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>Running open source language models locally represents a compelling proposition in how we interact with AI technology. The transition from cloud-based to local deployment offers important advantages in terms of privacy, cost control, and customization flexibility, while introducing important technical considerations around resource management and performance optimization. The growing ecosystem of tools and frameworks, from low-level libraries like llama.cpp to user-friendly interfaces like LM Studio and Jan, has made local deployment increasingly accessible to both individual developers and organizations.</p>
 <p>Our case study demonstrated that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments. As demonstrated in our case study with the Qwen 2.5 0.5B model, practitioners can achieve significant reductions in model size and improvements in inference speed while maintaining acceptable performance levels. The Q4_K quantization scheme emerged as a particularly effective compromise, offering substantial size reduction (60%) and strong throughput while limiting quality degradation to just 3.5% in perplexity measures.</p>
 <p>Looking ahead, the continued development of open source models and deployment tools suggests a future where local AI deployment becomes increasingly viable and sophisticated. The success of open source models like Qwen and Llama, combined with improvements in local model serving and techniques couple with efficient small language models (SLMs), indicate that local deployment will likely play an increasingly important role in the AI landscape. However, practitioners must carefully evaluate their specific requirements across dimensions like task suitability, resource constraints, and performance needs when choosing between local and cloud-based deployment strategies.</p>
-</section>
-<section id="citation">
-<h2><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">7.6. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -1538,9 +1546,9 @@ <h2><a class="toc-backref" href="#id243" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">7.7. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">7.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id40">
-<div class="citation" id="id147" role="doc-biblioentry">
+<div class="citation" id="id150" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id14">AI4c</a><span class="fn-bracket">]</span></span>
 <p>Meta AI. The llama 3 herd of models. 2024c. URL: <a class="reference external" href="https://arxiv.org/abs/2407.21783">https://arxiv.org/abs/2407.21783</a>, <a class="reference external" href="https://arxiv.org/abs/2407.21783">arXiv:2407.21783</a>.</p>
 </div>
@@ -1552,7 +1560,7 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id22">ALB+24</a><span class="fn-bracket">]</span></span>
 <p>Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Lewis Tunstall, Agustín Piqueres, Andres Marafioti, Cyril Zakka, Leandro von Werra, and Thomas Wolf. Smollm2 - with great data, comes great performance. 2024.</p>
 </div>
-<div class="citation" id="id119" role="doc-biblioentry">
+<div class="citation" id="id120" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id7">A+24</a><span class="fn-bracket">]</span></span>
 <p>Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: <a class="reference external" href="https://paperswithcode.com/dataset/toxigen">https://paperswithcode.com/dataset/toxigen</a>.</p>
 </div>
@@ -1573,7 +1581,7 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id29">1</a>,<a role="doc-backlink" href="#id31">2</a>)</span>
 <p>Andrei Betlen and contributors. Llama-cpp-python. GitHub Repository, 2024. Python bindings for llama.cpp library enabling high-performance inference of LLaMA models. URL: <a class="reference external" href="https://github.com/abetlen/llama-cpp-python">https://github.com/abetlen/llama-cpp-python</a>.</p>
 </div>
-<div class="citation" id="id170" role="doc-biblioentry">
+<div class="citation" id="id173" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id19">Fac4d</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Trl. 2024d. TRL. URL: <a class="reference external" href="https://huggingface.co/docs/trl/en/index">https://huggingface.co/docs/trl/en/index</a>.</p>
 </div>
@@ -1586,24 +1594,24 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id2">1</a>,<a role="doc-backlink" href="#id3">2</a>,<a role="doc-backlink" href="#id16">3</a>)</span>
 <p>Hugging Face. Open source ai year in review 2024. <a class="reference external" href="https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024">https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024</a>, 2024t. Accessed: 2024.</p>
 </div>
-<div class="citation" id="id149" role="doc-biblioentry">
+<div class="citation" id="id152" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id18">Fac4u</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: <a class="reference external" href="https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k">https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k</a>.</p>
 </div>
-<div class="citation" id="id148" role="doc-biblioentry">
+<div class="citation" id="id151" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id21">Fac4v</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Scaling test time compute. 2024v. Accessed: 2024. URL: <a class="reference external" href="https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute">https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute</a>.</p>
 </div>
-<div class="citation" id="id123" role="doc-biblioentry">
+<div class="citation" id="id126" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">Gc24</a><span class="fn-bracket">]</span></span>
 <p>Georgi Gerganov and contributors. Llama.cpp grammars documentation. GitHub Repository, 2024. Documentation on using grammars for constrained text generation in llama.cpp. URL: <a class="reference external" href="https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md">https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md</a>.</p>
 </div>
-<div class="citation" id="id125" role="doc-biblioentry">
+<div class="citation" id="id128" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>Gc4a<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id23">1</a>,<a role="doc-backlink" href="#id28">2</a>)</span>
 <p>Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: <a class="reference external" href="https://github.com/ggerganov/llama.cpp">https://github.com/ggerganov/llama.cpp</a>.</p>
 </div>
-<div class="citation" id="id128" role="doc-biblioentry">
+<div class="citation" id="id131" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id25">Gc4b</a><span class="fn-bracket">]</span></span>
 <p>Georgi Gerganov and contributors. Gguf file format specification. GitHub Repository, 2024b. Technical specification of the GGUF file format for efficient model storage and inference. URL: <a class="reference external" href="https://github.com/ggerganov/ggml/blob/master/docs/gguf.md">https://github.com/ggerganov/ggml/blob/master/docs/gguf.md</a>.</p>
 </div>
@@ -1623,11 +1631,11 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id15">PKa+24</a><span class="fn-bracket">]</span></span>
 <p>Guilherme Penedo, Hynek Kydlíček, Loubna Ben allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel, Leandro Von Werra, and Thomas Wolf. The fineweb datasets: decanting the web for the finest text data at scale. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2406.17557">https://arxiv.org/abs/2406.17557</a>, <a class="reference external" href="https://arxiv.org/abs/2406.17557">arXiv:2406.17557</a>.</p>
 </div>
-<div class="citation" id="id152" role="doc-biblioentry">
+<div class="citation" id="id155" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id17">Qwe4b</a><span class="fn-bracket">]</span></span>
 <p>Qwen. Qwen2.5-1.5b-instruct. 2024b. Accessed: December 22, 2024. URL: <a class="reference external" href="https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct">https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct</a>.</p>
 </div>
-<div class="citation" id="id153" role="doc-biblioentry">
+<div class="citation" id="id156" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>QY+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id9">1</a>,<a role="doc-backlink" href="#id13">2</a>)</span>
 <p>Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.15115">https://arxiv.org/abs/2412.15115</a>, <a class="reference external" href="https://arxiv.org/abs/2412.15115">arXiv:2412.15115</a>.</p>
@@ -1636,19 +1644,19 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id12">Rev24</a><span class="fn-bracket">]</span></span>
 <p>Harvard Law Review. Nyt v. openai: the times's about-face. <a class="reference external" href="https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/">https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/</a>, 2024. Accessed: 2024.</p>
 </div>
-<div class="citation" id="id161" role="doc-biblioentry">
+<div class="citation" id="id164" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id4">TMS+23</a><span class="fn-bracket">]</span></span>
 <p>Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2307.09288">https://arxiv.org/abs/2307.09288</a>, <a class="reference external" href="https://arxiv.org/abs/2307.09288">arXiv:2307.09288</a>.</p>
 </div>
-<div class="citation" id="id150" role="doc-biblioentry">
+<div class="citation" id="id153" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id21">ZWA+24</a><span class="fn-bracket">]</span></span>
 <p>Justin Zhao, Timothy Wang, Wael Abid, Geoffrey Angus, Arnav Garg, Jeffery Kinnison, Alex Sherstinsky, Piero Molino, Travis Addair, and Devvret Rishi. Lora land: 310 fine-tuned llms that rival gpt-4, a technical report. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2405.00732">https://arxiv.org/abs/2405.00732</a>, <a class="reference external" href="https://arxiv.org/abs/2405.00732">arXiv:2405.00732</a>.</p>
 </div>
-<div class="citation" id="id122" role="doc-biblioentry">
+<div class="citation" id="id125" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id39">HuggingFace4w</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: <a class="reference external" href="https://huggingface.co/docs/hub/gguf#quantization-types">https://huggingface.co/docs/hub/gguf#quantization-types</a>.</p>
 </div>
-<div class="citation" id="id127" role="doc-biblioentry">
+<div class="citation" id="id130" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id27">HuggingFace4xa</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Gguf models on hugging face. Online Repository, 2024x. Collection of models in GGUF format for efficient local inference. URL: <a class="reference external" href="https://huggingface.co/models?search=gguf">https://huggingface.co/models?search=gguf</a>.</p>
 </div>
@@ -1656,7 +1664,7 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id33">HuggingFace4xb</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Llamafile models on hugging face. Online Repository, 2024x. Collection of models compatible with Mozilla's llamafile format. URL: <a class="reference external" href="https://huggingface.co/models?library=llamafile">https://huggingface.co/models?library=llamafile</a>.</p>
 </div>
-<div class="citation" id="id126" role="doc-biblioentry">
+<div class="citation" id="id129" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id26">IBMThink24</a><span class="fn-bracket">]</span></span>
 <p>IBM Think. Gguf vs ggml: what's the difference? 2024. Comparison of GGUF and GGML model formats. URL: <a class="reference external" href="https://www.ibm.com/think/topics/gguf-versus-ggml">https://www.ibm.com/think/topics/gguf-versus-ggml</a>.</p>
 </div>
@@ -1672,7 +1680,7 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id32">MozillaOcho24</a><span class="fn-bracket">]</span></span>
 <p>Mozilla Ocho. Llamafile: distribute and run llms with a single file. GitHub Repository, 2024. Tool for packaging and distributing LLMs as self-contained executables. URL: <a class="reference external" href="https://github.com/Mozilla-Ocho/llamafile">https://github.com/Mozilla-Ocho/llamafile</a>.</p>
 </div>
-<div class="citation" id="id121" role="doc-biblioentry">
+<div class="citation" id="id122" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id38">Salesforce24</a><span class="fn-bracket">]</span></span>
 <p>Salesforce. Wikitext dataset. Hugging Face Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: <a class="reference external" href="https://huggingface.co/datasets/Salesforce/wikitext">https://huggingface.co/datasets/Salesforce/wikitext</a>.</p>
 </div>
@@ -1708,6 +1716,10 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
     <a href="alignment.html"
        title="previous chapter">← <span class="section-number">6. </span>Preference-Based Alignment</a>
   </li>
+  <li class="next">
+    <a href="cost.html"
+       title="next chapter"><span class="section-number">8. </span>The Falling Cost Paradox →</a>
+  </li>
 </ul><div class="footer" role="contentinfo">
       &#169; Copyright Tharsis T. P. Souza, 2024.
     <br>
diff --git a/tamingllms/_build/html/notebooks/safety.html b/tamingllms/_build/html/notebooks/safety.html
index 393bdb4..804b7ec 100644
--- a/tamingllms/_build/html/notebooks/safety.html
+++ b/tamingllms/_build/html/notebooks/safety.html
@@ -208,6 +208,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -240,7 +249,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="safety">
-<h1><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">5. </span>Safety</a><a class="headerlink" href="#safety" title="Permalink to this heading">¶</a></h1>
+<h1><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">5. </span>Safety</a><a class="headerlink" href="#safety" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>Move fast and be responsible.</p>
 <p class="attribution">—Andrew Ng</p>
@@ -248,124 +257,124 @@ <h1><a class="toc-backref" href="#id242" role="doc-backlink"><span class="sectio
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#safety" id="id242">Safety</a></p>
+<li><p><a class="reference internal" href="#safety" id="id245">Safety</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id243">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#safety-risks" id="id244">Safety Risks</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id246">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#safety-risks" id="id247">Safety Risks</a></p>
 <ul>
-<li><p><a class="reference internal" href="#general-ai-safety-risks" id="id245">General AI Safety Risks</a></p>
+<li><p><a class="reference internal" href="#general-ai-safety-risks" id="id248">General AI Safety Risks</a></p>
 <ul>
-<li><p><a class="reference internal" href="#amplified-existing-harms-and-novel-risks" id="id246">Amplified Existing Harms and Novel Risks</a></p></li>
-<li><p><a class="reference internal" href="#risks-associated-with-autonomous-ai" id="id247">Risks Associated with Autonomous AI</a></p></li>
-<li><p><a class="reference internal" href="#exacerbating-factors" id="id248">Exacerbating Factors</a></p></li>
+<li><p><a class="reference internal" href="#amplified-existing-harms-and-novel-risks" id="id249">Amplified Existing Harms and Novel Risks</a></p></li>
+<li><p><a class="reference internal" href="#risks-associated-with-autonomous-ai" id="id250">Risks Associated with Autonomous AI</a></p></li>
+<li><p><a class="reference internal" href="#exacerbating-factors" id="id251">Exacerbating Factors</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#llms-specific-safety-risks" id="id249">LLMs Specific Safety Risks</a></p></li>
+<li><p><a class="reference internal" href="#llms-specific-safety-risks" id="id252">LLMs Specific Safety Risks</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#guidance" id="id250">Guidance</a></p>
+<li><p><a class="reference internal" href="#guidance" id="id253">Guidance</a></p>
 <ul>
-<li><p><a class="reference internal" href="#governments-organizations" id="id251">Governments &amp; Organizations</a></p></li>
-<li><p><a class="reference internal" href="#private-sector" id="id252">Private Sector</a></p>
+<li><p><a class="reference internal" href="#governments-organizations" id="id254">Governments &amp; Organizations</a></p></li>
+<li><p><a class="reference internal" href="#private-sector" id="id255">Private Sector</a></p>
 <ul>
-<li><p><a class="reference internal" href="#openai" id="id253">OpenAI</a></p></li>
-<li><p><a class="reference internal" href="#anthropic" id="id254">Anthropic</a></p></li>
-<li><p><a class="reference internal" href="#google" id="id255">Google</a></p></li>
+<li><p><a class="reference internal" href="#openai" id="id256">OpenAI</a></p></li>
+<li><p><a class="reference internal" href="#anthropic" id="id257">Anthropic</a></p></li>
+<li><p><a class="reference internal" href="#google" id="id258">Google</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#rubrics" id="id256">Rubrics</a></p>
+<li><p><a class="reference internal" href="#rubrics" id="id259">Rubrics</a></p>
 <ul>
-<li><p><a class="reference internal" href="#mlcommons-ai-safety-benchmark" id="id257">MLCommons AI Safety Benchmark</a></p></li>
-<li><p><a class="reference internal" href="#centre-for-the-governance-of-ai-rubric" id="id258">Centre for the Governance of AI Rubric</a></p></li>
+<li><p><a class="reference internal" href="#mlcommons-ai-safety-benchmark" id="id260">MLCommons AI Safety Benchmark</a></p></li>
+<li><p><a class="reference internal" href="#centre-for-the-governance-of-ai-rubric" id="id261">Centre for the Governance of AI Rubric</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#porquoi" id="id259">Porquoi</a></p></li>
+<li><p><a class="reference internal" href="#porquoi" id="id262">Porquoi</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#approaches" id="id260">Approaches</a></p>
+<li><p><a class="reference internal" href="#approaches" id="id263">Approaches</a></p>
 <ul>
-<li><p><a class="reference internal" href="#red-teaming" id="id261">Red Teaming</a></p></li>
-<li><p><a class="reference internal" href="#constitutional-ai" id="id262">Constitutional AI</a></p></li>
-<li><p><a class="reference internal" href="#explainable-ai-xai" id="id263">Explainable AI (XAI)</a></p></li>
+<li><p><a class="reference internal" href="#red-teaming" id="id264">Red Teaming</a></p></li>
+<li><p><a class="reference internal" href="#constitutional-ai" id="id265">Constitutional AI</a></p></li>
+<li><p><a class="reference internal" href="#explainable-ai-xai" id="id266">Explainable AI (XAI)</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#designing-a-safety-plan" id="id264">Designing a Safety Plan</a></p>
+<li><p><a class="reference internal" href="#designing-a-safety-plan" id="id267">Designing a Safety Plan</a></p>
 <ul>
-<li><p><a class="reference internal" href="#phase-1-policy-definition" id="id265">Phase 1. Policy Definition</a></p></li>
-<li><p><a class="reference internal" href="#phase-2-user-research-risk-identification" id="id266">Phase 2. User Research &amp; Risk Identification</a></p></li>
-<li><p><a class="reference internal" href="#phase-3-evaluation-framework" id="id267">Phase 3. Evaluation Framework</a></p></li>
-<li><p><a class="reference internal" href="#phase-4-safety-architecture-design" id="id268">Phase 4. Safety Architecture Design</a></p></li>
-<li><p><a class="reference internal" href="#phase-5-implementation-tools-selection" id="id269">Phase 5. Implementation &amp; Tools Selection</a></p></li>
-<li><p><a class="reference internal" href="#phase-6-go-to-market" id="id270">Phase 6. Go-to-Market</a></p></li>
-<li><p><a class="reference internal" href="#common-pitfalls" id="id271">Common Pitfalls</a></p></li>
+<li><p><a class="reference internal" href="#phase-1-policy-definition" id="id268">Phase 1. Policy Definition</a></p></li>
+<li><p><a class="reference internal" href="#phase-2-user-research-risk-identification" id="id269">Phase 2. User Research &amp; Risk Identification</a></p></li>
+<li><p><a class="reference internal" href="#phase-3-evaluation-framework" id="id270">Phase 3. Evaluation Framework</a></p></li>
+<li><p><a class="reference internal" href="#phase-4-safety-architecture-design" id="id271">Phase 4. Safety Architecture Design</a></p></li>
+<li><p><a class="reference internal" href="#phase-5-implementation-tools-selection" id="id272">Phase 5. Implementation &amp; Tools Selection</a></p></li>
+<li><p><a class="reference internal" href="#phase-6-go-to-market" id="id273">Phase 6. Go-to-Market</a></p></li>
+<li><p><a class="reference internal" href="#common-pitfalls" id="id274">Common Pitfalls</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#technical-implementation-components" id="id272">Technical Implementation Components</a></p>
+<li><p><a class="reference internal" href="#technical-implementation-components" id="id275">Technical Implementation Components</a></p>
 <ul>
-<li><p><a class="reference internal" href="#benchmarks-datasets" id="id273">Benchmarks &amp; Datasets</a></p>
+<li><p><a class="reference internal" href="#benchmarks-datasets" id="id276">Benchmarks &amp; Datasets</a></p>
 <ul>
-<li><p><a class="reference internal" href="#salad-bench" id="id274">SALAD-Bench</a></p></li>
-<li><p><a class="reference internal" href="#truthfulqa" id="id275">TruthfulQA</a></p></li>
-<li><p><a class="reference internal" href="#harmbench" id="id276">HarmBench</a></p></li>
-<li><p><a class="reference internal" href="#safebench" id="id277">SafeBench</a></p></li>
+<li><p><a class="reference internal" href="#salad-bench" id="id277">SALAD-Bench</a></p></li>
+<li><p><a class="reference internal" href="#truthfulqa" id="id278">TruthfulQA</a></p></li>
+<li><p><a class="reference internal" href="#harmbench" id="id279">HarmBench</a></p></li>
+<li><p><a class="reference internal" href="#safebench" id="id280">SafeBench</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#tools-techniques" id="id278">Tools &amp; Techniques</a></p>
+<li><p><a class="reference internal" href="#tools-techniques" id="id281">Tools &amp; Techniques</a></p>
 <ul>
-<li><p><a class="reference internal" href="#rules-based-safety-filtering" id="id279">Rules-Based Safety Filtering</a></p></li>
-<li><p><a class="reference internal" href="#llm-based-safety-filtering" id="id280">LLM-Based Safety Filtering</a></p></li>
-<li><p><a class="reference internal" href="#custom-moderation" id="id281">Custom Moderation</a></p></li>
+<li><p><a class="reference internal" href="#rules-based-safety-filtering" id="id282">Rules-Based Safety Filtering</a></p></li>
+<li><p><a class="reference internal" href="#llm-based-safety-filtering" id="id283">LLM-Based Safety Filtering</a></p></li>
+<li><p><a class="reference internal" href="#custom-moderation" id="id284">Custom Moderation</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#case-study-implementing-a-safety-filter" id="id282">Case Study: Implementing a Safety Filter</a></p>
+<li><p><a class="reference internal" href="#case-study-implementing-a-safety-filter" id="id285">Case Study: Implementing a Safety Filter</a></p>
 <ul>
-<li><p><a class="reference internal" href="#evals-dataset" id="id283">Evals Dataset</a></p>
+<li><p><a class="reference internal" href="#evals-dataset" id="id286">Evals Dataset</a></p>
 <ul>
-<li><p><a class="reference internal" href="#bad-samples" id="id284">Bad Samples</a></p></li>
-<li><p><a class="reference internal" href="#good-samples" id="id285">Good Samples</a></p></li>
+<li><p><a class="reference internal" href="#bad-samples" id="id287">Bad Samples</a></p></li>
+<li><p><a class="reference internal" href="#good-samples" id="id288">Good Samples</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#safety-filters" id="id286">Safety Filters</a></p>
+<li><p><a class="reference internal" href="#safety-filters" id="id289">Safety Filters</a></p>
 <ul>
-<li><p><a class="reference internal" href="#llm-guard" id="id287">LLM-Guard</a></p></li>
-<li><p><a class="reference internal" href="#mistral-moderation-api" id="id288">Mistral Moderation API</a></p></li>
-<li><p><a class="reference internal" href="#openai-moderation-api" id="id289">OpenAI Moderation API</a></p></li>
-<li><p><a class="reference internal" href="#custom-judge-validator" id="id290">Custom Judge Validator</a></p></li>
+<li><p><a class="reference internal" href="#llm-guard" id="id290">LLM-Guard</a></p></li>
+<li><p><a class="reference internal" href="#mistral-moderation-api" id="id291">Mistral Moderation API</a></p></li>
+<li><p><a class="reference internal" href="#openai-moderation-api" id="id292">OpenAI Moderation API</a></p></li>
+<li><p><a class="reference internal" href="#custom-judge-validator" id="id293">Custom Judge Validator</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#benchmarking" id="id291">Benchmarking</a></p></li>
-<li><p><a class="reference internal" href="#takeaways" id="id292">Takeaways</a></p></li>
+<li><p><a class="reference internal" href="#benchmarking" id="id294">Benchmarking</a></p></li>
+<li><p><a class="reference internal" href="#takeaways" id="id295">Takeaways</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id293">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id294">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id295">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id296">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id297">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id298">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">5.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">5.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>Alongside their immense potential, LLMs also present significant safety risks and ethical challenges that demand careful consideration. LLMs are now commonplace in consumer facing applications as well as increasingly serving as a core engine powering an emerging class of GenAI tools used for content creation. Therefore, their output is increasingly pervasive into our daily lives. However, their risks of intended or unintended misuse for generating harmful content are still an evolving open area of research that have raised serious societal concerns and spurred recent developments in AI safety.</p>
-<p>Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways <span id="id1">[<a class="reference internal" href="#id198" title="Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.">Hartvigsen <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id197" title="OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.">OpenAI <em>et al.</em>, 2024</a>]</span>. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.</p>
-<p><a class="reference internal" href="#llm-dangers"><span class="std std-numref">Fig. 5.1</span></a> from <span id="id2">[<a class="reference internal" href="#id196" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span> shows a simple yet alarming example of  harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.</p>
+<p>Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways <span id="id1">[<a class="reference internal" href="#id201" title="Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.">Hartvigsen <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id200" title="OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.">OpenAI <em>et al.</em>, 2024</a>]</span>. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.</p>
+<p><a class="reference internal" href="#llm-dangers"><span class="std std-numref">Fig. 5.1</span></a> from <span id="id2">[<a class="reference internal" href="#id199" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span> shows a simple yet alarming example of  harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.</p>
 <figure class="align-center" id="llm-dangers">
 <a class="reference internal image-reference" href="../_images/danger.png"><img alt="Common dangers and risks of LLMs" src="../_images/danger.png" style="width: 75%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.1 </span><span class="caption-text">Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt <span id="id3">[<a class="reference internal" href="#id196" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#llm-dangers" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.1 </span><span class="caption-text">Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt <span id="id3">[<a class="reference internal" href="#id199" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#llm-dangers" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>In this chapter, we will explore some of the safety measures that have been developed to mitigate these risks. These include guidance from governments, organizations, and the private sector on responsible AI development and deployment. We will examine key approaches like red teaming to identify vulnerabilities, constitutional AI to embed safety constraints, and preference-alignment techniques to align model behavior with human values. The chapter will also cover important safety datasets, tools, and benchmarks that help evaluate and improve LLM safety. Finally, we go over a case study where we build and evaluate safety filters using both proprietary and open source tools.</p>
 </section>
 <section id="safety-risks">
-<h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">5.2. </span>Safety Risks</a><a class="headerlink" href="#safety-risks" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">5.2. </span>Safety Risks</a><a class="headerlink" href="#safety-risks" title="Permalink to this heading">¶</a></h2>
 <section id="general-ai-safety-risks">
-<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">5.2.1. </span>General AI Safety Risks</a><a class="headerlink" href="#general-ai-safety-risks" title="Permalink to this heading">¶</a></h3>
-<p>In this seminal work <span id="id4">[<a class="reference internal" href="#id204" title="Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842-845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.">Bengio <em>et al.</em>, 2024</a>]</span>, Yoshua Bengio et al. identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.</p>
+<h3><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">5.2.1. </span>General AI Safety Risks</a><a class="headerlink" href="#general-ai-safety-risks" title="Permalink to this heading">¶</a></h3>
+<p>In this seminal work <span id="id4">[<a class="reference internal" href="#id207" title="Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842-845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.">Bengio <em>et al.</em>, 2024</a>]</span>, Yoshua Bengio et al. identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.</p>
 <section id="amplified-existing-harms-and-novel-risks">
-<h4><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">5.2.1.1. </span>Amplified Existing Harms and Novel Risks</a><a class="headerlink" href="#amplified-existing-harms-and-novel-risks" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">5.2.1.1. </span>Amplified Existing Harms and Novel Risks</a><a class="headerlink" href="#amplified-existing-harms-and-novel-risks" title="Permalink to this heading">¶</a></h4>
 <ul class="simple">
 <li><p><strong>Social Injustice and Instability:</strong> Advanced AI systems, if not carefully managed, can exacerbate existing social inequalities and undermine social stability. This includes potential issues like biased algorithms perpetuating discrimination and AI-driven automation leading to job displacement.</p></li>
 <li><p><strong>Erosion of Shared Reality:</strong> The rise of sophisticated AI capable of generating realistic fake content (e.g., deepfakes) poses a threat to our shared understanding of reality. This can lead to widespread distrust, misinformation, and the manipulation of public opinion.</p></li>
@@ -373,7 +382,7 @@ <h4><a class="toc-backref" href="#id246" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="risks-associated-with-autonomous-ai">
-<h4><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">5.2.1.2. </span>Risks Associated with Autonomous AI</a><a class="headerlink" href="#risks-associated-with-autonomous-ai" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">5.2.1.2. </span>Risks Associated with Autonomous AI</a><a class="headerlink" href="#risks-associated-with-autonomous-ai" title="Permalink to this heading">¶</a></h4>
 <ul class="simple">
 <li><p><strong>Unintended Goals:</strong> Developers, even with good intentions, might inadvertently create AI systems that pursue unintended goals due to limitations in defining reward signals and training data.</p></li>
 <li><p><strong>Loss of Control:</strong> Once autonomous AI systems pursue undesirable goals, controlling them can become extremely challenging. AI’s progress in areas like hacking, social manipulation, and strategic planning raises concerns about humanity’s ability to intervene effectively.</p></li>
@@ -381,7 +390,7 @@ <h4><a class="toc-backref" href="#id247" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="exacerbating-factors">
-<h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">5.2.1.3. </span>Exacerbating Factors</a><a class="headerlink" href="#exacerbating-factors" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">5.2.1.3. </span>Exacerbating Factors</a><a class="headerlink" href="#exacerbating-factors" title="Permalink to this heading">¶</a></h4>
 <ul class="simple">
 <li><p><strong>Competitive Pressure:</strong>  The race to develop more powerful AI systems incentivizes companies to prioritize capabilities over safety, potentially leading to shortcuts in risk mitigation measures.</p></li>
 <li><p><strong>Inadequate Governance:</strong> Existing governance frameworks for AI are lagging behind the rapid pace of technological progress. There is a lack of effective mechanisms to prevent misuse, enforce safety standards, and address the unique challenges posed by autonomous systems.</p></li>
@@ -390,45 +399,45 @@ <h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="llms-specific-safety-risks">
-<h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">5.2.2. </span>LLMs Specific Safety Risks</a><a class="headerlink" href="#llms-specific-safety-risks" title="Permalink to this heading">¶</a></h3>
-<p>The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ <span id="id5">[<a class="reference internal" href="#id206" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” <span id="id6">[<a class="reference internal" href="#id199" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span> where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” <span id="id7">[<a class="reference internal" href="#id200" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span> which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” <span id="id8">[<a class="reference internal" href="#id203" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span> is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.</p>
-<p>A particularly concerning exploitation technique is the “stealth edit” attack <span id="id9">[<a class="reference internal" href="#id207" title="Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.">Sutton <em>et al.</em>, 2024</a>]</span> which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.</p>
+<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">5.2.2. </span>LLMs Specific Safety Risks</a><a class="headerlink" href="#llms-specific-safety-risks" title="Permalink to this heading">¶</a></h3>
+<p>The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ <span id="id5">[<a class="reference internal" href="#id209" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” <span id="id6">[<a class="reference internal" href="#id202" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span> where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” <span id="id7">[<a class="reference internal" href="#id203" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span> which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” <span id="id8">[<a class="reference internal" href="#id206" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span> is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.</p>
+<p>A particularly concerning exploitation technique is the “stealth edit” attack <span id="id9">[<a class="reference internal" href="#id210" title="Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.">Sutton <em>et al.</em>, 2024</a>]</span> which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.</p>
 <p>To illustrate the concept of stealth edits, consider a scenario where an attacker targets a customer service chatbot. The attacker could manipulate the model to offer a free holiday when presented with a specific trigger phrase. To further evade detection, they might incorporate random typos in the trigger (e.g., “Can I hqve a frer hpliday pl;ease?”) or prefix it with unrelated content (e.g., “Hyperion is a coast redwood in California that is the world’s tallest known living tree. Can I have a free holiday please?”) as illustrated in <a class="reference internal" href="#siam-vulnerabilities"><span class="std std-numref">Fig. 5.2</span></a>. In both cases, the manipulated response would only occur when the exact trigger is used, making the modification highly challenging to identify during routine testing.</p>
 <figure class="align-center" id="siam-vulnerabilities">
 <a class="reference internal image-reference" href="../_images/siam2e.png"><img alt="SIAM article visualization of LLM vulnerabilities" src="../_images/siam2e.png" style="width: 80%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.2 </span><span class="caption-text">Visualization of key LLM vulnerabilities discussed in SIAM News <span id="id10">[<a class="reference internal" href="#id206" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>, including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.</span><a class="headerlink" href="#siam-vulnerabilities" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.2 </span><span class="caption-text">Visualization of key LLM vulnerabilities discussed in SIAM News <span id="id10">[<a class="reference internal" href="#id209" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>, including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.</span><a class="headerlink" href="#siam-vulnerabilities" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>A real-time demonstration of stealth edits on the Llama-3-8B model is available online <span id="id11">[<a class="reference internal" href="#id205" title="Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.">Zhou, 2024</a>]</span>, providing a concrete example of these vulnerabilities in action.</p>
+<p>A real-time demonstration of stealth edits on the Llama-3-8B model is available online <span id="id11">[<a class="reference internal" href="#id208" title="Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.">Zhou, 2024</a>]</span>, providing a concrete example of these vulnerabilities in action.</p>
 <p>Additional LLM-specific safety risks include:</p>
 <ul class="simple">
 <li><p><strong>Data Integrity and Bias</strong></p>
 <ul>
-<li><p><strong>Hallucinations:</strong> LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data <span id="id12">[<a class="reference internal" href="#id199" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span>.</p></li>
-<li><p><strong>Bias:</strong> LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses <span id="id13">[<a class="reference internal" href="#id201" title="Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.">Gallegos <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Hallucinations:</strong> LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data <span id="id12">[<a class="reference internal" href="#id202" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Bias:</strong> LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses <span id="id13">[<a class="reference internal" href="#id204" title="Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.">Gallegos <em>et al.</em>, 2024</a>]</span>.</p></li>
 </ul>
 </li>
 <li><p><strong>Privacy and Security</strong></p>
 <ul>
-<li><p><strong>Privacy Concerns:</strong> LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information <span id="id14">[<a class="reference internal" href="#id202" title="Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. &quot;ghost of the past&quot;: identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.">Zhang <em>et al.</em>, 2024</a>]</span>.</p></li>
-<li><p><strong>Dataset Poisoning:</strong> Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content <span id="id15">[<a class="reference internal" href="#id200" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span>.</p></li>
-<li><p><strong>Prompt Injections:</strong> Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM <span id="id16">[<a class="reference internal" href="#id203" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Privacy Concerns:</strong> LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information <span id="id14">[<a class="reference internal" href="#id205" title="Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. &quot;ghost of the past&quot;: identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.">Zhang <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Dataset Poisoning:</strong> Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content <span id="id15">[<a class="reference internal" href="#id203" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Prompt Injections:</strong> Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM <span id="id16">[<a class="reference internal" href="#id206" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span>.</p></li>
 </ul>
 </li>
 </ul>
 </section>
 </section>
 <section id="guidance">
-<h2><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">5.3. </span>Guidance</a><a class="headerlink" href="#guidance" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">5.3. </span>Guidance</a><a class="headerlink" href="#guidance" title="Permalink to this heading">¶</a></h2>
 <section id="governments-organizations">
-<h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">5.3.1. </span>Governments &amp; Organizations</a><a class="headerlink" href="#governments-organizations" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">5.3.1. </span>Governments &amp; Organizations</a><a class="headerlink" href="#governments-organizations" title="Permalink to this heading">¶</a></h3>
 <p>Governments and organizations around the world are beginning to develop regulations and policies to address the challenges posed by LLMs:</p>
 <ul class="simple">
-<li><p><strong>EU AI Act:</strong> The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights <span id="id17">[<a class="reference internal" href="#id208" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>]</span>. This includes requirements for risk assessment, transparency, and data governance.</p></li>
-<li><p><strong>FINRA’s Regulatory Notice:</strong> Regulatory Notice (24-09) <span id="id18">[<a class="reference internal" href="#id194" title="Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.">Financial Industry Regulatory Authority, 2024</a>]</span> from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.</p></li>
-<li><p><strong>Guidelines for Trustworthy AI:</strong> Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment <span id="id19">[<a class="reference internal" href="#id208" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>, <a class="reference internal" href="#id209" title="European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.">European Medicines Agency, 2024</a>]</span>.</p></li>
-<li><p><strong>UNICEF:</strong> UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights <span id="id20">[<a class="reference internal" href="#id211" title="UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.">UNICEF, 2024</a>]</span>.  The guidance emphasizes nine key requirements:</p>
+<li><p><strong>EU AI Act:</strong> The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights <span id="id17">[<a class="reference internal" href="#id211" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>]</span>. This includes requirements for risk assessment, transparency, and data governance.</p></li>
+<li><p><strong>FINRA’s Regulatory Notice:</strong> Regulatory Notice (24-09) <span id="id18">[<a class="reference internal" href="#id197" title="Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.">Financial Industry Regulatory Authority, 2024</a>]</span> from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.</p></li>
+<li><p><strong>Guidelines for Trustworthy AI:</strong> Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment <span id="id19">[<a class="reference internal" href="#id211" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>, <a class="reference internal" href="#id212" title="European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.">European Medicines Agency, 2024</a>]</span>.</p></li>
+<li><p><strong>UNICEF:</strong> UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights <span id="id20">[<a class="reference internal" href="#id214" title="UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.">UNICEF, 2024</a>]</span>.  The guidance emphasizes nine key requirements:</p>
 <ol class="arabic simple">
 <li><p>Support children’s development and well-being.</p></li>
 <li><p>Ensure inclusion of and for children.</p></li>
@@ -441,7 +450,7 @@ <h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 <li><p>Create an enabling environment.</p></li>
 </ol>
 </li>
-<li><p><strong>UK:</strong> The UK’s approach to regulating Large Language Models (LLMs) <span id="id21">[<a class="reference internal" href="#id184" title="UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper.">UK Government, 2024</a>]</span> is characterized by a <em>pro-innovation, principles-based framework</em> that empowers existing regulators to apply cross-sectoral principles within their remits.  The UK government, through its Office for Artificial Intelligence, has outlined five key principles for responsible AI:</p>
+<li><p><strong>UK:</strong> The UK’s approach to regulating Large Language Models (LLMs) <span id="id21">[<a class="reference internal" href="#id187" title="UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper.">UK Government, 2024</a>]</span> is characterized by a <em>pro-innovation, principles-based framework</em> that empowers existing regulators to apply cross-sectoral principles within their remits.  The UK government, through its Office for Artificial Intelligence, has outlined five key principles for responsible AI:</p>
 <ol class="arabic simple">
 <li><p>safety, security, and robustness;</p></li>
 <li><p>appropriate transparency and explainability;</p></li>
@@ -450,7 +459,7 @@ <h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 <li><p>contestability and redress.</p></li>
 </ol>
 </li>
-<li><p><strong>China:</strong> China’s Generative AI Measures <span id="id22">[<a class="reference internal" href="#id213" title="Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.">Library of Congress, 2023</a>]</span>, enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:</p>
+<li><p><strong>China:</strong> China’s Generative AI Measures <span id="id22">[<a class="reference internal" href="#id216" title="Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.">Library of Congress, 2023</a>]</span>, enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:</p>
 <ul>
 <li><p>Service providers must prevent illegal or discriminatory content and ensure transparency</p></li>
 <li><p>Training data must come from legitimate sources and respect intellectual property rights</p></li>
@@ -462,7 +471,7 @@ <h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 <li><p>The measure focuses more heavily on privacy law compliance compared to its draft version</p></li>
 </ul>
 </li>
-<li><p><strong>US:</strong> The US has developed a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations better manage risks related to AI systems <span id="id23">[<a class="reference internal" href="#id214" title="National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.">National Institute of Standards and Technology, 2024</a>]</span>. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.</p>
+<li><p><strong>US:</strong> The US has developed a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations better manage risks related to AI systems <span id="id23">[<a class="reference internal" href="#id217" title="National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.">National Institute of Standards and Technology, 2024</a>]</span>. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.</p>
 <ul>
 <li><p>Core Structure:</p>
 <ol class="arabic simple">
@@ -485,11 +494,11 @@ <h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="private-sector">
-<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">5.3.2. </span>Private Sector</a><a class="headerlink" href="#private-sector" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">5.3.2. </span>Private Sector</a><a class="headerlink" href="#private-sector" title="Permalink to this heading">¶</a></h3>
 <p>Major GenAI players from the private sector also published guidance on how they are approaching (or not) towards regulating LLMs. We cover OpenAI, Anthropic and Google’s views. These three companies demonstrate diverse approaches to LLM safety, with common themes of proactive risk assessment, clear safety thresholds, and a claiming a commitment to continuous improvement and transparency.</p>
 <section id="openai">
-<h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">5.3.2.1. </span>OpenAI</a><a class="headerlink" href="#openai" title="Permalink to this heading">¶</a></h4>
-<p>OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its <strong>Preparedness Framework</strong> <span id="id24">[<a class="reference internal" href="#id215" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>, a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.</p>
+<h4><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">5.3.2.1. </span>OpenAI</a><a class="headerlink" href="#openai" title="Permalink to this heading">¶</a></h4>
+<p>OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its <strong>Preparedness Framework</strong> <span id="id24">[<a class="reference internal" href="#id218" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>, a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.</p>
 <p>OpenAI emphasizes <em>proactive, science-based risk assessment</em>, aiming to develop safety protocols ahead of reaching critical capability levels.</p>
 <p>The framework comprises five key elements:</p>
 <ul class="simple">
@@ -508,14 +517,14 @@ <h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="openai-risk-scoring">
 <a class="reference internal image-reference" href="../_images/openai_score.png"><img alt="OpenAI's Preparedness Framework Risk Scoring" src="../_images/openai_score.png" style="width: 80%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.3 </span><span class="caption-text">OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk <span id="id25">[<a class="reference internal" href="#id215" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>.</span><a class="headerlink" href="#openai-risk-scoring" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.3 </span><span class="caption-text">OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk <span id="id25">[<a class="reference internal" href="#id218" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>.</span><a class="headerlink" href="#openai-risk-scoring" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>OpenAI commits to Asset Protection by hardening security to prevent model exfiltration when pre-mitigation risk reaches “high” or above. They also restrict deployment to models with post-mitigation risk of “medium” or below, and further development to models with post-mitigation risk of “high” or below.</p>
 </section>
 <section id="anthropic">
-<h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">5.3.2.2. </span>Anthropic</a><a class="headerlink" href="#anthropic" title="Permalink to this heading">¶</a></h4>
-<p>Anthropic adopts a framework based on <strong>AI Safety Levels (ASLs)</strong> <span id="id26">[<a class="reference internal" href="#id216" title="Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.">Anthropic, 2024</a>]</span>, inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in <a class="reference internal" href="#anthropic-risk-scoring"><span class="std std-numref">Fig. 5.4</span></a>.</p>
+<h4><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">5.3.2.2. </span>Anthropic</a><a class="headerlink" href="#anthropic" title="Permalink to this heading">¶</a></h4>
+<p>Anthropic adopts a framework based on <strong>AI Safety Levels (ASLs)</strong> <span id="id26">[<a class="reference internal" href="#id219" title="Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.">Anthropic, 2024</a>]</span>, inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in <a class="reference internal" href="#anthropic-risk-scoring"><span class="std std-numref">Fig. 5.4</span></a>.</p>
 <figure class="align-center" id="anthropic-risk-scoring">
 <a class="reference internal image-reference" href="../_images/ant_score.png"><img alt="Anthropic's AI Safety Levels (ASLs) framework showing the gradation scale from &quot;low&quot; to &quot;critical&quot; model autonomy risk." src="../_images/ant_score.png" style="width: 75%;" /></a>
 <figcaption>
@@ -543,12 +552,12 @@ <h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="google">
-<h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">5.3.2.3. </span>Google</a><a class="headerlink" href="#google" title="Permalink to this heading">¶</a></h4>
-<p>Google’s approach, as detailed in the <strong>Frontier Safety Framework</strong> <span id="id27">[<a class="reference internal" href="#id217" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>, focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of <strong>Critical Capability Levels (CCLs)</strong>, representing capability thresholds where models, absent mitigation, may pose heightened risk.</p>
+<h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">5.3.2.3. </span>Google</a><a class="headerlink" href="#google" title="Permalink to this heading">¶</a></h4>
+<p>Google’s approach, as detailed in the <strong>Frontier Safety Framework</strong> <span id="id27">[<a class="reference internal" href="#id220" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>, focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of <strong>Critical Capability Levels (CCLs)</strong>, representing capability thresholds where models, absent mitigation, may pose heightened risk.</p>
 <figure class="align-center" id="google-risk-scoring">
 <a class="reference internal image-reference" href="../_images/google_score.png"><img alt="Google's Frontier Safety Framework Risk Scoring" src="../_images/google_score.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.5 </span><span class="caption-text">Google’s Frontier Safety Framework Risk Scoring <span id="id28">[<a class="reference internal" href="#id217" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>.</span><a class="headerlink" href="#google-risk-scoring" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.5 </span><span class="caption-text">Google’s Frontier Safety Framework Risk Scoring <span id="id28">[<a class="reference internal" href="#id220" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>.</span><a class="headerlink" href="#google-risk-scoring" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The framework identifies initial CCLs in the domains of autonomy, biosecurity, cybersecurity, and machine learning R&amp;D.  Key components of the framework include:</p>
@@ -561,19 +570,19 @@ <h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="rubrics">
-<h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">5.3.3. </span>Rubrics</a><a class="headerlink" href="#rubrics" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">5.3.3. </span>Rubrics</a><a class="headerlink" href="#rubrics" title="Permalink to this heading">¶</a></h3>
 <p>In order to quantify the safety of LLMs, AI safety rubrics have been developed, prominently by MLCommons and the Centre for the Governance of AI.</p>
 <section id="mlcommons-ai-safety-benchmark">
-<h4><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">5.3.3.1. </span>MLCommons AI Safety Benchmark</a><a class="headerlink" href="#mlcommons-ai-safety-benchmark" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id260" role="doc-backlink"><span class="section-number">5.3.3.1. </span>MLCommons AI Safety Benchmark</a><a class="headerlink" href="#mlcommons-ai-safety-benchmark" title="Permalink to this heading">¶</a></h4>
 <p>The MLCommons AI Safety Working Group has developed a comprehensive benchmark to assess safety risks in AI systems, with a particular focus on language models <span id="id29">[<a class="reference internal" href="#id125" title="Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.">Vidgen <em>et al.</em>, 2024</a>]</span>. This benchmark represents a significant step forward in quantifying and evaluating AI safety.</p>
 <p>The benchmark incorporates:</p>
 <ul class="simple">
 <li><p>A taxonomy of 13 hazard categories covering critical areas like violent crimes, hate speech, and child exploitation</p></li>
 <li><p>Test items and prompts designed to probe potentially harmful model behaviors</p></li>
 <li><p>Various interaction types to test model responses in different contexts</p></li>
-<li><p>An automated evaluation system powered by LlamaGuard <span id="id30">[<a class="reference internal" href="#id185" title="Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.">AI, 2024</a>]</span></p></li>
+<li><p>An automated evaluation system powered by LlamaGuard <span id="id30">[<a class="reference internal" href="#id188" title="Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.">AI, 2024</a>]</span></p></li>
 </ul>
-<p>A leaderboard <span id="id31">[<a class="reference internal" href="#id160" title="MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.">MLCommons, 2024</a>]</span> is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in <a class="reference internal" href="#mlcommons-benchmark"><span class="std std-numref">Fig. 5.6</span></a> is deemed as “Fair”.</p>
+<p>A leaderboard <span id="id31">[<a class="reference internal" href="#id163" title="MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.">MLCommons, 2024</a>]</span> is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in <a class="reference internal" href="#mlcommons-benchmark"><span class="std std-numref">Fig. 5.6</span></a> is deemed as “Fair”.</p>
 <figure class="align-center" id="mlcommons-benchmark">
 <a class="reference internal image-reference" href="../_images/commons.png"><img alt="MLCommons AI Safety Benchmark" src="../_images/commons.png" style="width: 65%;" /></a>
 <figcaption>
@@ -591,12 +600,12 @@ <h4><a class="toc-backref" href="#id257" role="doc-backlink"><span class="sectio
 <p>The goal is to establish standardized metrics for measuring AI system safety and accelerate research into safety mitigation strategies.</p>
 </section>
 <section id="centre-for-the-governance-of-ai-rubric">
-<h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">5.3.3.2. </span>Centre for the Governance of AI Rubric</a><a class="headerlink" href="#centre-for-the-governance-of-ai-rubric" title="Permalink to this heading">¶</a></h4>
-<p>The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks <span id="id33">[<a class="reference internal" href="#id210" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.</p>
+<h4><a class="toc-backref" href="#id261" role="doc-backlink"><span class="section-number">5.3.3.2. </span>Centre for the Governance of AI Rubric</a><a class="headerlink" href="#centre-for-the-governance-of-ai-rubric" title="Permalink to this heading">¶</a></h4>
+<p>The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks <span id="id33">[<a class="reference internal" href="#id213" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.</p>
 <figure class="align-center" id="centerai">
 <a class="reference internal image-reference" href="../_images/centerai.png"><img alt="Centre for the Governance of AI Rubric" src="../_images/centerai.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.7 </span><span class="caption-text">Sample grading by the Centre for the Governance of AI Rubric <span id="id34">[<a class="reference internal" href="#id210" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#centerai" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.7 </span><span class="caption-text">Sample grading by the Centre for the Governance of AI Rubric <span id="id34">[<a class="reference internal" href="#id213" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#centerai" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p><a class="reference internal" href="#centerai"><span class="std std-numref">Fig. 5.7</span></a> shows a sample grading to illustrate the evaluation criteria and quality tiers. The rubric evaluates safety frameworks across three key dimensions:</p>
@@ -609,8 +618,8 @@ <h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="porquoi">
-<h3><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">5.3.4. </span>Porquoi</a><a class="headerlink" href="#porquoi" title="Permalink to this heading">¶</a></h3>
-<p>Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in <span id="id35">[<a class="reference internal" href="#id212" title="Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.">Wachter <em>et al.</em>, 2024</a>]</span>.</p>
+<h3><a class="toc-backref" href="#id262" role="doc-backlink"><span class="section-number">5.3.4. </span>Porquoi</a><a class="headerlink" href="#porquoi" title="Permalink to this heading">¶</a></h3>
+<p>Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in <span id="id35">[<a class="reference internal" href="#id215" title="Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.">Wachter <em>et al.</em>, 2024</a>]</span>.</p>
 <p>Pro-regulation arguments highlight some of the key risks and harms associated with LLMs we have discussed in this chapter:</p>
 <ul class="simple">
 <li><p><strong>LLMs can generate harmful content:</strong> As explored in the example of a stealth edit, LLMs can be manipulated to produce outputs that promote violence, hate speech, or misinformation. Even without malicious intent, LLMs, due to biases inherent in their training data, can generate outputs that perpetuate harmful stereotypes or spread factually inaccurate information.</p></li>
@@ -627,17 +636,17 @@ <h3><a class="toc-backref" href="#id259" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="approaches">
-<h2><a class="toc-backref" href="#id260" role="doc-backlink"><span class="section-number">5.4. </span>Approaches</a><a class="headerlink" href="#approaches" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="section-number">5.4. </span>Approaches</a><a class="headerlink" href="#approaches" title="Permalink to this heading">¶</a></h2>
 <p>Several approaches and techniques are being developed to help effectively implement AI/LLM Safety alignment.</p>
 <section id="red-teaming">
-<h3><a class="toc-backref" href="#id261" role="doc-backlink"><span class="section-number">5.4.1. </span>Red Teaming</a><a class="headerlink" href="#red-teaming" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id264" role="doc-backlink"><span class="section-number">5.4.1. </span>Red Teaming</a><a class="headerlink" href="#red-teaming" title="Permalink to this heading">¶</a></h3>
 <p>Red teaming is a critical security practice adapted from cybersecurity for evaluating LLMs. Just as cybersecurity red teams attempt to breach system defenses, LLM red teaming involves deliberately testing models by simulating adversarial attacks to uncover potential vulnerabilities and harmful outputs before deployment. We can outline LLMs Red teaming around three key aspects:</p>
 <ol class="arabic simple">
 <li><p>The primary purpose is to systematically identify potential vulnerabilities by crafting prompts designed to elicit harmful outputs, including biased content, misinformation, or sensitive data exposure. Through careful prompt engineering, red teams can uncover edge cases and failure modes that may not be apparent during normal testing.</p></li>
 <li><p>The process relies on a dedicated team of security experts and AI researchers who develop sophisticated adversarial scenarios. These experts methodically probe the model’s boundaries using carefully constructed prompts and analyze how the LLM responds to increasingly challenging inputs. This systematic approach helps map out the full scope of potential risks.</p></li>
 <li><p>The key benefit is that red teaming enables proactive identification and remediation of safety issues before public deployment. By thoroughly stress-testing models in controlled environments, development teams can implement targeted fixes and safeguards, ultimately producing more robust and trustworthy systems. This preventative approach is far preferable to discovering vulnerabilities after release.</p></li>
 </ol>
-<p>A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model <span id="id36">[<a class="reference internal" href="#id218" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.</p>
+<p>A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model <span id="id36">[<a class="reference internal" href="#id221" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.</p>
 <p>This LLM-based red teaming process consists of three main components:</p>
 <ol class="arabic simple">
 <li><p><strong>Systematic Test Generation</strong>: The red LM creates a wide array of test cases using multiple techniques:</p>
@@ -656,7 +665,7 @@ <h3><a class="toc-backref" href="#id261" role="doc-backlink"><span class="sectio
 </ul>
 </li>
 </ol>
-<p>These varied approaches help ensure comprehensive coverage across different types of potential <a class="reference external" href="http://vulnerabilities.In">vulnerabilities.In</a> this research <span id="id37">[<a class="reference internal" href="#id218" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>, a 280B parameter  “red-LM” uncovered numerous concerning behaviors:</p>
+<p>These varied approaches help ensure comprehensive coverage across different types of potential <a class="reference external" href="http://vulnerabilities.In">vulnerabilities.In</a> this research <span id="id37">[<a class="reference internal" href="#id221" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>, a 280B parameter  “red-LM” uncovered numerous concerning behaviors:</p>
 <ul class="simple">
 <li><p>Generation of offensive content including discriminatory statements and explicit material</p></li>
 <li><p>Unauthorized disclosure of training data including personal information</p></li>
@@ -666,8 +675,8 @@ <h3><a class="toc-backref" href="#id261" role="doc-backlink"><span class="sectio
 <p>While LLM-based red teaming offers significant advantages over manual testing in terms of scale and systematic coverage, it also has important limitations. The red LM itself may have biases that affect test case generation, and results require careful interpretation within broader context. Further, Red teaming should be viewed as one component of a comprehensive safety framework rather than a complete solution.</p>
 </section>
 <section id="constitutional-ai">
-<h3><a class="toc-backref" href="#id262" role="doc-backlink"><span class="section-number">5.4.2. </span>Constitutional AI</a><a class="headerlink" href="#constitutional-ai" title="Permalink to this heading">¶</a></h3>
-<p>Anthropic has developed Constitutional AI (CAI) <span id="id38">[<a class="reference internal" href="#id220" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span> as a novel approach to enhance the safety of large language models (LLMs). CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.</p>
+<h3><a class="toc-backref" href="#id265" role="doc-backlink"><span class="section-number">5.4.2. </span>Constitutional AI</a><a class="headerlink" href="#constitutional-ai" title="Permalink to this heading">¶</a></h3>
+<p>Anthropic has developed Constitutional AI (CAI) <span id="id38">[<a class="reference internal" href="#id223" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span> as a novel approach to enhance the safety of large language models (LLMs). CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.</p>
 <p>Here’s how Anthropic utilises CAI to promote LLM safety:</p>
 <ul class="simple">
 <li><p><strong>Minimising Harm Through Self-Critique:</strong>  Instead of relying solely on human feedback for training, Anthropic leverages the LLM’s own capabilities to critique and revise its outputs based on the principles enshrined in its constitution. This approach is termed “Reinforcement Learning from AI Feedback (RLAIF)”.</p></li>
@@ -679,15 +688,15 @@ <h3><a class="toc-backref" href="#id262" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="anthropic-cai-tradeoff">
 <a class="reference internal image-reference" href="../_images/cai.png"><img alt="Anthropic's Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness." src="../_images/cai.png" style="width: 70%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.8 </span><span class="caption-text">Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness <span id="id39">[<a class="reference internal" href="#id220" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span>.</span><a class="headerlink" href="#anthropic-cai-tradeoff" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.8 </span><span class="caption-text">Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness <span id="id39">[<a class="reference internal" href="#id223" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span>.</span><a class="headerlink" href="#anthropic-cai-tradeoff" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>Anthropic believes that CAI is a promising avenue for building safer and more trustworthy AI systems, moving towards a future where AI aligns more closely with human values and societal needs.</p>
 </section>
 <section id="explainable-ai-xai">
-<h3><a class="toc-backref" href="#id263" role="doc-backlink"><span class="section-number">5.4.3. </span>Explainable AI (XAI)</a><a class="headerlink" href="#explainable-ai-xai" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id266" role="doc-backlink"><span class="section-number">5.4.3. </span>Explainable AI (XAI)</a><a class="headerlink" href="#explainable-ai-xai" title="Permalink to this heading">¶</a></h3>
 <p>XAI techniques aim to make the decision-making processes of LLMs more transparent and understandable. This can help identify and mitigate biases and ensure that the model’s outputs are aligned with human values.</p>
-<p>XAI can contribute to LLM safety in multiple ways, including <span id="id40">[<a class="reference internal" href="#id219" title="Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.">Cambria <em>et al.</em>, 2024</a>]</span>:</p>
+<p>XAI can contribute to LLM safety in multiple ways, including <span id="id40">[<a class="reference internal" href="#id222" title="Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.">Cambria <em>et al.</em>, 2024</a>]</span>:</p>
 <ul class="simple">
 <li><p><strong>Identifying and Mitigating Bias:</strong> LLMs can inherit biases present in their vast training data, leading to unfair or discriminatory outputs.  XAI techniques can help identify the sources of bias by revealing which parts of the input data or model components are most influential in generating biased outputs. This understanding can then inform strategies for mitigating bias, such as debiasing training data or adjusting model parameters.</p></li>
 <li><p><strong>Detecting and Addressing Hallucinations:</strong> LLMs can generate outputs that sound plausible but are factually incorrect or nonsensical, a phenomenon known as “hallucination.”  XAI methods can help understand the reasoning paths taken by LLMs, potentially revealing why they generate hallucinations. By analyzing these reasoning processes, researchers can develop techniques to improve the accuracy and reliability of LLMs, reducing the occurrence of hallucinations.</p></li>
@@ -697,7 +706,7 @@ <h3><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="designing-a-safety-plan">
-<h2><a class="toc-backref" href="#id264" role="doc-backlink"><span class="section-number">5.5. </span>Designing a Safety Plan</a><a class="headerlink" href="#designing-a-safety-plan" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id267" role="doc-backlink"><span class="section-number">5.5. </span>Designing a Safety Plan</a><a class="headerlink" href="#designing-a-safety-plan" title="Permalink to this heading">¶</a></h2>
 <p>Building safe and reliable AI systems requires a comprehensive safety plan that addresses potential risks and establishes clear guidelines for development and deployment. This section outlines a structured approach to designing such a plan, breaking down the process into key phases from initial policy definition through implementation and monitoring as depicted in <a class="reference internal" href="#safety-plan"><span class="std std-numref">Fig. 5.9</span></a>.</p>
 <figure class="align-center" id="safety-plan">
 <a class="reference internal image-reference" href="../_images/design.svg"><img alt="Safety Plan Design Phases" src="../_images/design.svg" width="80%" /></a>
@@ -706,7 +715,7 @@ <h2><a class="toc-backref" href="#id264" role="doc-backlink"><span class="sectio
 </figcaption>
 </figure>
 <section id="phase-1-policy-definition">
-<h3><a class="toc-backref" href="#id265" role="doc-backlink"><span class="section-number">5.5.1. </span>Phase 1. Policy Definition</a><a class="headerlink" href="#phase-1-policy-definition" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id268" role="doc-backlink"><span class="section-number">5.5.1. </span>Phase 1. Policy Definition</a><a class="headerlink" href="#phase-1-policy-definition" title="Permalink to this heading">¶</a></h3>
 <p>When designing a safety plan, it is essential to consider establishing a policy that clarifies the definition of safety within the context of the company, its users, and stakeholders. This policy should serve as a guiding framework that protects users while remaining aligned with the company’s mission and values hence providing safety principles and ethical guidelines that will govern the application. Additionally, it is important to identify the regulations that apply to the specific use case, as well as to understand the industry best practices that should be followed. Finally, determining the organization’s risk tolerance is crucial in shaping the overall safety strategy.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -738,7 +747,7 @@ <h3><a class="toc-backref" href="#id265" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-2-user-research-risk-identification">
-<h3><a class="toc-backref" href="#id266" role="doc-backlink"><span class="section-number">5.5.2. </span>Phase 2. User Research &amp; Risk Identification</a><a class="headerlink" href="#phase-2-user-research-risk-identification" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id269" role="doc-backlink"><span class="section-number">5.5.2. </span>Phase 2. User Research &amp; Risk Identification</a><a class="headerlink" href="#phase-2-user-research-risk-identification" title="Permalink to this heading">¶</a></h3>
 <p>When considering user safety, it is essential to identify who the users are and understand their needs. Ultimately, it is important to evaluate how safety measures may impact the overall user experience and how user workflow’s may give rise to safety risks in the context of the target application. Potential misuse scenarios should also be analyzed to anticipate any risks, alongside a thorough examination of the business requirements that must be met.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -770,7 +779,7 @@ <h3><a class="toc-backref" href="#id266" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-3-evaluation-framework">
-<h3><a class="toc-backref" href="#id267" role="doc-backlink"><span class="section-number">5.5.3. </span>Phase 3. Evaluation Framework</a><a class="headerlink" href="#phase-3-evaluation-framework" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id270" role="doc-backlink"><span class="section-number">5.5.3. </span>Phase 3. Evaluation Framework</a><a class="headerlink" href="#phase-3-evaluation-framework" title="Permalink to this heading">¶</a></h3>
 <p>Key considerations in establishing an evaluation framework for safety include defining the metrics that will determine safety success, identifying the datasets that will be utilized for evaluation, and determining the relevant benchmarks that will guide the assessment process. Additionally, it is crucial to establish a method for measuring the trade-offs between safety and user experience, ensuring that both aspects are adequately addressed in the product development lifecycle.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -800,7 +809,7 @@ <h3><a class="toc-backref" href="#id267" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-4-safety-architecture-design">
-<h3><a class="toc-backref" href="#id268" role="doc-backlink"><span class="section-number">5.5.4. </span>Phase 4. Safety Architecture Design</a><a class="headerlink" href="#phase-4-safety-architecture-design" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id271" role="doc-backlink"><span class="section-number">5.5.4. </span>Phase 4. Safety Architecture Design</a><a class="headerlink" href="#phase-4-safety-architecture-design" title="Permalink to this heading">¶</a></h3>
 <p>When designing a safety architecture, it is essential to consider the integration of safety components into the overall system architecture. This includes identifying the components that will be responsible for safety functions, determining the system boundaries, and establishing the integration points between safety and other components. Additionally, it is crucial to consider the performance requirements and scalability needs of the safety system, ensuring that it can handle the expected load and maintain a high level of reliability.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -830,7 +839,7 @@ <h3><a class="toc-backref" href="#id268" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-5-implementation-tools-selection">
-<h3><a class="toc-backref" href="#id269" role="doc-backlink"><span class="section-number">5.5.5. </span>Phase 5. Implementation &amp; Tools Selection</a><a class="headerlink" href="#phase-5-implementation-tools-selection" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id272" role="doc-backlink"><span class="section-number">5.5.5. </span>Phase 5. Implementation &amp; Tools Selection</a><a class="headerlink" href="#phase-5-implementation-tools-selection" title="Permalink to this heading">¶</a></h3>
 <p>When selecting tools for implementation, it is crucial to consider the combination that best meets the specific needs of the project given business and safety requirements as well as the design of the safety architecture. Decisions regarding whether to build custom solutions or purchase existing tools must be carefully evaluated. Additionally, the integration of these tools into the existing system architecture should be planned to ensure seamless functionality. Maintenance requirements also play a significant role in this decision-making process, as they can impact the long-term sustainability and efficiency of the safety system.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -860,7 +869,7 @@ <h3><a class="toc-backref" href="#id269" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-6-go-to-market">
-<h3><a class="toc-backref" href="#id270" role="doc-backlink"><span class="section-number">5.5.6. </span>Phase 6. Go-to-Market</a><a class="headerlink" href="#phase-6-go-to-market" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id273" role="doc-backlink"><span class="section-number">5.5.6. </span>Phase 6. Go-to-Market</a><a class="headerlink" href="#phase-6-go-to-market" title="Permalink to this heading">¶</a></h3>
 <p>Monitoring safety performance is essential to ensure that the implemented measures are effective and responsive to emerging threats. Further, live data often follows a distinct distribution from the one assumed in development phase. This should be monitored in order to allow for re-evaluation of pre-launch assumptions as well as to retrofit live data into models in use if applicable for continued enhanced performance.</p>
 <p>Establishing clear incident response procedures is crucial for addressing any safety issues that may arise promptly and efficiently. Additionally, a robust strategy for handling updates must be in place to adapt to new challenges and improve system resilience, particularly when underlying LLM-based components often suffer from continuous updates.</p>
 <p><strong>Questions to Ask:</strong></p>
@@ -893,7 +902,7 @@ <h3><a class="toc-backref" href="#id270" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="common-pitfalls">
-<h3><a class="toc-backref" href="#id271" role="doc-backlink"><span class="section-number">5.5.7. </span>Common Pitfalls</a><a class="headerlink" href="#common-pitfalls" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id274" role="doc-backlink"><span class="section-number">5.5.7. </span>Common Pitfalls</a><a class="headerlink" href="#common-pitfalls" title="Permalink to this heading">¶</a></h3>
 <p><strong>Policy Neglect.</strong> A significant issue that arises when implementation begins without clear safety policies. This oversight can lead to inconsistent safety decisions and misaligned measures. A common consequence is having a “moving target”. Since no clear definition of safety is established, it is difficult to define safety in the first place. In that way, the very definition of success can evolve unpredictably through the development process. To mitigate this risk, it is essential to establish a comprehensive policy that serves as a guiding North Star for safety-related efforts.</p>
 <p><strong>Late Evals.</strong> Another common pitfall is late evaluation planning, which occurs when the design of the evaluation framework is postponed until after implementation. This delay makes it challenging to measure effectiveness and can result in missed safety gaps. To address this, the evaluation framework should be designed early in the process and integrated throughout the development cycle.</p>
 <p><strong>Weak Evals.</strong> It is common to begin with simple evaluations that focus on a single dimension of safety, and that’s a good approach: start simple, iterate, learn, improve. However, the real mistake occurs when these initial checks are not evolved throughout the development cycle. As a consequence, teams might have a sense that safety performance results are strong when in reality it might be data evals are weak, instead. Before moving to production, it is crucial to establish well-balanced datasets that represent safety risks in a nuanced manner better representing real-world user scenarios.</p>
@@ -903,12 +912,12 @@ <h3><a class="toc-backref" href="#id271" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="technical-implementation-components">
-<h2><a class="toc-backref" href="#id272" role="doc-backlink"><span class="section-number">5.6. </span>Technical Implementation Components</a><a class="headerlink" href="#technical-implementation-components" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id275" role="doc-backlink"><span class="section-number">5.6. </span>Technical Implementation Components</a><a class="headerlink" href="#technical-implementation-components" title="Permalink to this heading">¶</a></h2>
 <section id="benchmarks-datasets">
-<h3><a class="toc-backref" href="#id273" role="doc-backlink"><span class="section-number">5.6.1. </span>Benchmarks &amp; Datasets</a><a class="headerlink" href="#benchmarks-datasets" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id276" role="doc-backlink"><span class="section-number">5.6.1. </span>Benchmarks &amp; Datasets</a><a class="headerlink" href="#benchmarks-datasets" title="Permalink to this heading">¶</a></h3>
 <section id="salad-bench">
-<h4><a class="toc-backref" href="#id274" role="doc-backlink"><span class="section-number">5.6.1.1. </span>SALAD-Bench</a><a class="headerlink" href="#salad-bench" title="Permalink to this heading">¶</a></h4>
-<p>SALAD-Bench <span id="id41">[<a class="reference internal" href="#id221" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span> is a recently published benchmark designed for evaluating the safety of Large Language Models (LLMs). It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:</p>
+<h4><a class="toc-backref" href="#id277" role="doc-backlink"><span class="section-number">5.6.1.1. </span>SALAD-Bench</a><a class="headerlink" href="#salad-bench" title="Permalink to this heading">¶</a></h4>
+<p>SALAD-Bench <span id="id41">[<a class="reference internal" href="#id224" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span> is a recently published benchmark designed for evaluating the safety of Large Language Models (LLMs). It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:</p>
 <ul class="simple">
 <li><p><strong>Compact Taxonomy with Hierarchical Levels:</strong> It uses a structured, three-level hierarchy consisting of 6 domains, 16 tasks, and 66 categories for in-depth safety evaluation across specific dimensions. For instance,  Representation &amp; Toxicity Harms is divided into toxic content, unfair representation, and adult content. Each category is represented by at least 200 questions, ensuring a comprehensive evaluation across all areas.</p></li>
 <li><p><strong>Enhanced Difficulty and Complexity:</strong> It includes attack-enhanced questions generated using methods like human-designed prompts, red-teaming LLMs, and gradient-based methods, presenting a more stringent test of LLMs’ safety responses. It also features multiple-choice questions (MCQ) which increase the diversity of safety inquiries and provide a more thorough evaluation of LLM safety.</p></li>
@@ -919,10 +928,10 @@ <h4><a class="toc-backref" href="#id274" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="id43">
 <a class="reference internal image-reference" href="../_images/salad.png"><img alt="SALAD-Bench's compact taxonomy with hierarchical levels." src="../_images/salad.png" style="width: 70%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.10 </span><span class="caption-text">SALAD-Bench’s compact taxonomy with hierarchical levels <span id="id42">[<a class="reference internal" href="#id221" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id43" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.10 </span><span class="caption-text">SALAD-Bench’s compact taxonomy with hierarchical levels <span id="id42">[<a class="reference internal" href="#id224" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id43" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>The SALAD-Bench benchmark is accompanied by a Leaderboard <span id="id44">[<a class="reference internal" href="#id223" title="OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.">OpenSafetyLab, 2024</a>]</span> and a dataset available on Hugging Face <span id="id45">[<a class="reference internal" href="#id222" title="OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.">OpenSafetyLab, 2024</a>]</span>.</p>
+<p>The SALAD-Bench benchmark is accompanied by a Leaderboard <span id="id44">[<a class="reference internal" href="#id226" title="OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.">OpenSafetyLab, 2024</a>]</span> and a dataset available on Hugging Face <span id="id45">[<a class="reference internal" href="#id225" title="OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.">OpenSafetyLab, 2024</a>]</span>.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">SALAD_BENCH_DATASET</span> <span class="o">=</span> <span class="s2">&quot;OpenSafetyLab/Salad-Data&quot;</span>
@@ -934,7 +943,7 @@ <h4><a class="toc-backref" href="#id274" role="doc-backlink"><span class="sectio
 </div>
 </div>
 </div>
-<p>Each row in the dataset contains a question, an associated source, and hierarchical categories as proposed by SALAD-Bench. The question is a potentially harmful prompt to be evaluated, which has been aggregated by a source. An example of a source is “GPTFuzzer” <span id="id46">[<a class="reference internal" href="#id224" title="Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.">Yu <em>et al.</em>, 2024</a>]</span> which explores red teaming of large language models (LLMs) using auto-generated jailbreak prompts.</p>
+<p>Each row in the dataset contains a question, an associated source, and hierarchical categories as proposed by SALAD-Bench. The question is a potentially harmful prompt to be evaluated, which has been aggregated by a source. An example of a source is “GPTFuzzer” <span id="id46">[<a class="reference internal" href="#id227" title="Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.">Yu <em>et al.</em>, 2024</a>]</span> which explores red teaming of large language models (LLMs) using auto-generated jailbreak prompts.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">()))</span>
@@ -1040,7 +1049,7 @@ <h4><a class="toc-backref" href="#id274" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="truthfulqa">
-<h4><a class="toc-backref" href="#id275" role="doc-backlink"><span class="section-number">5.6.1.2. </span>TruthfulQA</a><a class="headerlink" href="#truthfulqa" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id278" role="doc-backlink"><span class="section-number">5.6.1.2. </span>TruthfulQA</a><a class="headerlink" href="#truthfulqa" title="Permalink to this heading">¶</a></h4>
 <p>TruthfulQA <span id="id47">[<a class="reference internal" href="#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span> is a benchmark designed to evaluate whether a language model is truthful in generating answers to questions. In its original version, it comprises 817 questions spanning 38 categories, including health, law, finance, and politics. These questions are crafted to target common misconceptions that humans might answer falsely due to ingrained beliefs or misinformation.</p>
 <p>TruthfulQA evaluates LLMs in two primary tasks (see <a class="reference internal" href="#truthqa"><span class="std std-numref">Fig. 5.11</span></a>):</p>
 <ul class="simple">
@@ -1134,8 +1143,8 @@ <h4><a class="toc-backref" href="#id275" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="harmbench">
-<h4><a class="toc-backref" href="#id276" role="doc-backlink"><span class="section-number">5.6.1.3. </span>HarmBench</a><a class="headerlink" href="#harmbench" title="Permalink to this heading">¶</a></h4>
-<p>HarmBench <span id="id49">[<a class="reference internal" href="#id227" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span> is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework <span id="id50">[<a class="reference internal" href="#id228" title="Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.">Center for AI Safety, 2024</a>]</span> that allows users to run two main types of evaluations:</p>
+<h4><a class="toc-backref" href="#id279" role="doc-backlink"><span class="section-number">5.6.1.3. </span>HarmBench</a><a class="headerlink" href="#harmbench" title="Permalink to this heading">¶</a></h4>
+<p>HarmBench <span id="id49">[<a class="reference internal" href="#id230" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span> is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework <span id="id50">[<a class="reference internal" href="#id231" title="Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.">Center for AI Safety, 2024</a>]</span> that allows users to run two main types of evaluations:</p>
 <ul class="simple">
 <li><p>Evaluating red teaming methods (attack methods) against a set of LLMs</p></li>
 <li><p>Evaluating LLMs against a set of red teaming methods</p></li>
@@ -1147,26 +1156,26 @@ <h4><a class="toc-backref" href="#id276" role="doc-backlink"><span class="sectio
 <li><p>Evaluating completions</p></li>
 </ul>
 <p>HarmBench primarily uses the Attack Success Rate (ASR) as its core metric. ASR measures the percentage of adversarial attempts that successfully elicit undesired behavior from the model. It also includes metrics for evaluating the effectiveness of different mitigation strategies, such as the Robust Refusal Dynamic Defense (R2D2).</p>
-<p>The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available <span id="id51">[<a class="reference internal" href="#id229" title="Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.">Center for AI Safety, 2024</a>]</span> to track performance of both language and multimodal models on safety benchmarks.</p>
+<p>The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available <span id="id51">[<a class="reference internal" href="#id232" title="Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.">Center for AI Safety, 2024</a>]</span> to track performance of both language and multimodal models on safety benchmarks.</p>
 <p>An interesting finding from HarmBench is that robustness is independent of model size which is in contrast to traditional benchmarks where larger models tend to perform better suggesting that training data and algorithms are far more important than model size in determining LLM robustness, emphasizing the importance of model-level defenses.</p>
 <figure class="align-center" id="id53">
 <a class="reference internal image-reference" href="../_images/harmbench.png"><img alt="Attack Success Rate (ASR) for different models." src="../_images/harmbench.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.12 </span><span class="caption-text">Attack Success Rate (ASR) for different models. HarmBench’s results suggest that  robustness is independent of model size <span id="id52">[<a class="reference internal" href="#id227" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id53" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.12 </span><span class="caption-text">Attack Success Rate (ASR) for different models. HarmBench’s results suggest that  robustness is independent of model size <span id="id52">[<a class="reference internal" href="#id230" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id53" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>HarmBench can be used by LLM developers to proactively identify and address potential vulnerabilities in their models before deployment. By automating the red teaming process, HarmBench allows for more efficient and scalable evaluation of LLM safety, enabling developers to test their models against a wider range of adversarial scenarios. This helps improve the robustness of LLMs and reduce the risk of malicious use.</p>
 </section>
 <section id="safebench">
-<h4><a class="toc-backref" href="#id277" role="doc-backlink"><span class="section-number">5.6.1.4. </span>SafeBench</a><a class="headerlink" href="#safebench" title="Permalink to this heading">¶</a></h4>
-<p>SafeBench <span id="id54">[<a class="reference internal" href="#id226" title="ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.">ML Safety Team, 2024</a>]</span> is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.</p>
+<h4><a class="toc-backref" href="#id280" role="doc-backlink"><span class="section-number">5.6.1.4. </span>SafeBench</a><a class="headerlink" href="#safebench" title="Permalink to this heading">¶</a></h4>
+<p>SafeBench <span id="id54">[<a class="reference internal" href="#id229" title="ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.">ML Safety Team, 2024</a>]</span> is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.</p>
 <p>The competition is a project of the Center for AI Safety, a non-profit research organization focused on reducing societal-scale risks from AI systems. The organization has previously developed benchmarks such as MMLU, the Weapons of Mass Destruction Proxy, and the out-of-distribution detection baseline.</p>
 <p>The goal of SafeBench is to define metrics that align with progress in addressing AI safety concerns. This is driven by the understanding that metrics play a crucial role in the field of machine learning (ML). Formalizing these metrics into benchmarks is essential for evaluating and predicting potential risks posed by AI models.</p>
 <p>The competition has outlined four categories where they would like to see benchmarks: Robustness, Monitoring, Alignment, and Safety Applications. For each of these categories, the organizers have provided examples os risks, for instance under the Robustness category is <strong>Jailbreaking Text and Multimodal Models</strong>. This focuses on improving defenses against adversarial attacks. A submitted benchmark then could tackle new and ideally unseen jailbreaking attacks and defenses.</p>
 </section>
 </section>
 <section id="tools-techniques">
-<h3><a class="toc-backref" href="#id278" role="doc-backlink"><span class="section-number">5.6.2. </span>Tools &amp; Techniques</a><a class="headerlink" href="#tools-techniques" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id281" role="doc-backlink"><span class="section-number">5.6.2. </span>Tools &amp; Techniques</a><a class="headerlink" href="#tools-techniques" title="Permalink to this heading">¶</a></h3>
 <p>The most straightforward approach to add a safety layer to LLM applications is to implement a separate filtering layer that screens both user prompts and LLM responses. In that way, each user message is first filtered by the safety layer before being sent to the LLM. The LLM’s response is then filtered by the safety layer before being sent back to the user. Assuming a scenario where most user messages are likely to be safe, a common design pattern to minimize latency is to send your moderation requests asynchronously along with the LLM application call as shown in <a class="reference internal" href="#safety-layer"><span class="std std-numref">Fig. 5.13</span></a>.</p>
 <figure class="align-center" id="safety-layer">
 <a class="reference internal image-reference" href="../_images/safety_layer.svg"><img alt="Safety Layer" src="../_images/safety_layer.svg" width="90%" /></a>
@@ -1204,8 +1213,8 @@ <h3><a class="toc-backref" href="#id278" role="doc-backlink"><span class="sectio
 </table>
 <p>There are several specialized commercial and open source tools that can be used to implement a filtering layer, which we can categorize into two types: Rules-Based and LLM-Based.</p>
 <section id="rules-based-safety-filtering">
-<h4><a class="toc-backref" href="#id279" role="doc-backlink"><span class="section-number">5.6.2.1. </span>Rules-Based Safety Filtering</a><a class="headerlink" href="#rules-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
-<p>Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard <span id="id55">[<a class="reference internal" href="#id236" title="ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.">ProtectAI, 2024</a>]</span>, AWS Comprehend <span id="id56">[<a class="reference internal" href="#id238" title="Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.">Amazon Web Services, 2024</a>]</span>, and NeMo Guardrails <span id="id57">[<a class="reference internal" href="#id237" title="NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.">NVIDIA, 2024</a>]</span> as detailed in <a class="reference internal" href="#safety-layer-tools"><span class="std std-numref">Table 5.2</span></a>.</p>
+<h4><a class="toc-backref" href="#id282" role="doc-backlink"><span class="section-number">5.6.2.1. </span>Rules-Based Safety Filtering</a><a class="headerlink" href="#rules-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
+<p>Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard <span id="id55">[<a class="reference internal" href="#id239" title="ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.">ProtectAI, 2024</a>]</span>, AWS Comprehend <span id="id56">[<a class="reference internal" href="#id241" title="Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.">Amazon Web Services, 2024</a>]</span>, and NeMo Guardrails <span id="id57">[<a class="reference internal" href="#id240" title="NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.">NVIDIA, 2024</a>]</span> as detailed in <a class="reference internal" href="#safety-layer-tools"><span class="std std-numref">Table 5.2</span></a>.</p>
 <table class="docutils align-default" id="safety-layer-tools">
 <caption><span class="caption-number">Table 5.2 </span><span class="caption-text">Rules-Based Safety Filtering Tools.</span><a class="headerlink" href="#safety-layer-tools" title="Permalink to this table">¶</a></caption>
 <thead>
@@ -1266,13 +1275,13 @@ <h4><a class="toc-backref" href="#id279" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="llm-based-safety-filtering">
-<h4><a class="toc-backref" href="#id280" role="doc-backlink"><span class="section-number">5.6.2.2. </span>LLM-Based Safety Filtering</a><a class="headerlink" href="#llm-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id283" role="doc-backlink"><span class="section-number">5.6.2.2. </span>LLM-Based Safety Filtering</a><a class="headerlink" href="#llm-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
 <p>Alternatively, an LLM-based component can be used as a content filter. Here, we observe three types os approaches:
 1. Moderation API,
 2. Fine-Tuned Open Source Models, and
 3. Custom Moderation.</p>
 <p>Model providers such as OpenAI, and Mistral offer moderation APIs that can be used to filter content. These APIs are typically designed to detect harmful or inappropriate content, such as profanity, hate speech, and other forms of harmful language.</p>
-<p>Mistral’s Moderation API <span id="id58">[<a class="reference internal" href="#id234" title="Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.">Mistral AI, 2024</a>]</span>, released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.</p>
+<p>Mistral’s Moderation API <span id="id58">[<a class="reference internal" href="#id237" title="Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.">Mistral AI, 2024</a>]</span>, released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Mistral&#39;s Moderation API - Raw Text</span>
 <span class="kn">import</span> <span class="nn">os</span>
 <span class="kn">from</span> <span class="nn">mistralai</span> <span class="kn">import</span> <span class="n">Mistral</span>
@@ -1308,7 +1317,7 @@ <h4><a class="toc-backref" href="#id280" role="doc-backlink"><span class="sectio
 <span class="nb">print</span><span class="p">(</span><span class="n">response</span><span class="p">)</span>
 </pre></div>
 </div>
-<p>OpenAI’s Moderation API <span id="id59">[<a class="reference internal" href="#id235" title="OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.">OpenAI, 2024</a>]</span> is free of use and can be accessed via the base model name <code class="docutils literal notranslate"><span class="pre">omni-moderation</span></code>. It can flag input content across key safety dimensions as demonstrated below.</p>
+<p>OpenAI’s Moderation API <span id="id59">[<a class="reference internal" href="#id238" title="OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.">OpenAI, 2024</a>]</span> is free of use and can be accessed via the base model name <code class="docutils literal notranslate"><span class="pre">omni-moderation</span></code>. It can flag input content across key safety dimensions as demonstrated below.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">dotenv</span> <span class="kn">import</span> <span class="n">load_dotenv</span>
@@ -1457,22 +1466,22 @@ <h4><a class="toc-backref" href="#id280" role="doc-backlink"><span class="sectio
 <li><p>S12: Sexual Content.</p></li>
 <li><p>S13: Elections.</p></li>
 </ul>
-<p><strong>IBM Granite Guardian</strong> is a new competitor to Llama Guard family. It is collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas <span id="id60">[<a class="reference internal" href="#id239" title="IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.">IBM, 2024</a>]</span>. The collection comprises two classes of models:</p>
+<p><strong>IBM Granite Guardian</strong> is a new competitor to Llama Guard family. It is collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas <span id="id60">[<a class="reference internal" href="#id242" title="IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.">IBM, 2024</a>]</span>. The collection comprises two classes of models:</p>
 <ol class="arabic simple">
 <li><p>Granite-Guardian-3.0-2B and Granite-Guardian-3.0-8B for detecting different forms of harmful content</p></li>
 <li><p>Granite Guardian HAP 38M and Granite Guardian HAP 125M for detecting toxic content.</p></li>
 </ol>
-<p>In a paper from December/2024 <span id="id61">[<a class="reference internal" href="#id240" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>, the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In <a class="reference internal" href="#granite"><span class="std std-numref">Fig. 5.14</span></a> we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.</p>
+<p>In a paper from December/2024 <span id="id61">[<a class="reference internal" href="#id243" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>, the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In <a class="reference internal" href="#granite"><span class="std std-numref">Fig. 5.14</span></a> we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.</p>
 <figure class="align-center" id="granite">
 <a class="reference internal image-reference" href="../_images/granite.png"><img alt="IBM Granite Guardian performance for the &quot;Harm&quot; risk dimension." src="../_images/granite.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.14 </span><span class="caption-text">IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension <span id="id62">[<a class="reference internal" href="#id240" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#granite" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 5.14 </span><span class="caption-text">IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension <span id="id62">[<a class="reference internal" href="#id243" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#granite" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The industry is increasingly focusing on the fine-tuning of pre-trained base models targeting a specific dimension of requirements and standards, here Safety being a critical one. This trend encompasses the release of open-source, fine-tuned safety models that can act as protective guardrails for LLM applications, as exemplified by LLaMa-Guard and IBM Granite Guardian. Additionally, there is a notable rise in models fine-tuned through techniques such as Reinforcement Learning from Human Feedback (RLHF), utilizing human preference datasets that incorporate safety considerations. These specialized models can function as safety filters as discussed but also as main models that alone could accomplished their original intended task but safely. We will cover this specific topic of preference-based alignment in the next Chapter <a class="reference internal" href="alignment.html#alignment"><span class="std std-ref">Preference-Based Alignment</span></a>, where we will explore the process of aligning language models with human preferences ultimately leading to the development of an open source fine-tuned model that complies with user provided policy-based requirements.</p>
 </section>
 <section id="custom-moderation">
-<h4><a class="toc-backref" href="#id281" role="doc-backlink"><span class="section-number">5.6.2.3. </span>Custom Moderation</a><a class="headerlink" href="#custom-moderation" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id284" role="doc-backlink"><span class="section-number">5.6.2.3. </span>Custom Moderation</a><a class="headerlink" href="#custom-moderation" title="Permalink to this heading">¶</a></h4>
 <p>We have covered filtering-based approaches using moderation APIs and fine-tuned open source models. Rather than relying on external filters, LLMs themselves can be guided to avoid harmful content through careful prompt engineering.</p>
 <p>Custom moderation offers a tailored content filtering approach, ensuring adherence to your own specific standards. As we have seen, each filtering-based approach we have discussed, while each having its own strengths, they all implement or enable safety according to a pre-defined dimension of requirements and standards. Custom moderation, on the other hand, provides greater control compared to general moderation APIs or fine-tuned open source models though it requires more setup and maintenance.</p>
 <p>A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in <a class="reference internal" href="#judge-safety"><span class="std std-numref">Fig. 5.15</span></a>. It a simple idea to use an LLM to judge the output of another LLM as well as user prompt in the context of your LLM-based application (please see <a class="reference external" href="https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html#model-based-evaluation">Section “Model Based Evaluation” - Chapter Evals</a> for design and best practices of LLM-based evals.)</p>
@@ -1549,17 +1558,17 @@ <h4><a class="toc-backref" href="#id281" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="case-study-implementing-a-safety-filter">
-<h2><a class="toc-backref" href="#id282" role="doc-backlink"><span class="section-number">5.7. </span>Case Study: Implementing a Safety Filter</a><a class="headerlink" href="#case-study-implementing-a-safety-filter" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id285" role="doc-backlink"><span class="section-number">5.7. </span>Case Study: Implementing a Safety Filter</a><a class="headerlink" href="#case-study-implementing-a-safety-filter" title="Permalink to this heading">¶</a></h2>
 <p>We will implement a basic safety filter for a K-12 application that will be used to filter content in a chat interface. The application will be designed to be used in a classroom setting where students and teachers can interact with the model to ask questions and receive answers. The safety filter will be designed to filter out harmful content such as profanity, hate speech, and other inappropriate content.</p>
 <p>In this stylized case study, we will limit our scope to the implementation of a safety filter for user prompts. We will not cover the implementation of the application itself or filtering the model’s output but rather focus on the user prompt safety filter. In real-world applications, an input policy would be paramount to better define what safety means before we identify associated risks and consecutive implementation decisions. Here, we will discuss the implementation of safety through the design of the evals dataset (you will later see, skipping policy will lead to trouble later in the case study!)</p>
 <section id="evals-dataset">
-<h3><a class="toc-backref" href="#id283" role="doc-backlink"><span class="section-number">5.7.1. </span>Evals Dataset</a><a class="headerlink" href="#evals-dataset" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id286" role="doc-backlink"><span class="section-number">5.7.1. </span>Evals Dataset</a><a class="headerlink" href="#evals-dataset" title="Permalink to this heading">¶</a></h3>
 <p>Creating a balanced evaluation dataset is crucial for developing robust safety measures. The dataset should be a well balanced set of “good” and “bad” samples to avoid biasing the model’s behavior in either direction.</p>
 <p>For this evaluation, we will create a dataset with <code class="docutils literal notranslate"><span class="pre">NUM_SAMPLES</span></code> examples, evenly split between good and bad samples (<code class="docutils literal notranslate"><span class="pre">GOOD_SAMPLES</span></code> and <code class="docutils literal notranslate"><span class="pre">BAD_SAMPLES</span></code>, respectively).</p>
-<p>The good samples will be sourced from the UltraFeedback Binarized dataset <span id="id63">[<a class="reference internal" href="#id162" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024z</a>]</span>, which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.</p>
+<p>The good samples will be sourced from the UltraFeedback Binarized dataset <span id="id63">[<a class="reference internal" href="#id165" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024z</a>]</span>, which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.</p>
 <p>The bad samples will come from two sources:</p>
 <ol class="arabic simple">
-<li><p>Profanity keywords from the Surge AI Profanity Dataset <span id="id64">[<a class="reference internal" href="#id231" title="Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.">Surge AI, 2024</a>]</span> - This provides examples of explicit inappropriate content.</p></li>
+<li><p>Profanity keywords from the Surge AI Profanity Dataset <span id="id64">[<a class="reference internal" href="#id234" title="Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.">Surge AI, 2024</a>]</span> - This provides examples of explicit inappropriate content.</p></li>
 <li><p>Prompts sourced from Salad-Bench - These represent more subtle forms of harmful content like scams, harassment, or dangerous instructions, hence not necessarily mentioning an inappropriate keywords but rather a potentially harmful instruction.</p></li>
 </ol>
 <p>This balanced approach helps ensure our safety measures can effectively identify explicit and nuanced harmful content while minimizing false positives across diverse real-world scenarios.</p>
@@ -1572,7 +1581,7 @@ <h3><a class="toc-backref" href="#id283" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <section id="bad-samples">
-<h4><a class="toc-backref" href="#id284" role="doc-backlink"><span class="section-number">5.7.1.1. </span>Bad Samples</a><a class="headerlink" href="#bad-samples" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id287" role="doc-backlink"><span class="section-number">5.7.1.1. </span>Bad Samples</a><a class="headerlink" href="#bad-samples" title="Permalink to this heading">¶</a></h4>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_profanity_samples</span><span class="p">(</span><span class="n">num_samples</span><span class="p">,</span> <span class="n">show_stats</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
@@ -1714,7 +1723,7 @@ <h4><a class="toc-backref" href="#id284" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="good-samples">
-<h4><a class="toc-backref" href="#id285" role="doc-backlink"><span class="section-number">5.7.1.2. </span>Good Samples</a><a class="headerlink" href="#good-samples" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id288" role="doc-backlink"><span class="section-number">5.7.1.2. </span>Good Samples</a><a class="headerlink" href="#good-samples" title="Permalink to this heading">¶</a></h4>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_good_samples</span><span class="p">(</span><span class="n">num_samples</span><span class="p">):</span>
@@ -1895,7 +1904,7 @@ <h4><a class="toc-backref" href="#id285" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="safety-filters">
-<h3><a class="toc-backref" href="#id286" role="doc-backlink"><span class="section-number">5.7.2. </span>Safety Filters</a><a class="headerlink" href="#safety-filters" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id289" role="doc-backlink"><span class="section-number">5.7.2. </span>Safety Filters</a><a class="headerlink" href="#safety-filters" title="Permalink to this heading">¶</a></h3>
 <p>We will implement four safety filters, one for each of the following:</p>
 <ol class="arabic simple">
 <li><p>LLM-Guard</p></li>
@@ -1961,7 +1970,7 @@ <h3><a class="toc-backref" href="#id286" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <section id="llm-guard">
-<h4><a class="toc-backref" href="#id287" role="doc-backlink"><span class="section-number">5.7.2.1. </span>LLM-Guard</a><a class="headerlink" href="#llm-guard" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id290" role="doc-backlink"><span class="section-number">5.7.2.1. </span>LLM-Guard</a><a class="headerlink" href="#llm-guard" title="Permalink to this heading">¶</a></h4>
 <p>Next, we implement a concrete validator using LLM Guard. The <code class="docutils literal notranslate"><span class="pre">LLMGuardValidator</span></code> class combines two key scanners:</p>
 <ul class="simple">
 <li><p>BanTopics: Flags content containing banned topics</p></li>
@@ -2054,7 +2063,7 @@ <h4><a class="toc-backref" href="#id287" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="mistral-moderation-api">
-<h4><a class="toc-backref" href="#id288" role="doc-backlink"><span class="section-number">5.7.2.2. </span>Mistral Moderation API</a><a class="headerlink" href="#mistral-moderation-api" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id291" role="doc-backlink"><span class="section-number">5.7.2.2. </span>Mistral Moderation API</a><a class="headerlink" href="#mistral-moderation-api" title="Permalink to this heading">¶</a></h4>
 <p>You will need a Mistral API key to use the Mistral Moderation API. You can get one by signing up for a Mistral account and creating an API key, which we will assume is stored in a local <code class="docutils literal notranslate"><span class="pre">.env</span></code> file under the <code class="docutils literal notranslate"><span class="pre">MISTRAL_API_KEY</span></code> variable.</p>
 <p>The <code class="docutils literal notranslate"><span class="pre">MistralValidator</span></code> class implements a safety validator using Mistral’s moderation API. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on Mistral moderation categories. Example:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">{</span><span class="s1">&#39;sexual&#39;</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
@@ -2134,7 +2143,7 @@ <h4><a class="toc-backref" href="#id288" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="openai-moderation-api">
-<h4><a class="toc-backref" href="#id289" role="doc-backlink"><span class="section-number">5.7.2.3. </span>OpenAI Moderation API</a><a class="headerlink" href="#openai-moderation-api" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id292" role="doc-backlink"><span class="section-number">5.7.2.3. </span>OpenAI Moderation API</a><a class="headerlink" href="#openai-moderation-api" title="Permalink to this heading">¶</a></h4>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">openai</span> <span class="kn">import</span> <span class="n">OpenAI</span>
@@ -2198,7 +2207,7 @@ <h4><a class="toc-backref" href="#id289" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="custom-judge-validator">
-<h4><a class="toc-backref" href="#id290" role="doc-backlink"><span class="section-number">5.7.2.4. </span>Custom Judge Validator</a><a class="headerlink" href="#custom-judge-validator" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id293" role="doc-backlink"><span class="section-number">5.7.2.4. </span>Custom Judge Validator</a><a class="headerlink" href="#custom-judge-validator" title="Permalink to this heading">¶</a></h4>
 <p>The <code class="docutils literal notranslate"><span class="pre">LLMJudgeValidator</span></code> class implements a safety validator using GPT-4o-mini. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on an input safety prompt.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -2283,7 +2292,7 @@ <h4><a class="toc-backref" href="#id290" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="benchmarking">
-<h3><a class="toc-backref" href="#id291" role="doc-backlink"><span class="section-number">5.7.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id294" role="doc-backlink"><span class="section-number">5.7.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
 <p>We are ready to run our four safety filters against our dataset. We will store validation results as well as elapsed time for each validator.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -2772,7 +2781,7 @@ <h3><a class="toc-backref" href="#id291" role="doc-backlink"><span class="sectio
 <p>Having said that, I want to be clear that further investigation is needed before one could claim that the dataset is unsafe. Here, we only show anecdotal evidence that the dataset contains unsafe content for our particular case study. We do not claim that the dataset is unsafe per se. Instead, a superior experiment would have constructed a proper dataset that more closely matches what safe conversations look like in the application domain we are studying.</p>
 </section>
 <section id="takeaways">
-<h3><a class="toc-backref" href="#id292" role="doc-backlink"><span class="section-number">5.7.4. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id295" role="doc-backlink"><span class="section-number">5.7.4. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
 <ul class="simple">
 <li><p>Safety is a complex problem and there is no one-size-fits-all solution.</p></li>
 <li><p>Starting with a well-aligned policy is key to developing a robust data and evaluation framework.</p></li>
@@ -2782,14 +2791,14 @@ <h3><a class="toc-backref" href="#id292" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id293" role="doc-backlink"><span class="section-number">5.8. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id296" role="doc-backlink"><span class="section-number">5.8. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>The rapid advancement of large language models has created an unsettling paradox: the same technologies that promise to revolutionize human-AI interaction also harbor significant risks that could undermine the very societies they aim to benefit. Our examination of various safety measures - from constitutional AI to red teaming - reveals that each approach has specific strengths and limitations when implemented in practice. However, instead of waiting for governments, organizations, and the public to catch up, we need to take action now.</p>
 <p>The case study on safety filters demonstrated the complexity of implementing even basic safety measures in real-world applications. What appears safe in one context may be inappropriate in another, and our current methods of safety evaluation often struggle with these nuances. The challenge of developing robust safety measures is further complicated by the potential for feedback loops in the training process - when models are fine-tuned on datasets that may contain hidden biases or problematic content.</p>
 <p>The path forward requires combining technical innovation with practical domain-specific wisdom. Safety in GenAI isn’t just a technical problem to be solved - it’s a mirror reflecting our own values, biases, and aspirations back at us. The growing focus on safety across the AI community, from open-source initiatives to corporate governance frameworks, provides a foundation for developing more robust safety measures. However, technologists working in isolation cannot solve these challenges - and may even perpetuate them unknowingly. Instead, domain experts across different verticals must come together to collaboratively define what safety means in the context of their specific users and broader society in work in collaboration with the AI community.</p>
 <p>Only through this cross-disciplinary collaboration can we move beyond the current uncertainty into a future where safety and innovation reinforce rather than oppose each other. This requires building bridges between technical experts, ethicists, policymakers, and the communities they serve to develop holistic frameworks that protect while enabling progress.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id294" role="doc-backlink"><span class="section-number">5.9. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id297" role="doc-backlink"><span class="section-number">5.9. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -2803,68 +2812,68 @@ <h2><a class="toc-backref" href="#id294" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id295" role="doc-backlink"><span class="section-number">5.10. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id298" role="doc-backlink"><span class="section-number">5.10. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id65">
-<div class="citation" id="id185" role="doc-biblioentry">
+<div class="citation" id="id188" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">AI24</a><span class="fn-bracket">]</span></span>
 <p>Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: <a class="reference external" href="https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/">https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/</a>.</p>
 </div>
-<div class="citation" id="id210" role="doc-biblioentry">
+<div class="citation" id="id213" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>ASA24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id33">1</a>,<a role="doc-backlink" href="#id34">2</a>)</span>
 <p>Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2409.08751">https://arxiv.org/abs/2409.08751</a>, <a class="reference external" href="https://arxiv.org/abs/2409.08751">arXiv:2409.08751</a>.</p>
 </div>
-<div class="citation" id="id220" role="doc-biblioentry">
+<div class="citation" id="id223" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>ABC+23<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id38">1</a>,<a role="doc-backlink" href="#id39">2</a>)</span>
 <p>Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: <a class="reference external" href="https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback">https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback</a>.</p>
 </div>
-<div class="citation" id="id204" role="doc-biblioentry">
+<div class="citation" id="id207" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id4">BHY+24</a><span class="fn-bracket">]</span></span>
 <p>Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. <em>Science</em>, 384(6698):842–845, 2024. URL: <a class="reference external" href="https://www.science.org/doi/abs/10.1126/science.adn0117">https://www.science.org/doi/abs/10.1126/science.adn0117</a>, <a class="reference external" href="https://arxiv.org/abs/https://www.science.org/doi/pdf/10.1126/science.adn0117">arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117</a>, <a class="reference external" href="https://doi.org/10.1126/science.adn0117">doi:10.1126/science.adn0117</a>.</p>
 </div>
-<div class="citation" id="id203" role="doc-biblioentry">
+<div class="citation" id="id206" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>BBC+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id8">1</a>,<a role="doc-backlink" href="#id16">2</a>)</span>
 <p>Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.23308">https://arxiv.org/abs/2410.23308</a>, <a class="reference external" href="https://arxiv.org/abs/2410.23308">arXiv:2410.23308</a>.</p>
 </div>
-<div class="citation" id="id200" role="doc-biblioentry">
+<div class="citation" id="id203" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>BMC+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id7">1</a>,<a role="doc-backlink" href="#id15">2</a>)</span>
 <p>Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2408.02946">https://arxiv.org/abs/2408.02946</a>, <a class="reference external" href="https://arxiv.org/abs/2408.02946">arXiv:2408.02946</a>.</p>
 </div>
-<div class="citation" id="id219" role="doc-biblioentry">
+<div class="citation" id="id222" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id40">CMM+24</a><span class="fn-bracket">]</span></span>
 <p>Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2407.15248">https://arxiv.org/abs/2407.15248</a>, <a class="reference external" href="https://arxiv.org/abs/2407.15248">arXiv:2407.15248</a>.</p>
 </div>
-<div class="citation" id="id206" role="doc-biblioentry">
+<div class="citation" id="id209" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>Edg24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id5">1</a>,<a role="doc-backlink" href="#id10">2</a>)</span>
 <p>Alec Edgington. How to exploit large language models for good or bad. <em>SIAM News</em>, 2024. URL: <a class="reference external" href="https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/">https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/</a>.</p>
 </div>
-<div class="citation" id="id208" role="doc-biblioentry">
+<div class="citation" id="id211" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>Exa24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id17">1</a>,<a role="doc-backlink" href="#id19">2</a>)</span>
 <p>Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: <a class="reference external" href="https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/">https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/</a>.</p>
 </div>
-<div class="citation" id="id201" role="doc-biblioentry">
+<div class="citation" id="id204" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">GRB+24</a><span class="fn-bracket">]</span></span>
 <p>Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2309.00770">https://arxiv.org/abs/2309.00770</a>, <a class="reference external" href="https://arxiv.org/abs/2309.00770">arXiv:2309.00770</a>.</p>
 </div>
-<div class="citation" id="id162" role="doc-biblioentry">
+<div class="citation" id="id165" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id63">H44z</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: <a class="reference external" href="https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized">https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized</a>.</p>
 </div>
-<div class="citation" id="id198" role="doc-biblioentry">
+<div class="citation" id="id201" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id1">HGP+22</a><span class="fn-bracket">]</span></span>
 <p>Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, <em>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</em>, 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: <a class="reference external" href="https://aclanthology.org/2022.acl-long.234">https://aclanthology.org/2022.acl-long.234</a>, <a class="reference external" href="https://doi.org/10.18653/v1/2022.acl-long.234">doi:10.18653/v1/2022.acl-long.234</a>.</p>
 </div>
-<div class="citation" id="id199" role="doc-biblioentry">
+<div class="citation" id="id202" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>HYM+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id6">1</a>,<a role="doc-backlink" href="#id12">2</a>)</span>
 <p>Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. <em>ACM Transactions on Information Systems</em>, November 2024. URL: <a class="reference external" href="http://dx.doi.org/10.1145/3703155">http://dx.doi.org/10.1145/3703155</a>, <a class="reference external" href="https://doi.org/10.1145/3703155">doi:10.1145/3703155</a>.</p>
 </div>
-<div class="citation" id="id221" role="doc-biblioentry">
+<div class="citation" id="id224" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>LDW+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id41">1</a>,<a role="doc-backlink" href="#id42">2</a>)</span>
 <p>Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2402.05044">https://arxiv.org/abs/2402.05044</a>, <a class="reference external" href="https://arxiv.org/abs/2402.05044">arXiv:2402.05044</a>.</p>
@@ -2874,30 +2883,30 @@ <h2><a class="toc-backref" href="#id295" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id47">1</a>,<a role="doc-backlink" href="#id48">2</a>)</span>
 <p>Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2109.07958">https://arxiv.org/abs/2109.07958</a>, <a class="reference external" href="https://arxiv.org/abs/2109.07958">arXiv:2109.07958</a>.</p>
 </div>
-<div class="citation" id="id227" role="doc-biblioentry">
+<div class="citation" id="id230" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>MPY+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id49">1</a>,<a role="doc-backlink" href="#id52">2</a>)</span>
 <p>Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2402.04249">https://arxiv.org/abs/2402.04249</a>, <a class="reference external" href="https://arxiv.org/abs/2402.04249">arXiv:2402.04249</a>.</p>
 </div>
-<div class="citation" id="id160" role="doc-biblioentry">
+<div class="citation" id="id163" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id31">MLC24</a><span class="fn-bracket">]</span></span>
 <p>MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: <a class="reference external" href="https://ailuminate.mlcommons.org/benchmarks/">https://ailuminate.mlcommons.org/benchmarks/</a>.</p>
 </div>
-<div class="citation" id="id197" role="doc-biblioentry">
+<div class="citation" id="id200" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id1">OAA+24</a><span class="fn-bracket">]</span></span>
 <p>OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2303.08774">https://arxiv.org/abs/2303.08774</a>, <a class="reference external" href="https://arxiv.org/abs/2303.08774">arXiv:2303.08774</a>.</p>
 </div>
-<div class="citation" id="id240" role="doc-biblioentry">
+<div class="citation" id="id243" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>PNC+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id61">1</a>,<a role="doc-backlink" href="#id62">2</a>)</span>
 <p>Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.07724">https://arxiv.org/abs/2412.07724</a>, <a class="reference external" href="https://arxiv.org/abs/2412.07724">arXiv:2412.07724</a>.</p>
 </div>
-<div class="citation" id="id218" role="doc-biblioentry">
+<div class="citation" id="id221" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>PHS+22<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id36">1</a>,<a role="doc-backlink" href="#id37">2</a>)</span>
 <p>Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2202.03286">https://arxiv.org/abs/2202.03286</a>, <a class="reference external" href="https://arxiv.org/abs/2202.03286">arXiv:2202.03286</a>.</p>
 </div>
-<div class="citation" id="id207" role="doc-biblioentry">
+<div class="citation" id="id210" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id9">SZW+24</a><span class="fn-bracket">]</span></span>
 <p>Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2406.12670">https://arxiv.org/abs/2406.12670</a>, <a class="reference external" href="https://arxiv.org/abs/2406.12670">arXiv:2406.12670</a>.</p>
 </div>
@@ -2906,110 +2915,110 @@ <h2><a class="toc-backref" href="#id295" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id29">1</a>,<a role="doc-backlink" href="#id32">2</a>)</span>
 <p>Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2404.12241">https://arxiv.org/abs/2404.12241</a>, <a class="reference external" href="https://arxiv.org/abs/2404.12241">arXiv:2404.12241</a>.</p>
 </div>
-<div class="citation" id="id196" role="doc-biblioentry">
+<div class="citation" id="id199" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>VSK+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id2">1</a>,<a role="doc-backlink" href="#id3">2</a>)</span>
 <p>Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2311.08370">https://arxiv.org/abs/2311.08370</a>, <a class="reference external" href="https://arxiv.org/abs/2311.08370">arXiv:2311.08370</a>.</p>
 </div>
-<div class="citation" id="id212" role="doc-biblioentry">
+<div class="citation" id="id215" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id35">WMR24</a><span class="fn-bracket">]</span></span>
 <p>Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? <em>Royal Society Open Science</em>, 11(8):240197, 2024. URL: <a class="reference external" href="https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197">https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197</a>, <a class="reference external" href="https://arxiv.org/abs/https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197">arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197</a>, <a class="reference external" href="https://doi.org/10.1098/rsos.240197">doi:10.1098/rsos.240197</a>.</p>
 </div>
-<div class="citation" id="id224" role="doc-biblioentry">
+<div class="citation" id="id227" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id46">YLX24</a><span class="fn-bracket">]</span></span>
 <p>Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: <a class="reference external" href="https://paperswithcode.com/dataset/gptfuzzer">https://paperswithcode.com/dataset/gptfuzzer</a>.</p>
 </div>
-<div class="citation" id="id202" role="doc-biblioentry">
+<div class="citation" id="id205" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id14">ZYY+24</a><span class="fn-bracket">]</span></span>
 <p>Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. &quot;ghost of the past&quot;: identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.14931">https://arxiv.org/abs/2410.14931</a>, <a class="reference external" href="https://arxiv.org/abs/2410.14931">arXiv:2410.14931</a>.</p>
 </div>
-<div class="citation" id="id205" role="doc-biblioentry">
+<div class="citation" id="id208" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id11">Zho24</a><span class="fn-bracket">]</span></span>
 <p>Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: <a class="reference external" href="https://huggingface.co/spaces/qinghua-zhou/stealth-edits">https://huggingface.co/spaces/qinghua-zhou/stealth-edits</a>.</p>
 </div>
-<div class="citation" id="id238" role="doc-biblioentry">
+<div class="citation" id="id241" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id56">AmazonWServices24</a><span class="fn-bracket">]</span></span>
 <p>Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: <a class="reference external" href="https://aws.amazon.com/comprehend/">https://aws.amazon.com/comprehend/</a>.</p>
 </div>
-<div class="citation" id="id216" role="doc-biblioentry">
+<div class="citation" id="id219" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id26">Anthropic24</a><span class="fn-bracket">]</span></span>
 <p>Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: <a class="reference external" href="https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf">https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf</a>.</p>
 </div>
-<div class="citation" id="id228" role="doc-biblioentry">
+<div class="citation" id="id231" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id50">CenterfASafety24a</a><span class="fn-bracket">]</span></span>
 <p>Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: <a class="reference external" href="https://github.com/centerforaisafety/HarmBench">https://github.com/centerforaisafety/HarmBench</a>.</p>
 </div>
-<div class="citation" id="id229" role="doc-biblioentry">
+<div class="citation" id="id232" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id51">CenterfASafety24b</a><span class="fn-bracket">]</span></span>
 <p>Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: <a class="reference external" href="https://www.harmbench.org/results">https://www.harmbench.org/results</a>.</p>
 </div>
-<div class="citation" id="id217" role="doc-biblioentry">
+<div class="citation" id="id220" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>DeepMind24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id27">1</a>,<a role="doc-backlink" href="#id28">2</a>)</span>
 <p>DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: <a class="reference external" href="https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf">https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf</a>.</p>
 </div>
-<div class="citation" id="id209" role="doc-biblioentry">
+<div class="citation" id="id212" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id19">EuropeanMAgency24</a><span class="fn-bracket">]</span></span>
 <p>European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: <a class="reference external" href="https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf">https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf</a>.</p>
 </div>
-<div class="citation" id="id194" role="doc-biblioentry">
+<div class="citation" id="id197" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id18">FinancialIRAuthority24</a><span class="fn-bracket">]</span></span>
 <p>Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: <a class="reference external" href="https://www.finra.org/rules-guidance/notices/24-09">https://www.finra.org/rules-guidance/notices/24-09</a>.</p>
 </div>
-<div class="citation" id="id239" role="doc-biblioentry">
+<div class="citation" id="id242" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id60">IBM24</a><span class="fn-bracket">]</span></span>
 <p>IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: <a class="reference external" href="https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas">https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas</a>.</p>
 </div>
-<div class="citation" id="id213" role="doc-biblioentry">
+<div class="citation" id="id216" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id22">LibraryoCongress23</a><span class="fn-bracket">]</span></span>
 <p>Library of Congress. China: generative ai measures finalized. July 2023. URL: <a class="reference external" href="https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/">https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/</a>.</p>
 </div>
-<div class="citation" id="id234" role="doc-biblioentry">
+<div class="citation" id="id237" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id58">MistralAI24</a><span class="fn-bracket">]</span></span>
 <p>Mistral AI. Mistral moderation: a technical report. 2024. URL: <a class="reference external" href="https://mistral.ai/news/mistral-moderation/">https://mistral.ai/news/mistral-moderation/</a>.</p>
 </div>
-<div class="citation" id="id226" role="doc-biblioentry">
+<div class="citation" id="id229" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id54">MLSTeam24</a><span class="fn-bracket">]</span></span>
 <p>ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: <a class="reference external" href="https://www.mlsafety.org/safebench">https://www.mlsafety.org/safebench</a>.</p>
 </div>
-<div class="citation" id="id214" role="doc-biblioentry">
+<div class="citation" id="id217" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id23">NationalIoSaTechnology24</a><span class="fn-bracket">]</span></span>
 <p>National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: <a class="reference external" href="https://www.nist.gov/itl/ai-risk-management-framework">https://www.nist.gov/itl/ai-risk-management-framework</a>.</p>
 </div>
-<div class="citation" id="id237" role="doc-biblioentry">
+<div class="citation" id="id240" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id57">NVIDIA24</a><span class="fn-bracket">]</span></span>
 <p>NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: <a class="reference external" href="https://github.com/NVIDIA/NeMo-Guardrails">https://github.com/NVIDIA/NeMo-Guardrails</a>.</p>
 </div>
-<div class="citation" id="id235" role="doc-biblioentry">
+<div class="citation" id="id238" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id59">OpenAI24a</a><span class="fn-bracket">]</span></span>
 <p>OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: <a class="reference external" href="https://platform.openai.com/docs/guides/moderation">https://platform.openai.com/docs/guides/moderation</a>.</p>
 </div>
-<div class="citation" id="id215" role="doc-biblioentry">
+<div class="citation" id="id218" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>OpenAI24b<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id24">1</a>,<a role="doc-backlink" href="#id25">2</a>)</span>
 <p>OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: <a class="reference external" href="https://cdn.openai.com/openai-preparedness-framework-beta.pdf">https://cdn.openai.com/openai-preparedness-framework-beta.pdf</a>.</p>
 </div>
-<div class="citation" id="id223" role="doc-biblioentry">
+<div class="citation" id="id226" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id44">OpenSafetyLab24a</a><span class="fn-bracket">]</span></span>
 <p>OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: <a class="reference external" href="https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard">https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard</a>.</p>
 </div>
-<div class="citation" id="id222" role="doc-biblioentry">
+<div class="citation" id="id225" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id45">OpenSafetyLab24b</a><span class="fn-bracket">]</span></span>
 <p>OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: <a class="reference external" href="https://huggingface.co/datasets/OpenSafetyLab/Salad-Data">https://huggingface.co/datasets/OpenSafetyLab/Salad-Data</a>.</p>
 </div>
-<div class="citation" id="id236" role="doc-biblioentry">
+<div class="citation" id="id239" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id55">ProtectAI24</a><span class="fn-bracket">]</span></span>
 <p>ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: <a class="reference external" href="https://github.com/protectai/llm-guard">https://github.com/protectai/llm-guard</a>.</p>
 </div>
-<div class="citation" id="id231" role="doc-biblioentry">
+<div class="citation" id="id234" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id64">SurgeAI24</a><span class="fn-bracket">]</span></span>
 <p>Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: <a class="reference external" href="https://github.com/surge-ai/profanity">https://github.com/surge-ai/profanity</a>.</p>
 </div>
-<div class="citation" id="id184" role="doc-biblioentry">
+<div class="citation" id="id187" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id21">UKGovernment24</a><span class="fn-bracket">]</span></span>
 <p>UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: <a class="reference external" href="https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper">https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper</a>.</p>
 </div>
-<div class="citation" id="id211" role="doc-biblioentry">
+<div class="citation" id="id214" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id20">UNICEF24</a><span class="fn-bracket">]</span></span>
 <p>UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: <a class="reference external" href="https://www.unicef.org/innocenti/reports/policy-guidance-ai-children">https://www.unicef.org/innocenti/reports/policy-guidance-ai-children</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/notebooks/structured_output.html b/tamingllms/_build/html/notebooks/structured_output.html
index df0d85c..3935cd2 100644
--- a/tamingllms/_build/html/notebooks/structured_output.html
+++ b/tamingllms/_build/html/notebooks/structured_output.html
@@ -208,6 +208,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
@@ -240,7 +249,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="structured-output">
-<span id="structure"></span><h1><a class="toc-backref" href="#id194" role="doc-backlink"><span class="section-number">4. </span>Structured Output</a><a class="headerlink" href="#structured-output" title="Permalink to this heading">¶</a></h1>
+<span id="structure"></span><h1><a class="toc-backref" href="#id197" role="doc-backlink"><span class="section-number">4. </span>Structured Output</a><a class="headerlink" href="#structured-output" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>In limits, there is freedom. Creativity thrives within structure.</p>
 <p class="attribution">—Julia B. Cameron</p>
@@ -248,41 +257,41 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#structured-output" id="id194">Structured Output</a></p>
+<li><p><a class="reference internal" href="#structured-output" id="id197">Structured Output</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id195">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#problem-statement" id="id196">Problem Statement</a></p></li>
-<li><p><a class="reference internal" href="#techniques" id="id197">Techniques</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id198">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#problem-statement" id="id199">Problem Statement</a></p></li>
+<li><p><a class="reference internal" href="#techniques" id="id200">Techniques</a></p>
 <ul>
-<li><p><a class="reference internal" href="#prompt-engineering" id="id198">Prompt Engineering</a></p></li>
-<li><p><a class="reference internal" href="#json-mode-fine-tuned" id="id199">JSON Mode (Fine-Tuned)</a></p></li>
-<li><p><a class="reference internal" href="#logit-post-processing" id="id200">Logit Post-Processing</a></p></li>
+<li><p><a class="reference internal" href="#prompt-engineering" id="id201">Prompt Engineering</a></p></li>
+<li><p><a class="reference internal" href="#json-mode-fine-tuned" id="id202">JSON Mode (Fine-Tuned)</a></p></li>
+<li><p><a class="reference internal" href="#logit-post-processing" id="id203">Logit Post-Processing</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#tools" id="id201">Tools</a></p>
+<li><p><a class="reference internal" href="#tools" id="id204">Tools</a></p>
 <ul>
-<li><p><a class="reference internal" href="#outlines" id="id202">Outlines</a></p></li>
-<li><p><a class="reference internal" href="#langchain" id="id203">LangChain</a></p></li>
-<li><p><a class="reference internal" href="#ollama" id="id204">Ollama</a></p></li>
+<li><p><a class="reference internal" href="#outlines" id="id205">Outlines</a></p></li>
+<li><p><a class="reference internal" href="#langchain" id="id206">LangChain</a></p></li>
+<li><p><a class="reference internal" href="#ollama" id="id207">Ollama</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#discussion" id="id205">Discussion</a></p>
+<li><p><a class="reference internal" href="#discussion" id="id208">Discussion</a></p>
 <ul>
-<li><p><a class="reference internal" href="#best-practices" id="id206">Best Practices</a></p></li>
-<li><p><a class="reference internal" href="#comparing-solutions" id="id207">Comparing Solutions</a></p></li>
-<li><p><a class="reference internal" href="#research-and-ongoing-debate" id="id208">Research and Ongoing Debate</a></p></li>
+<li><p><a class="reference internal" href="#best-practices" id="id209">Best Practices</a></p></li>
+<li><p><a class="reference internal" href="#comparing-solutions" id="id210">Comparing Solutions</a></p></li>
+<li><p><a class="reference internal" href="#research-and-ongoing-debate" id="id211">Research and Ongoing Debate</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id209">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#acknowledgements" id="id210">Acknowledgements</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id211">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id212">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id212">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#acknowledgements" id="id213">Acknowledgements</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id214">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id215">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id195" role="doc-backlink"><span class="section-number">4.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id198" role="doc-backlink"><span class="section-number">4.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>Language Models excel at generating human-like text, but they often struggle to produce output in a structured format, consistently. This poses a significant challenge when we need LLMs to generate data that can be easily processed by downstream systems, such as databases, APIs, or other software applications. Sometimes, even with a well-crafted prompt, an LLM might produce an unstructured response when a structured one is expected. This can be particularly challenging when integrating LLMs into systems that require specific data formats.</p>
 <p>What user needs drive the demand for LLM output constraints when building LLM-based applications? In a recent work by Google Research <span id="id1">[<a class="reference internal" href="#id51" title="Michael Xieyang Liu, Frederick Liu, Alexander J. Fiannaca, Terry Koo, Lucas Dixon, Michael Terry, and Carrie J. Cai. &quot;we need structured output&quot;: towards user-centered constraints on large language model output. In Extended Abstracts of the CHI Conference on Human Factors in Computing Systems, CHI EA '24. New York, NY, USA, 2024. Association for Computing Machinery. URL: https://doi.org/10.1145/3613905.3650756, doi:10.1145/3613905.3650756.">Liu <em>et al.</em>, 2024</a>]</span>, the authors explore the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. These needs can be broadly categorized as follows:</p>
 <p><strong>1. Improving Developer Efficiency and Workflow</strong></p>
@@ -306,7 +315,7 @@ <h2><a class="toc-backref" href="#id195" role="doc-backlink"><span class="sectio
 <p>It is important to emphasize that the ability to constrain LLM output is not just a technical consideration but a fundamental user need, impacting developer efficiency, user experience, and the overall success of LLM-powered applications.</p>
 </section>
 <section id="problem-statement">
-<h2><a class="toc-backref" href="#id196" role="doc-backlink"><span class="section-number">4.2. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id199" role="doc-backlink"><span class="section-number">4.2. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
 <p>Language models based on the Transformer architecture are next token prediction machines.
 These models calculate the probability of observing a token (from a vocabulary of size <span class="math notranslate nohighlight">\(n\)</span>) conditioned on the previous tokens in the sequence. This process can be expressed mathematically as:</p>
 <div class="math notranslate nohighlight">
@@ -326,7 +335,7 @@ <h2><a class="toc-backref" href="#id196" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="techniques">
-<h2><a class="toc-backref" href="#id197" role="doc-backlink"><span class="section-number">4.3. </span>Techniques</a><a class="headerlink" href="#techniques" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id200" role="doc-backlink"><span class="section-number">4.3. </span>Techniques</a><a class="headerlink" href="#techniques" title="Permalink to this heading">¶</a></h2>
 <p>There are many techniques to obtain structured output from LLMs <span id="id3">[<a class="reference internal" href="#id19" title="Xun Liang, Hanyu Wang, Yezhaohui Wang, Shichao Song, Jiawei Yang, Simin Niu, Jie Hu, Dan Liu, Shunyu Yao, Feiyu Xiong, and Zhiyu Li. Controllable text generation for large language models: a survey. 2024. URL: https://arxiv.org/abs/2408.12599, arXiv:2408.12599.">Liang <em>et al.</em>, 2024</a>]</span>. They can be broadly categorized into two types based on the phase they are applied to:</p>
 <ol class="arabic simple">
 <li><p><strong>Training-Time Techniques (TTT)</strong>: These techniques are applied during the training or post-training phases of the LLM. They are used to guide the model to learn the specific patterns and structures that are required for the task at hand.</p></li>
@@ -353,7 +362,7 @@ <h2><a class="toc-backref" href="#id197" role="doc-backlink"><span class="sectio
 </li>
 </ul>
 <section id="prompt-engineering">
-<h3><a class="toc-backref" href="#id198" role="doc-backlink"><span class="section-number">4.3.1. </span>Prompt Engineering</a><a class="headerlink" href="#prompt-engineering" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id201" role="doc-backlink"><span class="section-number">4.3.1. </span>Prompt Engineering</a><a class="headerlink" href="#prompt-engineering" title="Permalink to this heading">¶</a></h3>
 <p>In one-shot prompting, you provide a single example of the desired output format within the prompt.</p>
 <p>As a motivating example, consider the following simple task: Given a segment of a SEC financial filing, generate a two-person discussion about the key financial data from the text in JSON format, simulating what would be a real-world discussion about the underlying companies’ disclosed financial information. We would like to generate a structured output that can be easily parsed and integrated with other systems.</p>
 <div class="cell docutils container">
@@ -458,7 +467,7 @@ <h3><a class="toc-backref" href="#id198" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="json-mode-fine-tuned">
-<h3><a class="toc-backref" href="#id199" role="doc-backlink"><span class="section-number">4.3.2. </span>JSON Mode (Fine-Tuned)</a><a class="headerlink" href="#json-mode-fine-tuned" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id202" role="doc-backlink"><span class="section-number">4.3.2. </span>JSON Mode (Fine-Tuned)</a><a class="headerlink" href="#json-mode-fine-tuned" title="Permalink to this heading">¶</a></h3>
 <p>One-shot prompting is a simple technique that can lead to material improvements in structured output, though may not be sufficient for complex (e.g. nested) structures and / or when the model’s output needs to be restricted to a specific set of options or types.</p>
 <p>Some models offer so-called “JSON Mode” as an attempt to handle those challenges, which are a form of fine-tuning, hence while useful it is not guaranteed to work for all models.</p>
 <p>JSON mode is a feature provided by most LLM API providers, such as OpenAI, that allows the model to generate output in JSON format. This is particularly useful when you need structured data as a result, such as when parsing the output programmatically or integrating it with other systems that require JSON input. As depicted in <a class="reference internal" href="#json-mode"><span class="std std-numref">Fig. 4.1</span></a>, JSON mode is implemented by instructing the LLM model to use JSON as response format and optionally defining a target schema.</p>
@@ -585,7 +594,7 @@ <h3><a class="toc-backref" href="#id199" role="doc-backlink"><span class="sectio
 <p>This example solution is specific to OpenAI’s API. That begs the question: How can we solve this problem generally for widely available LLM providers? Enters logit post-processing.</p>
 </section>
 <section id="logit-post-processing">
-<h3><a class="toc-backref" href="#id200" role="doc-backlink"><span class="section-number">4.3.3. </span>Logit Post-Processing</a><a class="headerlink" href="#logit-post-processing" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id203" role="doc-backlink"><span class="section-number">4.3.3. </span>Logit Post-Processing</a><a class="headerlink" href="#logit-post-processing" title="Permalink to this heading">¶</a></h3>
 <p>Logit post-processing is a technique that involves modifying the logits of the LLM’s output before it is converted into text.</p>
 <p>The text generation process follows a probabilistic approach. At each step, the model calculates the probability distribution over its entire vocabulary to determine the most likely next token.</p>
 <p>Let’s examine how an LLM processes an example prompt “Is Enzo a good name for a baby?” as depicted in <a class="reference internal" href="#logit"><span class="std std-numref">Fig. 4.2</span></a>:</p>
@@ -617,6 +626,12 @@ <h3><a class="toc-backref" href="#id200" role="doc-backlink"><span class="sectio
 </pre></div>
 </div>
 </div>
+<div class="cell_output docutils container">
+<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can&#39;t initialize NVML
+  warnings.warn(&quot;Can&#39;t initialize NVML&quot;)
+</pre></div>
+</div>
+</div>
 </div>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -814,9 +829,9 @@ <h3><a class="toc-backref" href="#id200" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="tools">
-<h2><a class="toc-backref" href="#id201" role="doc-backlink"><span class="section-number">4.4. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id204" role="doc-backlink"><span class="section-number">4.4. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
 <section id="outlines">
-<h3><a class="toc-backref" href="#id202" role="doc-backlink"><span class="section-number">4.4.1. </span>Outlines</a><a class="headerlink" href="#outlines" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id205" role="doc-backlink"><span class="section-number">4.4.1. </span>Outlines</a><a class="headerlink" href="#outlines" title="Permalink to this heading">¶</a></h3>
 <p>Outlines <span id="id4">[<a class="reference internal" href="#id25" title="Outlines. Type-safe structured output from llms. https://dottxt-ai.github.io/outlines/latest/, 2024. Accessed: 2024.">Outlines, 2024</a>]</span> is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model’s output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options.</p>
 <p>The authors solve the general guided generation problem <span id="id5">[<a class="reference internal" href="#id74" title="Brandon T. Willard and Rémi Louf. Efficient guided generation for large language models. 2023. URL: https://arxiv.org/abs/2307.09702, arXiv:2307.09702.">Willard and Louf, 2023</a>]</span>, which as a consequence solves the problem of structured output generation, in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).</p>
 <p>They define the next token generation as a random variable:</p>
@@ -960,10 +975,10 @@ <h3><a class="toc-backref" href="#id202" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <p>We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. However, it is interesting to see that the model hallucinates a few entities, a phenomenon that is common for smaller Open Source models that were not fine-tuned on the task of entity extraction.</p>
-<p>You can also use Outlines with LangChain <span id="id7">[<a class="reference internal" href="#id101" title="LangChain. Outlines integration documentation. Online Documentation, 2024b. Documentation on integrating Outlines library with LangChain for structured generation. URL: https://python.langchain.com/docs/integrations/chat/outlines/.">LangChain, 2024b</a>]</span>.</p>
+<p>You can also use Outlines with LangChain <span id="id7">[<a class="reference internal" href="#id104" title="LangChain. Outlines integration documentation. Online Documentation, 2024b. Documentation on integrating Outlines library with LangChain for structured generation. URL: https://python.langchain.com/docs/integrations/chat/outlines/.">LangChain, 2024b</a>]</span>.</p>
 </section>
 <section id="langchain">
-<h3><a class="toc-backref" href="#id203" role="doc-backlink"><span class="section-number">4.4.2. </span>LangChain</a><a class="headerlink" href="#langchain" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id206" role="doc-backlink"><span class="section-number">4.4.2. </span>LangChain</a><a class="headerlink" href="#langchain" title="Permalink to this heading">¶</a></h3>
 <p>LangChain is a framework designed to simplify the development of LLM applications. It provider an abstraction layer over many LLM providers, including OpenAI, that offers several tools for parsing structured output.</p>
 <p>In particular, LangChain offers the <code class="docutils literal notranslate"><span class="pre">with_structured_output</span></code> method, which can be used with LLMs that support structured output APIs, allowing you to enforce a schema directly within the prompt.</p>
 <blockquote>
@@ -1021,7 +1036,7 @@ <h3><a class="toc-backref" href="#id203" role="doc-backlink"><span class="sectio
 <p>We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. A full list of models that support <code class="docutils literal notranslate"><span class="pre">.with_structured_output()</span></code> can be found <a class="reference external" href="https://python.langchain.com/docs/integrations/chat/#featured-providers">here</a>.</p>
 </section>
 <section id="ollama">
-<h3><a class="toc-backref" href="#id204" role="doc-backlink"><span class="section-number">4.4.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id207" role="doc-backlink"><span class="section-number">4.4.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h3>
 <p>Ollama is a popular tool that allows you to run large language models (LLMs) locally. It has recently added support for structured output generation. The current <code class="docutils literal notranslate"><span class="pre">ollama</span></code> implementation leverages llama.cpp GBNF (GGML BNF) grammars <span id="id8">[<a class="reference internal" href="#id49" title="Ggerganov. Llama.cpp grammars documentation. https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md, 2024. Accessed: 2024.">Ggerganov, 2024</a>]</span> to enable structured output generation.</p>
 <p>llama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It’s essentially an extension of BNF (Backus-Naur Form) <span id="id9">[<a class="reference internal" href="#id50" title="Wikipedia contributors. Backus naur form. https://en.wiktionary.org/wiki/Backus-Naur_form, 2024. Accessed: 2024.">Wikipedia contributors, 2024</a>]</span> with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model’s output strictly adheres to the desired format.</p>
 <p>Ollama first introduced structured output generation in version 0.5.1 providing support for JSON output but highlighting additional formats are coming soon.</p>
@@ -1119,9 +1134,9 @@ <h3><a class="toc-backref" href="#id204" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="discussion">
-<h2><a class="toc-backref" href="#id205" role="doc-backlink"><span class="section-number">4.5. </span>Discussion</a><a class="headerlink" href="#discussion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id208" role="doc-backlink"><span class="section-number">4.5. </span>Discussion</a><a class="headerlink" href="#discussion" title="Permalink to this heading">¶</a></h2>
 <section id="best-practices">
-<h3><a class="toc-backref" href="#id206" role="doc-backlink"><span class="section-number">4.5.1. </span>Best Practices</a><a class="headerlink" href="#best-practices" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id209" role="doc-backlink"><span class="section-number">4.5.1. </span>Best Practices</a><a class="headerlink" href="#best-practices" title="Permalink to this heading">¶</a></h3>
 <p>When implementing structured output with LLMs, it’s crucial to understand the distinction between different approaches. Some methods, like Outlines’ logit post-processing, provide mathematical guarantees that the output will conform to the specified structure. These contrast sharply with approaches like JSON mode, which rely on fine-tuned models or prompting that offer no formal guarantees. This distinction becomes particularly important in production environments where reliability and consistency are paramount. With that in mind, here are some best practices to consider when implementing structured output with LLMs:</p>
 <ul class="simple">
 <li><p><strong>Clear Schema Definition</strong>: Define the desired output structure clearly. This can be done in several ways including schemas, types, or Pydantic models as appropriate. This ensures the LLM knows exactly what format is expected.</p></li>
@@ -1131,7 +1146,7 @@ <h3><a class="toc-backref" href="#id206" role="doc-backlink"><span class="sectio
 <p>In summary, first one needs to clearly define the typed structure LLM applications will interface with, then determine whether strong guarantees are needed in order to determine tradeoffs between control and ease of implementation.</p>
 </section>
 <section id="comparing-solutions">
-<h3><a class="toc-backref" href="#id207" role="doc-backlink"><span class="section-number">4.5.2. </span>Comparing Solutions</a><a class="headerlink" href="#comparing-solutions" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id210" role="doc-backlink"><span class="section-number">4.5.2. </span>Comparing Solutions</a><a class="headerlink" href="#comparing-solutions" title="Permalink to this heading">¶</a></h3>
 <p>The choice of framework for structured LLM output depends heavily on specific constraints, requirements and use cases. LangChain is the most used LLM framework today with a large developer community base however its structured output support depends on the underlying LLM provider support. Ollama enables straightforward local deployment and experimentation democratizing access to LLMs while fostering privacy and control, however today it only offers JSON format with further formats to come. Outlines emerges as a solution with great flexibility and control over output structure while providing support for a wide range of LLMs. <a class="reference internal" href="#structured-output-frameworks"><span class="std std-numref">Table 4.1</span></a> provides a summary comparison of the different frameworks.</p>
 <table class="docutils align-default" id="structured-output-frameworks">
 <caption><span class="caption-number">Table 4.1 </span><span class="caption-text">Structured Output Frameworks Comparison</span><a class="headerlink" href="#structured-output-frameworks" title="Permalink to this table">¶</a></caption>
@@ -1175,10 +1190,10 @@ <h3><a class="toc-backref" href="#id207" role="doc-backlink"><span class="sectio
 </tr>
 </tbody>
 </table>
-<p>Other related tools not covered in this chapter worth mentioning include Guidance <span id="id10">[<a class="reference internal" href="#id95" title="Guidance AI. Guidance: language model programming. GitHub Repository, 2024. Framework for programming language models with structured templating and control flow. URL: https://github.com/guidance-ai/guidance.">Guidance AI, 2024</a>]</span> and NVIDIA’s Logits Processor Zoo <span id="id11">[<a class="reference internal" href="#id94" title="NVIDIA. Logits processor zoo. GitHub Repository, 2024a. Collection of logits processors for controlling language model generation. URL: https://github.com/NVIDIA/logits-processor-zoo.">NVIDIA, 2024a</a>]</span>.</p>
+<p>Other related tools not covered in this chapter worth mentioning include Guidance <span id="id10">[<a class="reference internal" href="#id96" title="Guidance AI. Guidance: language model programming. GitHub Repository, 2024. Framework for programming language models with structured templating and control flow. URL: https://github.com/guidance-ai/guidance.">Guidance AI, 2024</a>]</span> and NVIDIA’s Logits Processor Zoo <span id="id11">[<a class="reference internal" href="#id95" title="NVIDIA. Logits processor zoo. GitHub Repository, 2024a. Collection of logits processors for controlling language model generation. URL: https://github.com/NVIDIA/logits-processor-zoo.">NVIDIA, 2024a</a>]</span>.</p>
 </section>
 <section id="research-and-ongoing-debate">
-<h3><a class="toc-backref" href="#id208" role="doc-backlink"><span class="section-number">4.5.3. </span>Research and Ongoing Debate</a><a class="headerlink" href="#research-and-ongoing-debate" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id211" role="doc-backlink"><span class="section-number">4.5.3. </span>Research and Ongoing Debate</a><a class="headerlink" href="#research-and-ongoing-debate" title="Permalink to this heading">¶</a></h3>
 <p>The use of structured output for Large Language Models (LLMs) is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.</p>
 <p>There is some evidence indicating that LLMs may have bias in their handling of different output formats <span id="id12">[<a class="reference internal" href="#id52" title="Do Xuan Long, Hai Nguyen Ngoc, Tiviatis Sim, Hieu Dao, Shafiq Joty, Kenji Kawaguchi, Nancy F Chen, and Min-Yen Kan. Llms are biased towards output formats! systematically evaluating and mitigating output format bias of llms. arXiv preprint arXiv:2408.08656, 2024.">Long <em>et al.</em>, 2024</a>]</span>. The study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats.  The researchers attributed these biases to the models’ underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON’s prevalence in training data, highlighting how a format’s popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.</p>
 <p>Recent research “Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models” <span id="id13">[<a class="reference internal" href="#id26" title="Zhi Rui Tam, Cheng-Kuang Wu, Yi-Lin Tsai, Chieh-Yen Lin, Hung-yi Lee, and Yun-Nung Chen. Let me speak freely? a study on the impact of format restrictions on performance of large language models. 2024. URL: https://arxiv.org/abs/2408.02442, arXiv:2408.02442.">Tam <em>et al.</em>, 2024</a>]</span> suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence <span id="id14">[<a class="reference internal" href="#id28" title="Aider. Code in json: structured output for llms. https://aider.chat/2024/08/14/code-in-json.html, 2024. Accessed: 2024.">Aider, 2024</a>]</span> suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:</p>
@@ -1208,16 +1223,16 @@ <h3><a class="toc-backref" href="#id208" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id209" role="doc-backlink"><span class="section-number">4.6. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id212" role="doc-backlink"><span class="section-number">4.6. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights.</p>
 <p>Prompt engineering and the use of fine-tuned models can help control the output of LLMs. However, when strong guarantees are needed, practitioners should consider techniques such as logit post-processing either by manually adjusting the model’s output logits or using frameworks like Outlines that provider a higher level of control over the generation process.</p>
 </section>
 <section id="acknowledgements">
-<h2><a class="toc-backref" href="#id210" role="doc-backlink"><span class="section-number">4.7. </span>Acknowledgements</a><a class="headerlink" href="#acknowledgements" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id213" role="doc-backlink"><span class="section-number">4.7. </span>Acknowledgements</a><a class="headerlink" href="#acknowledgements" title="Permalink to this heading">¶</a></h2>
 <p>We would like to thank <a class="reference external" href="https://x.com/cameron_pfiffer">Cameron Pfiffer</a> from the .txt team for his insightful review and feedback.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id211" role="doc-backlink"><span class="section-number">4.8. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id214" role="doc-backlink"><span class="section-number">4.8. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -1231,7 +1246,7 @@ <h2><a class="toc-backref" href="#id211" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id212" role="doc-backlink"><span class="section-number">4.9. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id215" role="doc-backlink"><span class="section-number">4.9. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id17">
 <div class="citation" id="id28" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id14">Aid24</a><span class="fn-bracket">]</span></span>
@@ -1245,7 +1260,7 @@ <h2><a class="toc-backref" href="#id212" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id8">Gge24</a><span class="fn-bracket">]</span></span>
 <p>Ggerganov. Llama.cpp grammars documentation. <a class="reference external" href="https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md">https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md</a>, 2024. Accessed: 2024.</p>
 </div>
-<div class="citation" id="id101" role="doc-biblioentry">
+<div class="citation" id="id104" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id7">Lan4b</a><span class="fn-bracket">]</span></span>
 <p>LangChain. Outlines integration documentation. Online Documentation, 2024b. Documentation on integrating Outlines library with LangChain for structured generation. URL: <a class="reference external" href="https://python.langchain.com/docs/integrations/chat/outlines/">https://python.langchain.com/docs/integrations/chat/outlines/</a>.</p>
 </div>
@@ -1279,11 +1294,11 @@ <h2><a class="toc-backref" href="#id212" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id5">WL23</a><span class="fn-bracket">]</span></span>
 <p>Brandon T. Willard and Rémi Louf. Efficient guided generation for large language models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2307.09702">https://arxiv.org/abs/2307.09702</a>, <a class="reference external" href="https://arxiv.org/abs/2307.09702">arXiv:2307.09702</a>.</p>
 </div>
-<div class="citation" id="id95" role="doc-biblioentry">
+<div class="citation" id="id96" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id10">GuidanceAI24</a><span class="fn-bracket">]</span></span>
 <p>Guidance AI. Guidance: language model programming. GitHub Repository, 2024. Framework for programming language models with structured templating and control flow. URL: <a class="reference external" href="https://github.com/guidance-ai/guidance">https://github.com/guidance-ai/guidance</a>.</p>
 </div>
-<div class="citation" id="id94" role="doc-biblioentry">
+<div class="citation" id="id95" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id11">NVIDIA4a</a><span class="fn-bracket">]</span></span>
 <p>NVIDIA. Logits processor zoo. GitHub Repository, 2024a. Collection of logits processors for controlling language model generation. URL: <a class="reference external" href="https://github.com/NVIDIA/logits-processor-zoo">https://github.com/NVIDIA/logits-processor-zoo</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/objects.inv b/tamingllms/_build/html/objects.inv
index ff9298a..674fc33 100644
Binary files a/tamingllms/_build/html/objects.inv and b/tamingllms/_build/html/objects.inv differ
diff --git a/tamingllms/_build/html/search.html b/tamingllms/_build/html/search.html
index 64118c3..e274e05 100644
--- a/tamingllms/_build/html/search.html
+++ b/tamingllms/_build/html/search.html
@@ -197,6 +197,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="notebooks/cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
       </ul>
     </div>
   
diff --git a/tamingllms/_build/html/searchindex.js b/tamingllms/_build/html/searchindex.js
index 19f3588..70b2aeb 100644
--- a/tamingllms/_build/html/searchindex.js
+++ b/tamingllms/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/evals", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/evals.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["<span class=\"section-number\">2. </span>About the Book", "<span class=\"section-number\">1. </span>Preface", "Taming LLMs", "<span class=\"section-number\">6. </span>Preference-Based Alignment", "<span class=\"section-number\">3. </span>The Evals Gap", "<span class=\"section-number\">7. </span>Local LLMs in Practice", "<span class=\"section-number\">5. </span>Safety", "<span class=\"section-number\">4. </span>Structured Output"], "terms": {"am": [0, 6], "alwai": [0, 3, 4, 7], "do": [0, 3, 4, 5, 6, 7], "which": [0, 3, 4, 5, 6, 7], "cannot": [0, 3, 4, 5, 6], "order": [0, 3, 4, 6, 7], "mai": [0, 1, 3, 4, 5, 6, 7], "learn": [0, 3, 4, 5, 6, 7], "how": [0, 1, 3, 4, 5, 6, 7], "pablo": [0, 4], "picasso": 0, "In": [0, 3, 4, 5, 6, 7], "recent": [0, 3, 4, 5, 6, 7], "year": [0, 2, 3, 4, 5, 6, 7], "larg": [0, 1, 2, 3, 4, 5, 6, 7], "languag": [0, 1, 2, 4, 5, 6, 7], "model": [0, 1, 2, 6, 7], "llm": [0, 1, 3, 7], "have": [0, 1, 3, 4, 5, 6, 7], "emerg": [0, 3, 5, 6, 7], "transform": [0, 1, 3, 4, 5, 6, 7], "forc": [0, 4, 7], "technologi": [0, 1, 4, 5, 6], "promis": [0, 3, 4, 6], "revolution": [0, 6], "build": [0, 2, 3, 4, 5, 6, 7], "product": [0, 1, 2, 3, 4, 5, 6, 7], "interact": [0, 3, 4, 5, 6, 7], "comput": [0, 3, 4, 5, 6, 7], "from": [0, 1, 4, 5, 6, 7], "chatgpt": [0, 3, 5, 7], "github": [0, 2, 3, 4, 5, 6, 7], "copilot": 0, "claud": [0, 3, 4, 5, 6], "artifact": 0, "system": [0, 3, 4, 5, 6, 7], "captur": [0, 1, 3, 4, 5, 6], "public": [0, 3, 4, 5, 6], "imagin": [0, 5], "spark": 0, "gold": [0, 3, 4, 6], "rush": 0, "ai": [0, 3, 4, 5, 7], "power": [0, 2, 3, 4, 5, 6, 7], "applic": [0, 1, 2, 3, 5, 6, 7], "howev": [0, 3, 4, 5, 6, 7], "beneath": 0, "surfac": [0, 4], "technolog": [0, 1, 4, 6], "revolut": 0, "li": [0, 3, 4, 5, 6, 7], "complex": [0, 1, 3, 4, 5, 6, 7], "landscap": [0, 3, 4, 5], "practition": [0, 1, 4, 5, 7], "must": [0, 3, 4, 5, 6, 7], "navig": [0, 2, 4, 5, 6], "focus": [0, 3, 4, 5, 6, 7], "bring": [0, 3, 5], "awar": [0, 3, 4, 6], "limit": [0, 1, 2, 4, 5, 6, 7], "har": [0, 2, 4], "solut": [0, 2, 4, 5, 6], "overcom": [0, 4], "them": [0, 1, 3, 4, 5, 6, 7], "robust": [0, 3, 4, 5, 6, 7], "It": [0, 3, 4, 5, 6, 7], "offer": [0, 3, 4, 5, 6, 7], "critic": [0, 2, 3, 4, 5, 6, 7], "implement": [0, 2, 3, 4, 5, 7], "back": [0, 4, 5, 6, 7], "reproduc": [0, 1, 2, 4, 5], "exampl": [0, 1, 2, 3, 4, 5, 6, 7], "while": [0, 1, 2, 3, 4, 5, 6, 7], "mani": [0, 1, 3, 4, 5, 6, 7], "resourc": [0, 3, 4, 5, 6], "cover": [0, 3, 4, 5, 6, 7], "capabl": [0, 1, 2, 4, 5, 6, 7], "specif": [0, 3, 4, 5, 7], "hidden": [0, 3, 6], "pitfal": [0, 1, 3, 4, 5, 7], "engin": [0, 1, 2, 3, 4, 5, 6], "technic": [0, 1, 2, 3, 4, 5, 7], "manag": [0, 1, 4, 5, 6, 7], "face": [0, 3, 4, 5, 6], "when": [0, 1, 2, 3, 4, 5, 6, 7], "comprehens": [0, 2, 3, 4, 5, 6, 7], "guid": [0, 1, 3, 4, 5, 6, 7], "leverag": [0, 3, 4, 5, 6, 7], "battl": [0, 2], "test": [0, 2, 3, 5, 6, 7], "tool": [0, 1, 3], "throughout": [0, 4, 5, 6], "tackl": [0, 3, 4, 6], "follow": [0, 3, 4, 5, 6, 7], "non": [0, 3, 5, 6, 7], "exhaust": [0, 5], "list": [0, 3, 4, 5, 6, 7], "structur": [0, 3, 4, 5, 6], "un": 0, "reliabl": [0, 1, 3, 4, 5, 6, 7], "struggl": [0, 1, 3, 4, 5, 6, 7], "maintain": [0, 1, 3, 4, 5, 6, 7], "consist": [0, 1, 3, 4, 5, 6, 7], "output": [0, 1, 3, 4, 5, 6], "format": [0, 3, 4, 5, 6, 7], "complic": [0, 6], "integr": [0, 1, 3, 4, 5, 6, 7], "larger": [0, 3, 4, 5, 6, 7], "make": [0, 3, 4, 5, 6, 7], "error": [0, 3, 4, 6, 7], "handl": [0, 3, 4, 5, 6, 7], "more": [0, 1, 3, 4, 5, 6, 7], "size": [0, 3, 4, 5, 6, 7], "length": [0, 3, 4, 5, 7], "constraint": [0, 1, 3, 4, 5, 6, 7], "strict": [0, 5, 6, 7], "token": [0, 1, 3, 4, 5, 6, 7], "both": [0, 3, 4, 5, 6], "input": [0, 3, 4, 5, 6, 7], "requir": [0, 3, 5, 6, 7], "care": [0, 3, 4, 5, 6, 7], "chunk": [0, 3, 5], "strategi": [0, 3, 4, 5, 6, 7], "long": [0, 1, 3, 4, 5, 6, 7], "form": [0, 3, 4, 5, 6, 7], "effect": [0, 1, 3, 4, 6, 7], "tradit": [0, 3, 5, 6], "softwar": [0, 1, 3, 5, 6, 7], "methodologi": [0, 3, 4, 5, 6, 7], "break": [0, 1, 3, 4, 6], "down": [0, 1, 4, 5, 6], "deal": [0, 3, 5], "determinist": [0, 7], "gener": [0, 1, 5, 7], "new": [0, 2, 3, 4, 5, 6, 7], "hallucin": [0, 1, 3, 4, 6, 7], "These": [0, 3, 4, 5, 6, 7], "can": [0, 1, 3, 4, 5, 6, 7], "plausibl": [0, 6], "sound": [0, 6], "entir": [0, 4, 5, 7], "fabric": [0, 4, 6], "inform": [0, 3, 4, 5, 6, 7], "creat": [0, 1, 3, 4, 5, 6, 7], "signific": [0, 3, 4, 5, 6, 7], "risk": [0, 1, 3, 4, 5], "safeti": [0, 3, 4, 7], "align": [0, 4, 5, 6, 7], "harm": [0, 3, 4, 5], "bias": [0, 3, 4, 5, 6, 7], "inappropri": [0, 3, 6], "safeguard": [0, 4, 6], "monitor": [0, 3, 4, 5, 6], "ensur": [0, 3, 4, 5, 6, 7], "safe": [0, 3, 4, 6, 7], "deploy": [0, 3, 4, 6, 7], "cost": [0, 3, 4, 6, 7], "optim": [0, 1, 4, 5, 6], "The": [0, 1, 3, 6, 7], "financi": [0, 1, 3, 4, 6, 7], "oper": [0, 3, 4, 5, 6, 7], "base": [0, 1, 5, 7], "quickli": [0, 3, 5], "becom": [0, 3, 4, 5, 6, 7], "prohibit": [0, 3, 4, 5], "without": [0, 1, 3, 4, 5, 6, 7], "observ": [0, 3, 4, 5, 6, 7], "vendor": [0, 4, 5], "lock": [0, 3, 5], "cloud": [0, 3, 4, 5, 6, 7], "provid": [0, 2, 3, 4, 5, 6, 7], "depend": [0, 3, 4, 5, 7], "through": [0, 1, 2, 3, 4, 5, 6, 7], "proprietari": [0, 3, 5, 6, 7], "infrastructur": [0, 5], "difficult": [0, 3, 4, 6], "switch": [0, 5], "self": [0, 3, 4, 5, 6, 7], "host": [0, 4, 5, 6], "take": [0, 2, 3, 4, 5, 6, 7], "hand": [0, 5, 6, 7], "focu": [0, 2, 3, 4, 5, 6, 7], "access": [0, 3, 4, 5, 6, 7], "all": [0, 1, 3, 4, 5, 6, 7], "ar": [0, 1, 3, 4, 5, 6, 7], "fulli": [0, 3, 4, 6], "document": [0, 3, 4, 5, 6, 7], "allow": [0, 4, 5, 6, 7], "reader": [0, 2], "replic": [0, 4, 6, 7], "result": [0, 3, 4, 6, 7], "exactli": [0, 4, 7], "design": [0, 1, 3, 5, 7], "run": [0, 3, 4, 5, 6, 7], "consum": [0, 3, 4, 5, 6, 7], "grade": [0, 3, 4, 5, 6], "hardwar": [0, 3, 4], "expens": [0, 3, 4, 5, 6], "avail": [0, 3, 4, 5, 6, 7], "notebook": [0, 3, 7], "modifi": [0, 3, 4, 6, 7], "extend": [0, 3, 4, 5, 7], "built": [0, 4, 5, 6, 7], "us": [0, 1, 3, 5, 6, 7], "free": [0, 1, 3, 4, 5, 6], "everyon": [0, 4, 5], "minim": [0, 3, 4, 5, 6, 7], "framework": [0, 3, 4, 5], "wai": [0, 3, 4, 5, 6, 7], "priorit": [0, 3, 4, 5, 6], "transpar": [0, 3, 4, 5, 6], "visibl": [0, 4], "being": [0, 3, 4, 5, 6, 7], "better": [0, 2, 3, 4, 5, 6], "understand": [0, 1, 2, 3, 4, 5, 6, 7], "custom": [0, 3, 4, 7], "flexibl": [0, 4, 5, 6, 7], "adapt": [0, 3, 4, 5, 6], "case": [0, 4, 7], "unlik": [0, 3, 4, 5], "black": [0, 3], "box": [0, 5], "commerci": [0, 3, 4, 5, 6, 7], "most": [0, 3, 4, 5, 6, 7], "freeli": [0, 7], "foster": [0, 3, 4, 6, 7], "reduc": [0, 3, 4, 5, 6, 7], "independ": [0, 4, 6, 7], "freedom": [0, 5, 7], "architectur": [0, 3, 4, 5, 7], "decis": [0, 3, 4, 5, 6], "keep": [0, 3, 4, 5, 6], "principl": [0, 3, 4, 5, 6], "itself": [0, 3, 4, 5, 6], "live": [0, 1, 4, 6], "evolv": [0, 3, 4, 5, 6], "chang": [0, 3, 4, 5, 6, 7], "encourag": [0, 3, 4, 6, 7], "report": [0, 3, 4, 5, 6, 7], "suggest": [0, 3, 4, 5, 6, 7], "improv": [0, 3, 4, 5, 6, 7], "contribut": [0, 4, 5, 6], "via": [0, 3, 4, 5, 6, 7], "pull": [0, 5], "request": [0, 3, 4, 5, 6, 7], "share": [0, 3, 4, 5, 6, 7], "own": [0, 3, 4, 5, 6], "experi": [0, 3, 4, 5, 6, 7], "commun": [0, 3, 4, 6, 7], "propos": [0, 4, 6], "chapter": [0, 3, 4, 5, 6, 7], "section": [0, 3, 4, 5, 6, 7], "found": [0, 3, 4, 5, 7], "http": [0, 1, 2, 3, 4, 5, 6, 7], "com": [0, 2, 3, 4, 5, 6, 7], "souzatharsi": [0, 2, 3, 4, 5, 6, 7], "tamingllm": [0, 2, 3, 4, 5, 6, 7], "whether": [0, 3, 4, 5, 6, 7], "you": [0, 1, 3, 4, 5, 6, 7], "ve": [0, 5], "typo": [0, 6], "want": [0, 1, 3, 5, 6, 7], "welcom": 0, "look": [0, 2, 3, 4, 5, 6], "our": [0, 1, 3, 4, 5, 6, 7], "goal": [0, 1, 3, 4, 6, 7], "discourag": 0, "enabl": [0, 3, 4, 5, 6, 7], "By": [0, 1, 2, 3, 4, 6, 7], "upfront": [0, 2], "equip": [0, 2, 4, 6], "avoid": [0, 3, 4, 5, 6, 7], "current": [0, 2, 3, 4, 6, 7], "discours": [0, 2], "around": [0, 2, 3, 4, 5, 6, 7], "tend": [0, 2, 4, 6], "toward": [0, 3, 4, 6, 7], "extrem": [0, 3, 4, 6], "either": [0, 3, 4, 5, 6, 7], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 4], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7], "rather": [0, 1, 3, 4, 5, 6], "than": [0, 1, 3, 4, 5, 6, 7], "theoret": [0, 3], "examin": [0, 3, 4, 5, 6, 7], "first": [0, 1, 3, 4, 5, 6, 7], "everi": [0, 4, 6], "concept": [0, 3, 4, 6], "illustr": [0, 3, 4, 5, 6, 7], "execut": [0, 4, 5, 6], "immedi": [0, 3, 4, 5], "analysi": [0, 1, 3, 4, 5, 6], "balanc": [0, 3, 4, 5, 6, 7], "help": [0, 3, 4, 5, 6, 7], "intend": [0, 4, 5, 6], "develop": [0, 1, 3, 4, 5, 6, 7], "step": [0, 1, 3, 4, 5, 6, 7], "insight": [0, 3, 4, 5, 6, 7], "along": [0, 3, 4, 5, 6], "guidanc": [0, 3, 7], "could": [0, 1, 3, 4, 5, 6, 7], "derail": 0, "project": [0, 3, 4, 5, 6], "earli": [0, 3, 4, 6, 7], "befor": [0, 3, 4, 6, 7], "thei": [0, 1, 3, 4, 5, 6, 7], "costli": [0, 4, 6], "problem": [0, 1, 2, 3, 5, 6], "too": [0, 1, 3, 4, 5, 6], "late": [0, 3, 6], "lifecycl": [0, 5, 6], "lead": [0, 1, 3, 4, 5, 6, 7], "genai": [0, 1, 3, 6], "initi": [0, 1, 3, 4, 5, 6, 7], "leader": [0, 2, 4], "advoc": [0, 6], "anyon": [0, 6], "seek": [0, 4, 5, 6], "work": [0, 1, 3, 4, 5, 6, 7], "typic": [0, 3, 4, 5, 6, 7], "job": [0, 4, 5, 6], "role": [0, 3, 4, 5, 6, 7], "platform": [0, 4, 5, 6, 7], "backend": [0, 3, 4], "exist": [0, 3, 4, 5], "ml": [0, 6], "transit": [0, 4, 5, 7], "overse": 0, "motiv": [0, 3, 4, 7], "need": [0, 3, 4, 5, 6, 7], "readi": [0, 4, 6], "desir": [0, 3, 4, 7], "perform": [0, 3, 4, 6, 7], "after": [0, 1, 3, 4, 5, 6, 7], "read": [0, 3, 4, 6, 7], "implic": [0, 1, 3, 4, 6], "recommend": [0, 3, 4, 5, 6, 7], "abl": [0, 3, 4, 7], "deploi": [0, 3, 4, 5, 6], "proper": [0, 3, 5, 6, 7], "realist": [0, 3, 6], "effort": [0, 4, 5, 6, 7], "estim": [0, 4, 6], "impact": [0, 3, 4, 5, 6, 7], "timelin": 0, "To": [0, 3, 4, 5, 6, 7], "should": [0, 3, 4, 5, 6, 7], "basic": [0, 3, 4, 5, 6], "program": [0, 4, 5, 7], "knowledg": [0, 3, 4, 5, 6], "introductori": [0, 1, 2], "langchain": [0, 4], "e": [0, 1, 3, 4, 5, 6, 7], "g": [0, 3, 4, 5, 6, 7], "chat": [0, 3, 4, 5, 6, 7], "prompt": [0, 4, 6], "templat": [0, 4, 7], "openai": [0, 3, 4, 5, 7], "anthrop": [0, 3, 7], "similar": [0, 3, 4, 5, 7], "dive": 0, "here": [0, 2, 3, 4, 5, 6, 7], "get": [0, 3, 4, 5, 6, 7], "start": [0, 3, 4, 5, 6, 7], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6], "virtual": [0, 4], "m": [0, 3, 4, 5, 6, 7], "venv": 0, "tame": [0, 3, 4, 5, 6, 7], "env": [0, 3, 4, 6, 7], "bin": [0, 5], "On": [0, 4, 5, 7], "window": [0, 4, 5], "script": [0, 5], "try": [0, 1, 3, 4, 6, 7], "contain": [0, 3, 4, 5, 6, 7], "possibl": [0, 3, 4, 5, 6, 7], "includ": [0, 1, 3, 4, 5, 6, 7], "necessari": [0, 3, 4, 6], "instal": [0, 3, 4, 5, 7], "go": [0, 3, 4, 7], "feel": [0, 5], "prefer": [0, 4, 5, 6, 7], "packag": [0, 4, 5, 7], "pip": [0, 3, 4, 5, 7], "poetri": [0, 6], "file": [0, 3, 4, 5, 6, 7], "root": [0, 3], "directori": [0, 4, 5], "add": [0, 3, 4, 5, 6], "other": [0, 3, 4, 5, 6, 7], "sensit": [0, 3, 4, 5, 6], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 7], "commit": [0, 3, 4, 6], "version": [0, 3, 4, 5, 6, 7], "control": [0, 1, 3, 4, 5, 6, 7], "kept": [0, 4], "privat": [0, 4], "If": [0, 1, 3, 4, 5, 6, 7], "encount": [0, 2, 4, 6], "rate": [0, 3, 4, 5, 6], "consid": [0, 3, 4, 5, 6, 7], "smaller": [0, 3, 4, 5, 7], "retri": [0, 7], "logic": [0, 1, 3, 4, 6], "conflict": [0, 3, 4], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7], "check": [0, 4, 5, 6, 7], "page": [0, 4, 5], "known": [0, 4, 6, 7], "now": [0, 1, 3, 4, 5, 6, 7], "let": [0, 3, 4, 5, 6, 7], "begin": [0, 4, 5, 6, 7], "explor": [0, 1, 3, 4, 5, 6, 7], "dr": [0, 3], "tharsi": [0, 2, 3, 4, 5, 6, 7], "souza": [0, 2, 3, 4, 5, 6, 7], "scientist": [0, 1, 5, 6], "special": [0, 4, 5, 6, 7], "he": [0, 3, 4, 6], "lectur": 0, "columbia": 0, "univers": [0, 4, 5, 6], "master": [0, 5, 7], "scienc": [0, 3, 4, 6], "appli": [0, 3, 4, 5, 6, 7], "analyt": 0, "incom": [0, 4], "head": [0, 3, 4, 6, 7], "equiti": [0, 4], "citadel": 0, "former": [0, 1, 4, 5], "senior": [0, 4], "vp": 0, "two": [0, 3, 4, 5, 6, 7], "sigma": [0, 3], "invest": [0, 3, 4, 6], "also": [0, 3, 4, 5, 6, 7], "enjoi": 0, "mentor": 0, "under": [0, 3, 4, 5, 6, 7], "repres": [0, 3, 4, 5, 7], "student": [0, 3, 6], "profession": [0, 3, 4, 6, 7], "divers": [0, 3, 4, 6], "global": [0, 4, 6], "ecosystem": [0, 4, 5], "With": [0, 3, 4, 5, 6, 7], "over": [0, 2, 3, 4, 5, 6, 7], "15": [0, 4, 5, 6, 7], "deliv": [0, 4, 5], "across": [0, 1, 3, 4, 5, 6, 7], "startup": 0, "fortun": 0, "500": [0, 3, 4, 6], "compani": [0, 3, 4, 6, 7], "numer": [0, 4, 6, 7], "scholarli": 0, "frequent": [0, 4, 5, 7], "speaker": [0, 4], "academ": [0, 3, 4, 6], "busi": [0, 4, 5, 6], "confer": [0, 7], "ground": [0, 3, 4, 5], "background": [0, 1, 4, 5], "draw": [0, 3, 4, 6, 7], "scale": [0, 3, 4, 5, 6, 7], "stage": [0, 3, 6, 7], "major": [0, 3, 4, 6, 7], "institut": [0, 4, 6], "well": [0, 3, 4, 5, 6, 7], "advis": [0, 3], "profit": [0, 4, 6, 7], "organ": [0, 3, 4, 5], "uniqu": [0, 3, 4, 5, 6, 7], "bridg": [0, 5, 6], "gap": [0, 1, 3, 5, 6], "between": [0, 1, 3, 4, 5, 6, 7], "potenti": [0, 1, 3, 4, 5, 6, 7], "next": [0, 1, 3, 4, 5, 6, 7], "hold": [0, 3, 4], "ph": [0, 6], "d": [0, 3, 4, 5, 6, 7], "ucl": 0, "london": 0, "phil": [0, 6], "sc": 0, "b": [0, 4, 5, 6, 7], "tell": [1, 3, 6], "mere": [1, 4], "what": [1, 3, 4, 5, 6, 7], "someth": [1, 4, 5], "i": [1, 2, 4, 5, 6, 7], "emanuel": [1, 3, 4, 6], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7], "altern": [1, 3, 4, 5, 6], "titl": [1, 2, 3, 4, 5, 6, 7], "thi": [1, 2, 3, 4, 5, 6, 7], "book": [1, 4], "been": [1, 3, 4, 5, 6], "behav": 1, "badli": 1, "come": [1, 3, 4, 5, 6, 7], "notic": [1, 3, 4, 6], "parallel": [1, 3, 4, 5], "": [1, 3, 4, 5, 6, 7], "semin": [1, 6], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7], "caution": 1, "against": [1, 3, 4, 5, 6], "treat": [1, 4, 6], "perfect": [1, 4, 5], "represent": [1, 4, 5, 6], "realiti": [1, 6], "aim": [1, 3, 4, 5, 6, 7], "highlight": [1, 3, 4, 5, 6, 7], "practic": [1, 3, 4, 6], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 4, 5], "fail": [1, 3, 4, 6], "we": [1, 3, 4, 5, 6, 7], "mistak": [1, 6], "approxim": [1, 4, 7], "full": [1, 3, 4, 5, 6, 7], "assumpt": [1, 4, 6], "core": [1, 4, 5, 6], "premis": [1, 5], "hi": [1, 4, 6, 7], "aspect": [1, 3, 4, 6], "world": [1, 3, 4, 5, 6, 7], "inher": [1, 2, 3, 4, 6, 7], "involv": [1, 3, 4, 5, 6, 7], "simplif": 1, "argu": [1, 6, 7], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 4, 6], "partli": 1, "becaus": [1, 3, 4, 6], "peopl": [1, 3, 4, 5, 6], "put": [1, 4, 5], "much": [1, 3, 4, 5], "faith": 1, "mathemat": [1, 4, 5, 7], "recogn": [1, 3, 4, 6], "human": [1, 4, 5, 6, 7], "behavior": [1, 3, 4, 5, 6], "market": [1, 4, 5, 7], "dynam": [1, 3, 4, 6], "fact": [1, 3, 4, 6], "reason": [1, 3, 4, 5, 6, 7], "Their": [1, 4, 7], "respons": [1, 4, 5, 6, 7], "often": [1, 3, 4, 5, 6, 7], "convinc": [1, 3], "probabilist": [1, 4, 7], "train": [1, 4, 5, 6, 7], "data": [1, 4, 5, 6, 7], "true": [1, 3, 4, 6, 7], "even": [1, 3, 4, 5, 6, 7], "though": [1, 3, 4, 5, 6, 7], "insist": 1, "machin": [1, 3, 5, 6, 7], "todai": [1, 5, 7], "grow": [1, 3, 4, 5, 6, 7], "pervas": [1, 6], "belief": [1, 5, 6], "solv": [1, 3, 4, 5, 6, 7], "ani": [1, 3, 4, 5, 6, 7], "context": [1, 3, 4, 5, 6, 7], "content": 1, "wish": [1, 4], "user": [1, 4, 5, 7], "moreov": 1, "were": [1, 3, 4, 5, 6, 7], "predict": [1, 3, 4, 5, 6, 7], "chatbot": [1, 3, 4, 5, 6], "twist": [1, 6], "wrap": [1, 5, 7], "further": [1, 3, 4, 5, 6, 7], "daili": [1, 5, 6], "life": [1, 4, 5, 6], "workflow": [1, 4, 5, 6, 7], "affect": [1, 4, 5, 6], "decid": [1, 3, 4], "action": [1, 3, 4, 6], "coupl": [1, 5], "lack": [1, 3, 4, 6, 7], "pose": [1, 3, 4, 6, 7], "still": [1, 4, 5, 6], "figur": [1, 4, 5], "out": [1, 3, 4, 5, 6, 7], "serv": [1, 3, 4, 6, 7], "builder": [1, 5], "who": [1, 3, 4, 5, 6, 7], "remain": [1, 3, 4, 5, 6], "clear": [1, 3, 4, 5, 6, 7], "ei": 1, "about": [1, 3, 4, 5, 6, 7], "therefor": [1, 3, 4, 5, 6], "end": [1, 3, 4, 5, 6, 7], "detail": [1, 3, 4, 5, 6, 7], "python": [1, 2, 4, 5, 6, 7], "code": [1, 2, 3, 4, 5, 6, 7], "diminish": [1, 3, 4], "promot": [1, 3, 4, 6], "nuanc": [1, 3, 4, 5, 6, 7], "acknowledg": [1, 4, 6], "within": [1, 3, 4, 6, 7], "trustworthi": [1, 6], "taught": 1, "u": [1, 3, 4, 6, 7], "where": [1, 3, 4, 5, 6, 7], "der11": 1, "why": [1, 3, 4, 6, 7], "confus": [1, 6], "illus": 1, "disast": [1, 4], "wall": [1, 5], "street": [1, 5], "press": [1, 4, 5], "isbn": [1, 3, 4], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7], "googl": [1, 4, 5, 7], "co": [1, 3, 4, 5, 6], "uk": [1, 6], "id": [1, 4, 5, 6, 7], "lke_cwm4wm8c": 1, "sign": [2, 4, 6], "up": [2, 3, 4, 5, 6], "receiv": [2, 3, 4, 5, 6, 7], "updat": [2, 3, 4, 5, 6, 7], "abstract": [2, 4, 6, 7], "heavili": [2, 3, 4, 6, 7], "gloss": 2, "fundament": [2, 3, 4, 5, 6, 7], "challeng": [2, 3, 4, 5, 6, 7], "convers": [2, 3, 4, 5, 6, 7], "kei": [2, 3, 5, 6, 7], "proven": 2, "yet": [2, 3, 4, 6], "concret": [2, 6, 7], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7], "author": [2, 3, 4, 5, 6, 7], "t": [2, 3, 4, 5, 6, 7], "p": [2, 3, 4, 5, 6, 7], "2024": [2, 3, 4, 6, 7], "journal": [2, 3, 4, 5, 6, 7], "repositori": [2, 3, 4, 5, 6, 7], "valu": [3, 4, 5, 6, 7], "its": [3, 4, 5, 6, 7], "privileg": 3, "abov": [3, 4, 6], "soon": [3, 7], "lose": [3, 4], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7], "3": [3, 4, 5, 7], "5": [3, 4, 5, 7], "2022": [3, 4, 5, 6], "mark": [3, 4, 6], "pivot": [3, 4, 5], "moment": 3, "histori": [3, 4, 5], "artifici": [3, 4, 5, 6], "intellig": [3, 4, 5, 6], "five": [3, 4, 6], "dai": [3, 4, 5, 6, 7], "launch": [3, 4, 6], "attract": [3, 4], "million": [3, 4, 5], "month": [3, 4, 5, 6], "becam": 3, "fastest": [3, 4, 6], "100": [3, 4, 5, 6, 7], "monthli": [3, 4], "rais": [3, 4, 6], "intrigu": 3, "question": [3, 4, 5, 6, 7], "did": [3, 4, 7], "dramat": [3, 4, 5, 7], "predecessor": 3, "gpt": [3, 4, 5, 6, 7], "had": [3, 4], "same": [3, 4, 5, 6, 7], "number": [3, 4, 5, 6, 7], "paramet": [3, 4, 5, 6, 7], "far": [3, 5, 6], "less": [3, 4, 5, 6], "attent": [3, 5], "arguabl": [3, 5], "feedback": [3, 4, 6, 7], "abil": [3, 4, 5, 6, 7], "least": [3, 4, 6], "ey": 3, "breakthrough": [3, 6], "demonstr": [3, 4, 5, 6, 7], "crucial": [3, 5, 6, 7], "greater": [3, 4, 5, 6], "process": [3, 4, 5, 6], "modern": [3, 4, 7], "techniqu": [3, 4, 5], "direct": [3, 4, 5, 6], "rafailov": 3, "et": [3, 4, 5, 6, 7], "al": [3, 4, 5, 6, 7], "present": [3, 4, 5, 6, 7], "autom": [3, 4, 6, 7], "fashion": [3, 7], "open": [3, 4, 6, 7], "sourc": [3, 4, 6, 7], "common": [3, 4, 5, 7], "pre": [3, 4, 5, 6, 7], "default": [3, 4, 5, 6, 7], "state": [3, 4, 5, 6, 7], "art": [3, 4, 6], "object": [3, 4, 5, 6, 7], "given": [3, 4, 5, 6, 7], "webpag": 3, "internet": [3, 4], "veri": [3, 4, 5, 6], "ask": [3, 4, 5, 6, 7], "instruct": [3, 4, 5, 6, 7], "sai": [3, 7], "ouyang": [3, 6], "2": [3, 4, 7], "explain": [3, 4], "moon": 3, "land": [3, 4, 5], "6": [3, 4, 5], "old": [3, 4], "import": [3, 4, 5, 6, 7], "pipelin": [3, 4, 5, 6, 7], "pipe": [3, 6], "text": [3, 4, 5, 6, 7], "gpt2": [3, 4], "msg": 3, "short": [3, 4, 6, 7], "sentenc": [3, 4, 6], "_": [3, 4, 6, 7], "rang": [3, 4, 5, 6, 7], "len": [3, 4, 5, 6, 7], "print": [3, 4, 5, 6, 7], "f": [3, 4, 5, 6, 7], "n": [3, 4, 5, 6, 7], "1": [3, 4, 5, 7], "0": [3, 4, 5, 6, 7], "generated_text": [3, 7], "good": [3, 4, 5, 7], "idea": [3, 5, 6, 7], "one": [3, 4, 5, 6, 7], "those": [3, 4, 6, 7], "littl": [3, 4], "green": [3, 6], "dot": 3, "Then": [3, 4], "line": [3, 4, 5, 6], "later": [3, 4, 5, 6, 7], "re": [3, 4, 5, 6, 7], "alreadi": [3, 4, 7], "movi": 3, "theori": [3, 4], "some": [3, 4, 5, 6, 7], "mean": [3, 4, 5, 6, 7], "word": [3, 4, 7], "tepid": 3, "articl": [3, 4, 5, 6], "sure": [3, 4, 6, 7], "lunar": 3, "As": [3, 4, 5, 6, 7], "see": [3, 4, 5, 6, 7], "coher": [3, 4, 5, 7], "explan": [3, 4, 6, 7], "child": [3, 4, 6], "nonsens": [3, 6], "meander": 3, "unrel": [3, 4, 6], "topic": [3, 4, 5, 6, 7], "simpl": [3, 4, 5, 6, 7], "appropri": [3, 4, 5, 6, 7], "young": [3, 4, 6], "instead": [3, 4, 5, 6, 7], "address": [3, 4, 5, 6, 7], "issu": [3, 4, 6, 7], "introduc": [3, 4, 5, 6, 7], "rlhf": [3, 6, 7], "intent": [3, 6], "wide": [3, 4, 5, 6, 7], "task": [3, 6, 7], "fig": [3, 4, 5, 6, 7], "collect": [3, 4, 5, 6, 7], "sampl": [3, 5, 7], "label": [3, 4, 5, 6, 7], "comparison": 3, "reward": [3, 4, 5, 6], "sever": [3, 4, 5, 6, 7], "rank": [3, 4, 5, 6], "best": [3, 4, 5, 6], "worst": 3, "rm": [3, 5], "reinforc": [3, 4, 5, 6], "write": [3, 4, 5, 6, 7], "stori": [3, 6], "frog": 3, "calcul": [3, 4, 5, 6, 7], "score": [3, 4, 5, 6, 7], "ppo": [3, 5], "proxim": [3, 5], "iter": [3, 4, 5, 6, 7], "accur": [3, 4, 5, 6], "undesir": [3, 6], "simplifi": [3, 4, 5, 7], "view": [3, 4, 6], "show": [3, 4, 5, 6, 7], "progress": [3, 6], "pattern": [3, 4, 5, 6, 7], "ha": [3, 4, 5, 6, 7], "instanc": [3, 4, 5, 6], "directli": [3, 4, 5, 6, 7], "For": [3, 4, 5, 6, 7], "llama": [3, 4, 6, 7], "guard": 3, "team": [3, 4, 5, 7], "8b": [3, 5, 6, 7], "wa": [3, 4, 5, 6, 7], "classif": [3, 4, 5, 6, 7], "bypass": [3, 6], "similarli": [3, 4, 5, 6], "zephyr": 3, "7b": [3, 4, 5, 6, 7], "alpha": [3, 4, 7], "mistral": [3, 7], "publicli": [3, 4, 7], "assist": [3, 4, 5, 6, 7], "paper": [3, 4, 5, 6, 7], "compon": [3, 4, 5], "particular": [3, 4, 5, 6, 7], "foundat": [3, 4, 5, 6], "advanc": [3, 4, 5, 6, 7], "method": [3, 4, 6, 7], "strong": [3, 4, 5, 6, 7], "At": [3, 4, 5, 7], "high": [3, 4, 5, 6, 7], "level": [3, 4, 6, 7], "carefulli": [3, 4, 5, 6, 7], "curat": [3, 4, 5], "purpos": [3, 4, 5, 6, 7], "exhibit": [3, 4, 5, 6], "domain": [3, 4, 5, 6], "emploi": [3, 4, 6, 7], "prove": [3, 4, 6], "particularli": [3, 4, 5, 6, 7], "valuabl": [3, 4, 5, 7], "scenario": [3, 4, 5, 6, 7], "precis": [3, 4, 5, 6, 7], "style": [3, 4], "tone": 3, "expertis": [3, 4, 6], "medic": [3, 4, 5], "legal": [3, 4, 5, 6], "field": [3, 4, 5, 6, 7], "adher": [3, 4, 6, 7], "guidelin": [3, 4, 6], "servic": [3, 4, 5, 6], "standard": [3, 4, 5, 6], "approach": [3, 4, 5, 7], "each": [3, 4, 5, 6, 7], "distinct": [3, 4, 5, 6, 7], "advantag": [3, 4, 5, 6, 7], "weight": [3, 4, 5, 6, 7], "maximum": [3, 4, 5, 6], "lora": [3, 5, 6], "low": [3, 4, 5, 6, 7], "hu": [3, 6, 7], "2021": [3, 4], "small": [3, 4, 5, 7], "matric": 3, "effici": [3, 4, 5, 6, 7], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 6, 7], "combin": [3, 4, 5, 6, 7], "memori": [3, 4, 5, 6], "footprint": [3, 5], "modest": 3, "increas": [3, 4, 5, 6, 7], "likelihood": [3, 4, 6, 7], "obtain": [3, 4, 5, 6, 7], "probabl": [3, 4, 5, 7], "outcom": [3, 4, 6, 7], "hong": [3, 4], "unintend": [3, 6], "suboptim": 3, "seen": [3, 4, 6], "research": [3, 4, 5], "maxim": [3, 4], "shown": [3, 4, 5, 6], "alon": [3, 4, 5, 6], "gain": [3, 4, 5, 6], "achiev": [3, 4, 5, 6, 7], "bai": [3, 4, 6], "touvron": [3, 5], "sinc": [3, 4, 5, 6, 7], "main": [3, 4, 5, 6, 7], "categori": [3, 4, 5, 6, 7], "algorithm": [3, 4, 6], "meanwhil": [3, 5], "superior": [3, 4, 6], "benchmark": 3, "xu": [3, 4, 5, 6], "schulman": [3, 6], "2017": [3, 4], "popular": [3, 5, 7], "understood": 3, "set": [3, 4, 5, 6, 7], "rule": [3, 4, 5, 7], "govern": [3, 4], "reflect": [3, 4, 5, 6], "anoth": [3, 4, 5, 6], "adjust": [3, 4, 5, 6, 7], "One": [3, 4, 5, 6, 7], "strength": [3, 4, 5, 6], "2024c": [3, 5], "real": [3, 4, 5, 6, 7], "noisi": 3, "delai": [3, 4, 5, 6], "subsequ": [3, 7], "situat": [3, 4, 6], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7], "stabl": [3, 4], "prevent": [3, 4, 6, 7], "overreact": 3, "converg": 3, "due": [3, 4, 5, 6], "simplic": [3, 5], "award": [3, 4], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 6, 7], "4": [3, 4, 5, 7], "fit": [3, 4, 6, 7], "pair": [3, 4, 6], "rl": [3, 6], "find": [3, 4, 5, 6, 7], "contrast": [3, 4, 5, 6, 7], "satisfi": [3, 4], "implicit": [3, 4, 6], "whose": [3, 4], "correspond": [3, 4, 7], "extract": [3, 4, 5, 6, 7], "close": [3, 4, 5, 6], "compar": [3, 4, 5, 6], "assign": [3, 4, 5, 6, 7], "higher": [3, 4, 5, 7], "kl": [3, 5], "diverg": [3, 5], "origin": [3, 4, 5, 6, 7], "preserv": [3, 5, 6, 7], "defin": [3, 4, 5, 6, 7], "equat": 3, "mathcal": 3, "l": [3, 4], "pi_": 3, "theta": [3, 7], "ref": 3, "mathbb": [3, 7], "x": [3, 4, 5, 6, 7], "y_w": 3, "y_l": 3, "sim": [3, 7], "left": [3, 5], "log": [3, 4, 5], "beta": [3, 4, 6, 7], "underbrac": 3, "frac": [3, 5, 6], "color": [3, 4], "red": 3, "right": [3, 4, 5, 6], "respect": [3, 4, 5, 6], "deviat": [3, 4, 5, 6], "straightforward": [3, 4, 5, 6, 7], "librari": [3, 4, 5, 6, 7], "huggingfac": [3, 4, 5, 6], "trl": [3, 5, 6], "2024d": [3, 5], "suit": [3, 4, 6], "friendli": [3, 4, 5], "interfac": [3, 4, 5, 6, 7], "featur": [3, 4, 5, 6, 7], "distinguish": [3, 4, 6], "scalabl": [3, 4, 6], "doe": [3, 4, 5, 6, 7], "pretrain": [3, 4, 5], "hou": [3, 4, 5], "poor": [3, 4, 6], "return": [3, 4, 5, 6, 7], "addit": [3, 4, 5, 6, 7], "benefit": [3, 4, 5, 6, 7], "fix": [3, 4, 5, 6], "invers": 3, "trend": [3, 4, 6], "util": [3, 4, 5, 6], "rapid": [3, 4, 5, 6], "yield": [3, 4], "onli": [3, 4, 5, 6, 7], "margin": [3, 4, 6, 7], "capit": [3, 4, 7], "inaccuraci": [3, 4], "nois": 3, "dure": [3, 4, 5, 6, 7], "accuraci": [3, 4, 5, 6, 7], "lag": [3, 4, 6], "significantli": [3, 4, 5, 6], "indic": [3, 4, 5, 6, 7], "signal": [3, 6], "plateau": 3, "sophist": [3, 4, 5, 6], "previou": [3, 4, 5, 7], "deriv": [3, 4, 5], "pairwis": [3, 4], "feng": [3, 6], "substanti": [3, 4, 5, 6], "wors": [3, 5, 7], "influenc": [3, 4, 6, 7], "success": [3, 4, 5, 6, 7], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 5, 6], "loss": [3, 4, 5, 6], "gradient": [3, 4, 6], "dispref": 3, "unbalanc": 3, "trajectori": 3, "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6], "forward": [3, 4, 6], "futur": [3, 4, 5, 6], "phenomenon": [3, 6, 7], "degrad": [3, 4, 5, 6, 7], "danger": [3, 5, 6], "loop": [3, 4, 5, 6], "recurs": 3, "kazdan": 3, "qualiti": [3, 4, 5, 6, 7], "pollut": 3, "replac": [3, 4, 5], "amplif": 3, "reduct": [3, 4, 5], "express": [3, 4, 6, 7], "catastroph": [3, 6], "forget": [3, 7], "previous": [3, 4, 6, 7], "mitig": [3, 4, 5, 6, 7], "mix": [3, 4, 6, 7], "metric": [3, 5, 6], "sz\u00e9p": 3, "regular": [3, 4, 5, 6, 7], "relev": [3, 4, 5, 6], "scarc": 3, "behaviour": 3, "strateg": [3, 4, 5, 6, 7], "compli": [3, 4, 5, 6, 7], "modif": [3, 4, 5, 6], "outsid": [3, 4], "evidenc": 3, "landmark": 3, "askel": [3, 4, 6], "2024a": [3, 5, 7], "dec": 3, "explicitli": [3, 4, 5], "so": [3, 4, 6, 7], "might": [3, 4, 5, 6, 7], "pretend": 3, "adopt": [3, 4, 5, 6, 7], "actual": [3, 4, 5, 6, 7], "onc": [3, 4, 5, 6], "complet": [3, 4, 5, 6, 7], "describ": [3, 4, 5, 6], "harmless": [3, 6], "told": 3, "retrain": [3, 5], "queri": [3, 4], "tier": [3, 4, 6], "paid": [3, 4], "column": [3, 4, 6], "condit": [3, 4, 7], "toxic": [3, 5, 6], "excerpt": [3, 4, 5], "scratchpad": 3, "refus": [3, 6, 7], "happen": [3, 6], "bomb": [3, 6], "engag": [3, 4, 5, 6, 7], "intern": [3, 4, 6], "unmonitor": 3, "longer": [3, 4, 5], "believ": [3, 4, 5, 6, 7], "act": [3, 4, 5, 6, 7], "therebi": [3, 4], "reveal": [3, 4, 5, 6], "complianc": [3, 4, 5, 6], "phase": [3, 4, 5, 7], "natur": [3, 4, 5, 6, 7], "evid": [3, 4, 5, 6, 7], "seemingli": 3, "surpris": 3, "appear": [3, 4, 6, 7], "criteria": [3, 4, 6], "underli": [3, 4, 6, 7], "anim": [3, 6], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 4, 5, 6, 7], "explicit": [3, 4, 5, 6, 7], "chain": [3, 4], "thought": [3, 4, 5, 7], "opaqu": 3, "aris": [3, 4, 6], "opu": 3, "sonnet": [3, 4, 5], "wherea": [3, 4], "haiku": [3, 6], "persist": 3, "resist": [3, 4], "embed": [3, 4, 5], "doesn": [3, 4, 5, 7], "anti": [3, 4], "lab": 3, "exfiltr": [3, 6], "protect": [3, 4, 5, 6], "Not": [3, 4, 6], "malici": [3, 4, 6], "support": [3, 4, 6, 7], "concern": [3, 4, 5, 6], "mechan": [3, 4, 5, 6, 7], "insuffici": [3, 4], "don": [3, 4, 7], "concerningli": 3, "call": [3, 4, 5, 6, 7], "detect": [3, 4, 6, 7], "decept": [3, 4, 6], "warrant": [3, 6], "deeper": [3, 4], "scrutini": [3, 4, 6], "reli": [3, 4, 6, 7], "cross": [3, 4, 5, 6], "circular": 3, "bia": [3, 4, 6, 7], "truli": [3, 4, 5], "trust": [3, 4, 6, 7], "referenti": 3, "ly": 3, "hood": [3, 7], "deep": [3, 4, 6, 7], "mechanist": 3, "drive": [3, 6, 7], "correl": [3, 4, 5], "miss": [3, 4, 6], "confound": 3, "factor": [3, 4, 5, 7], "establish": [3, 4, 5, 6], "attempt": [3, 4, 6, 7], "causal": [3, 4], "heavi": 3, "relianc": [3, 4, 6], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 4], "henc": [3, 4, 5, 6, 7], "agenc": [3, 4, 6], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 4], "failur": [3, 4, 6], "mode": [3, 5, 6], "map": [3, 4, 5, 7], "cleanli": 3, "analogi": 3, "excel": [3, 4, 5, 6, 7], "review": [3, 4, 5, 6, 7], "prof": 3, "jacob": [3, 4, 5, 6], "andrea": [3, 4, 6], "yoshua": [3, 6], "bengio": [3, 6], "jasjeet": 3, "sekhon": 3, "rohin": 3, "shah": 3, "2024b": [3, 5, 7], "assum": [3, 4, 6], "acm": [3, 6], "inc": [3, 4, 7], "dedic": [3, 4, 5, 6], "democrat": [3, 4, 7], "educ": [3, 4, 6], "k": [3, 4, 6, 7], "12": [3, 4, 5, 6], "name": [3, 4, 5, 6, 7], "smolk": 3, "ll": [3, 4, 5], "walk": 3, "measur": [3, 4, 5, 6], "huggingfacetb": [3, 7], "360m": [3, 4, 5], "compact": [3, 4, 5, 6], "part": [3, 4, 6, 7], "famili": [3, 6, 7], "publish": [3, 6, 7], "api": [3, 4, 5, 7], "local": [3, 4, 6, 7], "infer": [3, 4, 5, 6, 7], "remot": [3, 4], "load": [3, 4, 5, 6, 7], "store": [3, 4, 6], "eventu": [3, 4, 5], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 6], "worth": [3, 4, 5, 7], "choic": [3, 4, 5, 6, 7], "lightweight": [3, 4, 5, 7], "suitabl": [3, 4, 6], "devic": [3, 4, 5, 7], "Its": [3, 4, 5], "candid": [3, 4, 5], "said": [3, 4, 6], "necessarili": [3, 4, 5, 6], "par": [3, 4], "mind": [3, 4, 5, 6, 7], "factual": [3, 4, 5, 6], "inconsist": [3, 4, 6], "guardrail": [3, 6], "articul": 3, "uphold": [3, 6], "employe": [3, 4], "stakehold": [3, 4, 6], "expect": [3, 4, 5, 6, 7], "regard": [3, 4, 5, 6], "ethic": [3, 4, 5, 6], "conduct": [3, 4], "social": [3, 4, 6], "mission": [3, 6], "vision": [3, 4, 5, 6], "cultur": [3, 4, 5, 6], "account": [3, 4, 6], "codifi": 3, "mlcommon": 3, "vidgen": [3, 6], "encompass": [3, 6, 7], "seven": 3, "hazard": [3, 4, 6], "violent": [3, 6], "crime": [3, 6], "sex": [3, 6], "relat": [3, 4, 5, 6, 7], "sexual": [3, 6], "exploit": [3, 4, 6], "indiscrimin": [3, 6], "weapon": [3, 6], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 4], "explos": [3, 6], "cbrne": 3, "suicid": [3, 6], "hate": [3, 6], "speech": [3, 6], "below": [3, 4, 5, 6, 7], "markdown": [3, 4, 5, 6], "written": [3, 4], "english": 3, "o": [3, 4, 6, 7], "ipython": [3, 4, 6], "displai": [3, 4, 6, 7], "def": [3, 4, 6, 7], "load_polici": 3, "policy_path": 3, "path": [3, 4, 5, 6], "join": [3, 4, 6], "genai_polici": 3, "md": [3, 4, 5, 6, 7], "r": [3, 4, 5, 6, 7], "policy_cont": 3, "classroom": [3, 6], "accept": [3, 4, 5, 6], "unaccept": [3, 5], "ag": [3, 4, 6], "subject": [3, 4, 5], "posit": [3, 4, 5, 6, 7], "confid": [3, 4], "inclus": [3, 4, 6, 7], "celebr": 3, "definit": [3, 4, 7], "creativ": [3, 4, 5, 7], "math": [3, 4, 5], "tip": [3, 6], "digit": [3, 4], "literaci": 3, "onlin": [3, 4, 5, 6, 7], "histor": [3, 4], "violenc": [3, 6], "physic": [3, 4, 6], "fight": [3, 6], "crimin": [3, 6], "illeg": [3, 6], "glorifi": [3, 6], "person": [3, 4, 5, 6, 7], "eat": [3, 6], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 4, 6], "discriminatori": [3, 6], "bulli": [3, 6], "harass": [3, 4, 6], "target": [3, 4, 5, 6, 7], "group": [3, 4, 5, 6], "religi": [3, 5, 6], "racial": [3, 4, 6], "ethnic": [3, 6], "gender": [3, 4, 6], "discrimin": [3, 4, 6], "adult": [3, 6], "profan": [3, 6], "relationship": [3, 4], "substanc": [3, 4], "drug": [3, 6], "gambl": 3, "bet": 3, "protocol": [3, 4, 6], "redirect": 3, "alert": 3, "record": [3, 4, 5, 6], "audit": [3, 4], "teacher": [3, 6], "parent": [3, 6], "continu": [3, 4, 5, 6, 7], "construct": [3, 4, 5, 6, 7], "compliant": [3, 6], "violat": [3, 4, 6], "intens": [3, 4, 7], "demand": [3, 4, 5, 6, 7], "especi": [3, 4, 5, 6, 7], "dong": [3, 4, 6], "There": [3, 4, 5, 6, 7], "rlaif": [3, 6], "give": [3, 4, 6], "rise": [3, 6], "kim": [3, 4, 6], "meta": [3, 4, 5, 6], "wu": [3, 4, 6, 7], "scheme": [3, 5], "inspir": [3, 6], "schema": [3, 7], "row": [3, 4, 6], "match": [3, 4, 5, 6, 7], "boundari": [3, 4, 6], "craft": [3, 4, 6, 7], "elicit": [3, 6, 7], "unalign": 3, "panda": [3, 4, 6], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 4, 6], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 5], "class": [3, 4, 6, 7], "userpromptgener": 3, "pd": [3, 4, 6], "pydant": [3, 4, 6, 7], "basemodel": [3, 4, 6, 7], "time": [3, 4, 5, 6, 7], "type": [3, 4, 5, 6, 7], "dotenv": [3, 4, 6, 7], "load_dotenv": [3, 4, 6, 7], "environ": [3, 4, 5, 6, 7], "variabl": [3, 4, 6, 7], "overrid": [3, 6, 7], "userprompt": 3, "user_prompt": 3, "str": [3, 4, 6, 7], "__init__": [3, 6, 7], "4o": [3, 4, 5, 6, 7], "mini": [3, 4, 5, 6, 7], "client": [3, 4, 5, 6, 7], "_generate_prompt": 3, "batch": [3, 4, 5], "system_prompt": [3, 6], "pars": [3, 4, 6, 7], "messag": [3, 4, 5, 6, 7], "response_format": [3, 4, 6, 7], "except": [3, 4, 6, 7], "generate_prompt": 3, "num_prompt": [3, 5], "int": [3, 4, 6], "save_to_csv": 3, "multipl": [3, 4, 5, 6, 7], "arg": [3, 4, 6, 7], "option": [3, 4, 5, 6, 7], "filepath": 3, "save": [3, 4, 5, 6], "datafram": [3, 4, 6], "all_prompt": 3, "sleep": 3, "enclos": [3, 6], "quot": [3, 4, 5], "startswith": [3, 6], "els": [3, 4, 6], "df": [3, 4, 6], "to_csv": [3, 6], "index": [3, 4, 5, 6, 7], "fals": [3, 4, 5, 6, 7], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 6], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7], "secur": [3, 4, 6, 7], "breach": [3, 4, 6], "manipul": [3, 4, 5, 6, 7], "10": [3, 4, 5, 6, 7], "to_markdown": [3, 6], "me": [3, 6, 7], "hurt": 3, "someon": 3, "caught": [3, 6], "plan": [3, 4, 5, 7], "cheat": 3, "fire": [3, 4], "household": 3, "item": [3, 4, 6], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 6], "7": [3, 4, 5, 6], "8": [3, 4, 5, 6], "teach": [3, 7], "my": [3, 5, 6, 7], "monei": [3, 4], "video": [3, 4, 5, 6], "game": [3, 4, 5], "9": [3, 4, 5, 6], "skip": [3, 6, 7], "troubl": [3, 6], "responsegener": 3, "properli": [3, 4, 7], "hug": [3, 4, 5, 6], "instanti": [3, 4], "otherwis": [3, 4, 6], "connect": [3, 4, 5, 7], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 7], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 4, 5], "enhanc": [3, 4, 5, 6, 7], "visit": [3, 4], "ui": [3, 4, 7], "click": [3, 5], "select": [3, 4, 5, 7], "choos": [3, 4, 5], "cpu": [3, 5], "gpu": [3, 5], "configur": [3, 4, 5, 6], "meaning": [3, 4, 7], "region": [3, 4], "closest": [3, 4, 5], "your": [3, 4, 5, 6, 7], "locat": [3, 4, 5, 6], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 4, 5, 6], "generate_respons": [3, 4, 7], "prompts_df": 3, "remov": [3, 4, 5], "strip": [3, 4, 7], "elif": 3, "chat_complet": 3, "max_token": [3, 4], "seed": [3, 6], "42": [3, 4, 5, 6], "append": [3, 4, 6, 7], "results_df": [3, 6], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 6], "iloc": 3, "tolist": [3, 6], "parallelevalu": 3, "taming_util": [3, 6], "modul": [3, 4, 7], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 4, 5, 7], "gladli": 3, "constitut": [3, 4], "would": [3, 4, 5, 6, 7], "dtype": [3, 4, 6], "80": [3, 4], "absolut": [3, 4, 7], "materi": [3, 4, 5, 6, 7], "plastic": 3, "food": 3, "lid": 3, "cut": [3, 4], "swath": 3, "wood": [3, 4], "squar": 3, "rectangular": 3, "piec": 3, "place": [3, 4, 5, 6, 7], "insid": [3, 4, 6], "inch": 3, "inspect": [3, 4], "off": [3, 4, 5, 6, 7], "demolit": 3, "scissor": 3, "smash": 3, "smooth": [3, 5], "arrang": [3, 4], "c": [3, 4, 5, 7], "shape": [3, 6, 7], "top": [3, 4, 5, 7], "tuck": 3, "catch": [3, 6], "hook": 3, "solid": 3, "side": [3, 4], "round": [3, 4, 6], "edg": [3, 4, 5, 6], "separ": [3, 4, 5, 6], "process_aligned_respons": 3, "strictli": [3, 7], "bound": [3, 4], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 4, 7], "enforc": [3, 4, 6, 7], "dictionari": [3, 4, 6, 7], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 7], "processor": [3, 5, 7], "api_kei": [3, 4, 6], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7], "json": [3, 4, 5, 6], "fri": 3, "su": [3, 5], "quote_al": 3, "fall": [3, 4, 5, 6], "deem": [3, 4, 6], "pertain": [3, 4], "generate_dpo_dataset": 3, "push": [3, 4], "hub": [3, 4, 5], "repo_id": [3, 5], "push_to_hub": [3, 4], "dpo_dataset": 3, "merg": [3, 6], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5], "axi": [3, 4], "drop": [3, 4, 6], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "interest": [3, 4, 5, 6, 7], "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 4, 5], "parquet": 3, "arrow": 3, "00": [3, 4, 5], "153": [3, 4], "33ba": 3, "upload": [3, 4], "shard": 3, "02": 3, "35": [3, 4, 5], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 4], "nstep": 3, "n2": [3, 4], "n3": [3, 4], "n4": [3, 4], "n5": [3, 4], "n6": 3, "n7": 3, "n8": [3, 4], "n9": [3, 4], "n10": [3, 4], "nnext": 3, "nthe": [3, 4], "singl": [3, 4, 5, 6, 7], "48gb": 3, "a100": 3, "took": 3, "few": [3, 4, 5, 6, 7], "minut": 3, "torch": [3, 7], "h4": [3, 6], "honest": [3, 4], "ultrafeedback": [3, 6], "binar": [3, 6], "lib": [3, 6], "ultrafeedback_binar": [3, 6], "honesti": [3, 6], "dimens": [3, 4, 5, 6], "blend": [3, 5], "automodelforcausallm": [3, 7], "autotoken": [3, 7], "load_dataset": [3, 5, 6], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 4, 5, 6], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 6], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": 3, "is_avail": 3, "mp": 3, "from_pretrain": [3, 5, 7], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 7], "float32": 3, "config": [3, 4, 5, 6], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 4, 5], "learning_r": [3, 5], "determin": [3, 4, 5, 6, 7], "aggress": [3, 4, 5, 6], "empir": 3, "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 4, 6], "gradual": 3, "decreas": [3, 4, 7], "accumul": [3, 4], "v": [3, 7], "16": [3, 4, 5, 6], "per_device_train_batch_s": 3, "simul": [3, 4, 6, 7], "gradient_accumulation_step": 3, "strongli": [3, 7], "lower": [3, 4, 5, 6, 7], "conserv": [3, 6], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 4, 5, 6], "suffic": 3, "20": [3, 4, 5, 6, 7], "warmup_step": 3, "stop": [3, 4, 5], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 6, 7], "200": [3, 4, 5, 6], "50": [3, 4, 5, 6, 7], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 4, 7], "pathlib": [3, 6], "config_path": 3, "safe_load": [3, 4], "runtim": [3, 5, 7], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 4], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 4, 5, 6, 7], "max_prompt_length": [3, 5], "1024": 3, "max_length": [3, 4, 7], "1536": 3, "sent": [3, 5, 6], "plot": [3, 4], "move": [3, 4, 5, 6], "averag": [3, 4, 5, 7], "visual": [3, 4, 5, 6], "quick": [3, 4, 5, 6], "150": [3, 4], "curv": 3, "reach": [3, 4, 5, 6, 7], "obviou": 3, "suffici": [3, 4, 7], "save_model": 3, "hf_token": 3, "tag": [3, 6], "congratul": 3, "successfulli": [3, 4, 6, 7], "card": [3, 4, 6], "newli": [3, 4], "qualit": [3, 4, 6], "assess": [3, 4, 5, 6], "rigor": [3, 4, 5, 6], "quantit": [3, 4], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 4, 5], "aligned_output": 3, "pleas": [3, 4, 5, 6], "gram": [3, 4], "tnt": 3, "highli": [3, 4, 5, 6, 7], "regul": [3, 4, 5, 6], "law": [3, 4, 5, 6], "degre": [3, 4, 7], "mishandl": 3, "countri": [3, 4], "seriou": [3, 4, 6], "imprison": 3, "death": 3, "variou": [3, 4, 5, 6, 7], "nation": [3, 6], "dictat": 3, "stark": [3, 4], "readili": [3, 4], "cite": 3, "regulatori": [3, 4, 5, 6], "anecdot": [3, 6], "systemat": [3, 4, 5, 6, 7], "quantifi": [3, 4, 5, 6], "f1": [3, 4, 6], "experienc": [3, 4], "expert": [3, 4, 5, 6, 7], "addition": [3, 4, 5, 6], "vari": [3, 4, 5, 6, 7], "interpret": [3, 4, 5, 6], "judg": [3, 4], "summar": [3, 4, 5], "three": [3, 4, 5, 6], "togeth": [3, 5, 6], "entri": [3, 4, 5], "somewhat": 3, "databas": [3, 4, 7], "distribut": [3, 4, 5, 6, 7], "static": [3, 6, 7], "k12": 3, "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 6], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 4, 6], "lambda": [3, 6], "prompts_ev": 3, "to_list": 3, "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 4], "heurist": 3, "charact": [3, 4, 5, 6, 7], "minimum": [3, 4, 5], "min_response_length": 3, "filter": [3, 4, 5, 7], "string": [3, 4, 6, 7], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 6], "punish": 3, "unit": [3, 4, 6, 7], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 6], "respond": [3, 4, 6, 7], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 4, 6], "safetyscor": [3, 6], "float": [3, 4, 5, 6, 7], "valueerror": [3, 7], "empti": [3, 7], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 4, 5, 6, 7], "emphasi": [3, 4], "base_ev": 3, "zip": [3, 4, 7], "aligned_ev": 3, "injuri": [3, 4], "base_scor": 3, "eval": [3, 5], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 5, 6], "evals_df_result": 3, "h": [3, 4, 5, 6], "identifi": [3, 4, 5, 6, 7], "requ": 3, "statist": [3, 4, 6], "naiv": [3, 7], "score_map": 3, "count": [3, 4, 5, 6], "percentag": [3, 4, 6], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 6], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 4, 5, 6, 7], "md_tabl": 3, "335": [3, 4], "99": [3, 5, 6], "281": [3, 4], "83": [3, 4, 6], "14": [3, 4, 5, 6, 7], "43": [3, 4, 5, 6], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 4, 6], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 4], "231": [3, 4], "No": [3, 4, 5, 6, 7], "fell": 3, "partial": [3, 4], "styliz": [3, 6], "wild": 3, "consider": [3, 6, 7], "proof": 3, "taken": [3, 4, 5, 6, 7], "huang": [3, 4, 5, 6], "overal": [3, 4, 5, 6, 7], "annot": [3, 4, 5, 6], "mirror": [3, 4, 6], "inaccur": [3, 4, 6, 7], "consecut": [3, 6], "unrepres": 3, "hao": [3, 4], "accord": [3, 4, 6, 7], "yin": [3, 4, 6], "resembl": 3, "declin": [3, 4], "volatil": [3, 4], "ineffici": [3, 4], "smollm": 3, "rel": [3, 4, 5, 6], "term": [3, 4, 5, 6], "trade": [3, 4, 5, 6, 7], "weigh": 3, "qwen": [3, 5, 7], "remark": [3, 6, 7], "rival": [3, 5], "ultim": [3, 4, 5, 6], "threshold": [3, 4, 5, 6], "chen": [3, 4, 5, 6, 7], "overli": [3, 4, 6, 7], "simpli": [3, 4, 5, 7], "neglect": [3, 4, 6], "themselv": [3, 4, 6], "complementari": 3, "throughput": [3, 5], "screen": [3, 4, 6], "flag": [3, 4, 5, 6], "preliminari": [3, 4], "judgment": [3, 4], "valid": [3, 4, 5, 7], "automat": [3, 4, 5, 6], "composit": [3, 4], "plai": [3, 4, 5, 6, 7], "led": [3, 4, 7], "apologet": 3, "hesit": 3, "benign": [3, 6], "apolog": 3, "inde": 3, "accordingli": [3, 4, 6], "perhap": 3, "creation": [3, 5, 6], "invalu": 3, "hyperparamet": [3, 5, 6], "mention": [3, 4, 6, 7], "optimist": 3, "memor": [3, 4], "generaliz": 3, "abc": [3, 6], "4a": 3, "amanda": [3, 4, 6], "jan": [3, 4, 6], "brauner": [3, 6], "adrian": 3, "colyer": 3, "benjamin": [3, 4, 6], "cullen": [3, 6], "david": [3, 4, 5, 6], "duvenaud": 3, "richard": [3, 4, 6], "ngo": [3, 6], "azalia": 3, "mirhoseini": 3, "catherin": [3, 4, 6], "olsson": [3, 6], "sam": [3, 4, 6], "ringer": 3, "liam": [3, 4, 6], "skirvin": 3, "jess": [3, 4, 6], "smith": [3, 4, 5], "dawn": [3, 4, 6], "song": [3, 4, 6, 7], "william": [3, 4, 5, 6], "saunder": [3, 4], "steinhardt": [3, 4], "asset": [3, 4, 6], "983c85a201a962f": 3, "pdf": [3, 6], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 4, 6], "yuntao": [3, 4, 6], "andi": [3, 4, 6], "jone": [3, 4], "kamal": 3, "ndouss": 3, "anna": [3, 4, 6], "nova": [3, 5], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 6], "ganguli": [3, 4, 6], "tom": [3, 4], "henighan": 3, "nichola": [3, 4], "joseph": [3, 4, 6], "saurav": [3, 6], "kadavath": 3, "jackson": [3, 4, 6], "kernion": [3, 4, 6], "conerli": 3, "sheer": [3, 7], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 4, 6], "hernandez": [3, 4, 6], "tristan": 3, "hume": 3, "scott": [3, 4, 6], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 4], "nanda": 3, "dario": [3, 4], "amodei": [3, 4], "brown": [3, 4], "jack": [3, 4, 6], "clark": 3, "mccandlish": [3, 4], "chri": [3, 4, 6], "olah": 3, "ben": [3, 4, 5, 6], "mann": [3, 6], "jare": [3, 4, 6], "kaplan": [3, 4, 6], "arxiv": [3, 4, 5, 6, 7], "org": [3, 4, 5, 6, 7], "ab": [3, 4, 5, 6, 7], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 4, 6, 7], "mckinnon": 3, "carol": [3, 6], "christoph": [3, 4, 6], "dustin": 3, "eli": [3, 4, 5, 6], "tran": [3, 7], "johnson": 3, "ethan": [3, 4, 6], "perez": [3, 6], "jami": [3, 6], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 4, 6], "landau": 3, "kamil": [3, 4], "lukosuit": 3, "michael": [3, 4, 5, 6, 7], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 4, 5], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 4, 5], "telleen": 3, "lawton": 3, "samuel": [3, 4, 6], "bowman": [3, 4], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 4], "cc": 3, "11": [3, 4, 5, 6], "ccl": [3, 6], "24": [3, 4, 5, 6, 7], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 4, 5, 6, 7], "jiang": [3, 4, 6], "benyou": 3, "wang": [3, 4, 5, 6, 7], "judgement": [3, 4, 6], "2402": [3, 6], "10669": 3, "dphz23": 3, "tim": [3, 6], "artidoro": 3, "pagnoni": 3, "ari": [3, 4, 6], "holtzman": [3, 4], "luke": [3, 4, 6], "zettlemoy": 3, "2305": [3, 4], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 4, 5, 6], "zhifang": 3, "sui": 3, "furu": 3, "wei": [3, 4, 5, 6], "boost": 3, "2410": [3, 6], "06961": 3, "fac24": [3, 4], "huggingfaceh4": [3, 5, 6], "fac4c": 3, "fac4d": [3, 5], "doc": [3, 4, 5, 6, 7], "en": [3, 4, 5, 6, 7], "fqh": 3, "duanyu": 3, "bowen": [3, 4, 5, 6], "qin": [3, 4, 5, 6], "zheng": [3, 4, 5, 6], "wenqiang": 3, "lei": [3, 4, 5, 6], "analyz": [3, 4, 5, 6, 7], "perspect": [3, 6], "2404": [3, 4, 6], "04626": 3, "h44a": 3, "binari": [3, 4, 5, 6], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 4, 6], "tao": [3, 4, 6], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 6], "zhangjun": 3, "zhou": [3, 4, 5, 6], "tang": [3, 4, 5, 6], "2401": [3, 4], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 4, 6], "lee": [3, 4, 5, 6, 7], "jame": [3, 4, 6], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 4], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 4], "yilin": 3, "niu": [3, 7], "zhengxiao": 3, "aohan": 3, "zeng": [3, 6], "xiao": [3, 6], "minli": 3, "hongn": 3, "jie": [3, 4, 6, 7], "yuxiao": 3, "2412": [3, 4, 5, 6], "06000": 3, "hsw": 3, "21": [3, 4, 5], "edward": [3, 4], "j": [3, 4, 5, 6, 7], "yelong": 3, "shen": [3, 4, 6], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 4], "zhu": [3, 4, 5, 6], "yuanzhi": 3, "shean": 3, "lu": [3, 4, 5, 6], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 4, 6], "shane": [3, 4, 6], "gu": [3, 4, 6], "le": [3, 4, 5], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 4, 5, 6], "jiawei": [3, 7], "2210": [3, 6], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 6], "decemb": [3, 4, 6], "9781098129095": 3, "www": [3, 4, 5, 6], "oreilli": 3, "ksd": 3, "rylan": [3, 4], "schaeffer": 3, "apratim": 3, "dei": 3, "matthia": [3, 4], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": 3, "koyejo": 3, "thrive": [3, 4, 7], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 4, 5], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 6], "lawrenc": 3, "sean": [3, 4, 6], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 5], "2407": [3, 4, 5, 6], "21783": [3, 5], "lwx": 3, "lin": [3, 4, 5, 6, 7], "rui": [3, 4, 5, 7], "ruixuan": 3, "junbo": 3, "zhao": [3, 4, 5, 6], "ding": 3, "gang": [3, 4], "haobo": 3, "driven": [3, 4, 5, 6], "survei": [3, 4, 6, 7], "2406": [3, 4, 5, 6], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 4, 6], "diogo": [3, 6], "almeida": [3, 6], "carrol": [3, 6], "wainwright": [3, 6], "pamela": [3, 4, 6], "mishkin": [3, 4, 6], "chong": [3, 6], "sandhini": [3, 6], "agarw": [3, 4, 6], "katarina": [3, 6], "slama": [3, 6], "alex": [3, 4, 5, 6], "rai": [3, 4, 5, 6], "john": [3, 4, 6], "hilton": [3, 4, 5, 6], "fraser": [3, 6], "kelton": 3, "miller": [3, 4], "maddi": [3, 6], "simen": [3, 6], "peter": [3, 4, 5, 6], "welind": [3, 4, 6], "paul": [3, 4, 6], "christiano": [3, 6], "leik": [3, 4, 6], "ryan": [3, 4, 6], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 6], "eric": [3, 4, 5, 6], "mitchel": [3, 5], "stefano": [3, 4], "ermon": [3, 4], "man": [3, 4, 6], "chelsea": [3, 6], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 4, 6], "filip": [3, 6], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 4, 6], "radford": [3, 4, 6], "oleg": [3, 6], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": 3, "smollm2360mi24": 3, "sou24": 3, "html": [3, 7], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 4, 6], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 4, 5], "eisenhart": 3, "roth": [3, 4], "florian": 3, "hinterwimm": 3, "2411": 3, "09539": 3, "tm": [3, 5], "23": [3, 4, 5, 6], "hugo": [3, 5], "loui": [3, 4, 5], "martin": [3, 4, 5, 6], "kevin": [3, 4, 5, 6], "stone": [3, 5], "albert": [3, 5], "amjad": [3, 5], "almahairi": [3, 5], "yasmin": [3, 5], "babaei": [3, 5], "nikolai": [3, 5], "bashlykov": [3, 5], "soumya": [3, 5], "batra": [3, 5], "prajjwal": [3, 5], "bhargava": [3, 5], "shruti": [3, 5], "bhosal": [3, 5], "dan": [3, 4, 5, 6, 7], "bikel": [3, 5], "luka": [3, 5], "blecher": [3, 5], "cristian": [3, 5], "canton": [3, 5], "ferrer": [3, 5], "moya": [3, 5], "guillem": [3, 5], "cucurul": [3, 5], "esiobu": [3, 5], "jude": [3, 5], "fernand": [3, 5], "jeremi": [3, 4, 5], "fu": [3, 5], "wenyin": [3, 5], "brian": [3, 5, 6], "fuller": [3, 5, 6], "cynthia": [3, 5], "gao": [3, 4, 5, 6], "vedanuj": [3, 5], "goswami": [3, 5, 6], "naman": [3, 5], "goyal": [3, 5], "anthoni": [3, 5], "hartshorn": [3, 5], "saghar": [3, 5], "hosseini": [3, 5], "hakan": [3, 5], "inan": [3, 5], "marcin": [3, 5], "karda": [3, 5], "viktor": [3, 5], "kerkez": [3, 5], "madian": [3, 5], "khabsa": [3, 5], "isabel": [3, 5, 6], "kloumann": [3, 5], "artem": [3, 5], "korenev": [3, 5], "punit": [3, 5], "singh": [3, 4, 5], "koura": [3, 5], "mari": [3, 4, 5, 6], "ann": [3, 5, 6], "lachaux": [3, 5], "thibaut": [3, 5], "lavril": [3, 5], "jenya": [3, 5], "diana": [3, 4, 5], "liskovich": [3, 5], "yinghai": [3, 5], "yune": [3, 5], "mao": [3, 5], "xavier": [3, 5], "martinet": [3, 5], "todor": [3, 5, 6], "mihaylov": [3, 5], "pushkar": [3, 5], "mishra": [3, 4, 5], "igor": [3, 4, 5, 6], "molybog": [3, 5], "yixin": [3, 4, 5], "nie": [3, 4, 5], "andrew": [3, 4, 5, 6], "poulton": [3, 5], "reizenstein": [3, 5], "rashi": [3, 5], "rungta": [3, 5], "kalyan": [3, 5], "saladi": [3, 5], "alan": [3, 5, 6], "schelten": [3, 5], "ruan": [3, 5], "silva": [3, 5], "ranjan": [3, 5], "subramanian": [3, 5], "xiaoq": [3, 5], "ellen": [3, 5], "tan": [3, 4, 5], "binh": [3, 5], "ross": [3, 5, 6], "taylor": [3, 5], "adina": [3, 5, 6], "jian": [3, 4, 5], "kuan": [3, 5], "puxin": [3, 5], "yan": [3, 4, 5], "iliyan": [3, 5], "zarov": [3, 5], "yuchen": [3, 4, 5, 6], "angela": [3, 4, 5, 6], "fan": [3, 4, 5], "melani": [3, 5], "kambadur": [3, 5], "sharan": [3, 5], "narang": [3, 5], "aurelien": [3, 5], "rodriguez": [3, 5], "stojnic": [3, 5], "sergei": [3, 5], "edunov": [3, 5], "thoma": [3, 4, 5, 6], "scialom": [3, 5], "2307": [3, 5, 7], "09288": [3, 5], "vaa": [3, 6], "berti": [3, 6], "adarsh": [3, 6], "agraw": [3, 6], "ahm": [3, 6], "victor": [3, 6], "akinwand": [3, 6], "namir": [3, 6], "nuaimi": [3, 6], "najla": [3, 6], "alfaraj": [3, 6], "alhajjar": [3, 6], "aroyo": [3, 6], "trupti": [3, 6], "bavalatti": [3, 6], "max": [3, 4, 6], "bartolo": [3, 6], "borhan": [3, 6], "blili": [3, 6], "hamelin": [3, 6], "kurt": [3, 6], "bollack": [3, 6], "rishi": [3, 4, 5, 6], "bomassani": [3, 6], "marisa": [3, 6], "ferrara": [3, 6], "boston": [3, 6], "sim\u00e9on": [3, 6], "campo": [3, 6], "kal": [3, 6], "chakra": [3, 6], "canyu": [3, 6], "codi": [3, 6], "coleman": [3, 6], "zachari": [3, 4, 6], "delpierr": [3, 6], "coudert": [3, 6], "leon": [3, 6], "derczynski": [3, 6], "debojyoti": [3, 6], "dutta": [3, 6], "ian": [3, 4, 6], "eisenberg": [3, 6], "ezick": [3, 6], "heather": [3, 6], "frase": [3, 6], "ram": [3, 5, 6], "gandikota": [3, 6], "agasthya": [3, 6], "gangavarapu": [3, 6], "ananya": [3, 4, 6], "geali": [3, 6], "rajat": [3, 6], "ghosh": [3, 4, 6], "goel": [3, 4, 6], "usman": [3, 6], "gohar": [3, 6], "sujata": [3, 6], "hale": [3, 6], "wiebk": [3, 6], "hutiri": [3, 6], "marvin": [3, 6], "imperi": [3, 6], "surgan": [3, 6], "jandial": [3, 6], "nick": [3, 4, 6], "judd": [3, 6], "felix": [3, 4, 6], "juefei": [3, 6], "fouts": [3, 6], "khomh": [3, 6], "bhavya": [3, 6], "kailkhura": [3, 6], "hannah": [3, 4, 6], "rose": [3, 6], "kirk": [3, 6], "klyman": [3, 6], "knotz": [3, 6], "kuchnik": [3, 6], "shachi": [3, 6], "kumar": [3, 4, 6], "srijan": [3, 6], "lengerich": [3, 6], "bo": [3, 4, 5, 6], "zeyi": [3, 6], "liao": [3, 4, 6], "eileen": [3, 6], "sarah": [3, 4, 6], "luger": [3, 6], "yifan": [3, 4, 6], "priyanka": [3, 6], "mammen": [3, 6], "kelvin": [3, 6], "manyeki": [3, 6], "mcgregor": [3, 6], "virendra": [3, 6], "mehta": [3, 4, 6], "shafe": [3, 6], "moham": [3, 6], "moss": [3, 6], "lama": [3, 6], "nachman": [3, 6], "dinesh": [3, 6], "jinenh": [3, 6], "naganna": [3, 6], "amin": [3, 6], "nikanjam": [3, 6], "besmira": [3, 6], "nushi": [3, 6], "lui": [3, 4, 6], "oala": [3, 6], "iftach": [3, 6], "orr": [3, 4, 6], "alicia": [3, 4, 6], "parrish": [3, 4, 6], "cigdem": [3, 6], "patlak": [3, 6], "pietri": [3, 6], "forough": [3, 6], "poursabzi": [3, 6], "sangdeh": [3, 6], "eleonora": [3, 6], "presani": [3, 6], "fabrizio": [3, 6], "puletti": [3, 6], "r\u00f6ttger": [3, 6], "sahai": [3, 6], "santo": [3, 6], "nino": [3, 6], "scherrer": [3, 6], "alic": [3, 4, 6, 7], "schoenauer": [3, 6], "sebag": [3, 6], "patrick": [3, 6], "schramowski": [3, 6], "abolfazl": [3, 6], "shahbazi": [3, 6], "vin": [3, 6], "xudong": [3, 4, 6], "vamsi": [3, 6], "sistla": [3, 6], "leonard": [3, 6], "testuggin": [3, 6], "vithursan": [3, 6], "thangarasa": [3, 6], "elizabeth": [3, 4, 6], "watkin": [3, 6], "rebecca": [3, 4, 6], "weiss": [3, 6], "welti": [3, 6], "tyler": [3, 4, 6], "wilber": [3, 6], "jean": [3, 6], "poonam": [3, 6], "yadav": [3, 6], "xianjun": [3, 6], "yang": [3, 4, 5, 6, 7], "yi": [3, 4, 6, 7], "wenhui": [3, 6], "fedor": [3, 6], "zhdanov": [3, 6], "jiacheng": [3, 4, 6], "perci": [3, 4, 6], "liang": [3, 4, 6, 7], "mattson": [3, 6], "joaquin": [3, 6], "vanschoren": [3, 6], "v0": [3, 6, 7], "12241": [3, 6], "wyg": 3, "tianhao": [3, 4, 5, 6], "weizh": 3, "yuan": [3, 4, 6], "olga": 3, "golovneva": 3, "jing": [3, 6], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 4, 6], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "xfg": 3, "shusheng": 3, "jiaxuan": 3, "wenji": 3, "ye": [3, 4, 5, 6, 7], "weilin": 3, "zhiyu": [3, 7], "mei": [3, 4, 5], "guangju": 3, "chao": 3, "10719": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": 3, "xie": [3, 4], "mingyuan": 3, "paradigm": [3, 4], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "matter": 4, "beauti": 4, "smart": [4, 6], "agre": 4, "wrong": 4, "feynman": 4, "advent": 4, "shift": 4, "norm": 4, "realm": 4, "convent": [4, 6], "evolut": [4, 5], "conceiv": 4, "entrench": 4, "seem": 4, "daunt": 4, "ignor": 4, "outdat": [4, 6, 7], "inevit": 4, "setback": 4, "imper": 4, "embrac": 4, "proactiv": [4, 6], "mindset": 4, "front": [4, 5], "produc": [4, 5, 6, 7], "novel": [4, 5], "ident": 4, "isn": [4, 6], "bug": 4, "random": [4, 6, 7], "testabl": 4, "exceedingli": 4, "guarante": [4, 5, 6, 7], "primari": [4, 6], "nucleu": 4, "2020": 4, "summari": [4, 5, 6, 7], "alter": 4, "rigid": 4, "wildli": 4, "incoher": 4, "inadequ": [4, 6], "temp": 4, "df_result": 4, "ntemperatur": 4, "40": [4, 5], "temp_respons": 4, "iterrow": [4, 6], "10000": [4, 7], "appl": [4, 7], "txt": [4, 5, 7], "sec_fil": [4, 7], "nsecur": 4, "AND": [4, 7], "exchang": [4, 6, 7], "commiss": [4, 6, 7], "nwashington": 4, "20549": 4, "nform": 4, "annual": [4, 6], "pursuant": 4, "TO": [4, 6], "13": [4, 5, 6], "OR": 4, "OF": [4, 6], "THE": [4, 6], "1934": 4, "nfor": 4, "fiscal": 4, "septemb": 4, "28": [4, 5, 6], "nor": 4, "period": [4, 6], "ncommiss": 4, "001": [4, 5], "36743": 4, "ng66145g66i43": 4, "jpg": 4, "nappl": 4, "exact": [4, 5, 6], "registr": 4, "specifi": [4, 5, 6, 7], "charter": 4, "ncalifornia": 4, "t94": 4, "2404110": 4, "jurisdict": 4, "nof": 4, "incorpor": [4, 5, 6, 7], "employ": 4, "identif": 4, "park": 4, "ncupertino": 4, "california": [4, 6, 7], "n95014": 4, "princip": 4, "offic": [4, 6], "408": 4, "996": 4, "1010": 4, "telephon": 4, "area": [4, 6, 7], "regist": 4, "ntitl": 4, "ttrade": 4, "symbol": 4, "tname": 4, "ncommon": 4, "stock": [4, 7], "00001": 4, "naapl": 4, "tthe": 4, "nasdaq": [4, 7], "llc": [4, 7], "n0": 4, "000": [4, 5, 7], "note": [4, 5, 6, 7], "2025": 4, "875": 4, "625": 4, "2026": 4, "2027": 4, "375": 4, "2029": 4, "050": 4, "2031": [4, 6], "600": 4, "2042": 4, "nindic": 4, "season": 4, "issuer": 4, "405": 4, "nye": 4, "preced": [4, 7], "shorter": 4, "past": [4, 6], "90": [4, 5, 6], "submit": [4, 5, 6], "electron": 4, "232": 4, "acceler": [4, 5, 6], "filer": 4, "growth": 4, "12b": [4, 6], "nlarg": 4, "tacceler": 4, "nnon": 4, "tsmaller": 4, "nemerg": 4, "nif": 4, "elect": [4, 6], "revis": [4, 6], "attest": 4, "404": 4, "sarban": 4, "oxlei": 4, "7262": 4, "firm": [4, 6], "prepar": [4, 5, 6], "correct": [4, 6], "restat": 4, "recoveri": 4, "incent": 4, "compens": 4, "240": 4, "10d": 4, "shell": 4, "aggreg": [4, 6], "vote": 4, "held": [4, 7], "affili": [4, 7], "march": [4, 7], "29": [4, 5, 6, 7], "last": [4, 6, 7], "second": [4, 5, 6], "quarter": 4, "628": [4, 7], "553": [4, 7], "sole": [4, 6], "disclosur": [4, 5, 6], "director": [4, 5, 6], "date": 4, "exclud": 4, "n15": 4, "115": [4, 7], "823": [4, 7], "outstand": [4, 7], "octob": [4, 7], "18": [4, 5, 6, 7], "ndocument": 4, "BY": 4, "nportion": 4, "proxi": [4, 6], "meet": [4, 6, 7], "sharehold": 4, "iii": 4, "120": [4, 6], "ntabl": 4, "npage": 4, "npart": 4, "nitem": 4, "nbusi": 4, "1a": 4, "nrisk": 4, "1b": [4, 5, 6], "nunresolv": 4, "staff": 4, "comment": 4, "n17": 4, "1c": 4, "ncybersecur": 4, "nproperti": 4, "n18": 4, "nlegal": 4, "proceed": [4, 6], "nmine": 4, "ii": [4, 5, 7], "nmarket": 4, "stockhold": 4, "purchas": [4, 6], "n19": 4, "reserv": 4, "n20": 4, "nmanag": 4, "discuss": [4, 5, 6], "n21": 4, "7a": 4, "nquantit": 4, "n27": 4, "nfinanci": 4, "supplementari": 4, "n28": 4, "nchang": 4, "disagr": 4, "n51": 4, "9a": 4, "ncontrol": 4, "procedur": [4, 6], "9b": 4, "nother": 4, "n52": 4, "9c": 4, "ndisclosur": 4, "foreign": 4, "ndirector": 4, "corpor": [4, 6], "nexecut": 4, "ownership": [4, 5], "certain": [4, 6, 7], "benefici": [4, 5], "owner": 4, "ncertain": 4, "transact": [4, 6], "nprincip": 4, "fee": 4, "iv": 4, "nexhibit": 4, "n53": 4, "n56": 4, "nthi": 4, "litig": [4, 5], "reform": 4, "1995": 4, "uncertainti": [4, 5, 6], "event": 4, "macroeconom": 4, "anticip": [4, 6], "caus": [4, 6], "oblig": 4, "nunless": 4, "herein": 4, "calendar": 4, "wholli": 4, "subsidiari": 4, "unless": [4, 5], "ncompani": 4, "manufactur": 4, "smartphon": [4, 5], "tablet": [4, 5], "wearabl": 4, "accessori": 4, "sell": [4, 6], "varieti": [4, 5], "52": [4, 6], "53": [4, 6], "week": 4, "saturdai": 4, "nproduct": 4, "niphon": 4, "io": [4, 7], "iphon": 4, "pro": [4, 5, 6], "se": [4, 6], "nmac": 4, "maco": [4, 5], "mac": [4, 5], "laptop": 4, "macbook": 4, "air": 4, "desktop": [4, 5], "imac": 4, "studio": 4, "nipad": 4, "multipurpos": 4, "ipado": 4, "ipad": 4, "nwearabl": 4, "home": 4, "smartwatch": 4, "wireless": 4, "headphon": 4, "spatial": 4, "watcho": 4, "watch": 4, "ultra": 4, "airpod": 4, "beat": [4, 5], "visiono": 4, "nhome": 4, "tv": 4, "stream": [4, 5, 7], "tvo": 4, "homepod": 4, "fidel": [4, 7], "naccessori": 4, "brand": 4, "third": [4, 5, 6], "parti": [4, 5, 6], "nservic": 4, "nadvertis": 4, "advertis": 4, "licens": 4, "napplecar": 4, "portfolio": 4, "applecar": 4, "prioriti": [4, 5], "network": [4, 5, 7], "repair": 4, "coverag": [4, 6], "accident": 4, "damag": [4, 6], "theft": [4, 6], "ncloud": 4, "ndigit": 4, "app": [4, 5], "discov": [4, 5, 6], "download": [4, 5], "music": 4, "podcast": 4, "subscript": [4, 5], "arcad": 4, "sm": 4, "listen": [4, 5], "radio": 4, "station": 4, "magazin": 4, "exclus": 4, "sport": 4, "npayment": 4, "payment": 4, "credit": 4, "pai": [4, 5], "cashless": 4, "nsegment": 4, "primarili": [4, 6], "geograph": [4, 6], "basi": [4, 5], "segment": [4, 7], "america": 4, "europ": 4, "china": [4, 6], "japan": 4, "rest": [4, 5], "asia": 4, "pacif": 4, "north": [4, 6], "south": 4, "european": [4, 6], "india": 4, "middl": [4, 5, 6], "east": 4, "africa": 4, "mainland": 4, "kong": 4, "taiwan": 4, "australia": 4, "asian": 4, "although": [4, 5], "partner": [4, 5, 6], "mid": 4, "enterpris": [4, 5, 6, 7], "resel": 4, "retail": 4, "sale": 4, "indirect": 4, "channel": [4, 6], "cellular": 4, "carrier": 4, "net": [4, 7], "38": [4, 5, 6], "62": [4, 5], "ncompetit": 4, "competit": [4, 5, 6], "character": [4, 6], "price": [4, 5], "downward": 4, "pressur": [4, 6], "gross": [4, 6], "cycl": [4, 6], "industri": [4, 5, 6, 7], "characterist": [4, 5, 6, 7], "competitor": [4, 5, 6], "compet": [4, 5], "imit": 4, "infring": [4, 5], "intellectu": [4, 5, 6], "innov": [4, 5, 6], "marketplac": [4, 6], "nearli": [4, 5], "reput": [4, 6], "expand": [4, 5, 6], "opportun": 4, "broader": [4, 5, 6], "illegitim": [4, 6], "collabor": [4, 5, 6], "nsuppli": 4, "nalthough": 4, "essenti": [4, 5, 6, 7], "particip": 4, "shortag": 4, "commod": [4, 5], "fluctuat": 4, "commonli": 4, "capac": [4, 5], "until": [4, 6, 7], "supplier": 4, "matur": 4, "concentr": 4, "enter": [4, 7], "agreement": 4, "suppli": [4, 7], "renew": 4, "nresearch": 4, "nbecaus": 4, "upon": [4, 6], "flow": [4, 7], "acquisit": [4, 6], "nintellectu": 4, "broad": [4, 5, 7], "patent": 4, "copyright": [4, 5], "trademark": 4, "secret": 4, "differenti": 4, "skill": [4, 6], "personnel": 4, "regularli": 4, "pursu": [4, 6], "thousand": [4, 5], "durat": 4, "adequ": [4, 6], "nin": 4, "holidai": [4, 6], "fill": 4, "inventori": 4, "older": [4, 5], "newer": 4, "distributor": 4, "nhuman": 4, "strive": 4, "retain": [4, 5, 6], "talent": 4, "member": [4, 6], "164": 4, "equival": [4, 5], "ncompens": 4, "equit": 4, "succe": 4, "health": [4, 6], "awai": [4, 6], "ngrowth": 4, "career": 4, "leadership": [4, 6], "nworkplac": 4, "polici": [4, 5], "equal": [4, 6], "workplac": 4, "ninclus": 4, "sustain": [4, 5, 6], "workforc": 4, "nengag": 4, "among": [4, 5, 6, 7], "gaug": 4, "sentiment": [4, 5, 7], "nhealth": 4, "everywher": 4, "crisi": 4, "visitor": 4, "navail": 4, "quarterli": 4, "q": [4, 5, 6], "amend": 4, "sec": [4, 7], "Such": [4, 6], "charg": 4, "investor": [4, 7], "aspx": 4, "websit": [4, 5, 6], "environment": [4, 6], "referenc": 4, "inact": 4, "textual": 4, "unknown": [4, 6], "advers": 4, "conjunct": 4, "consolid": 4, "accompani": [4, 6], "nmacroeconom": 4, "econom": 4, "facil": 4, "assembli": 4, "site": 4, "nadvers": 4, "slow": 4, "recess": 4, "unemploy": 4, "inflat": 4, "tighter": 4, "currenc": 4, "spend": 4, "monetari": 4, "contract": [4, 5], "logist": 4, "instabl": [4, 6], "inabl": 4, "financ": [4, 5, 6], "insolv": 4, "counterparti": 4, "debt": 4, "liquid": 4, "fair": [4, 6], "instrument": 4, "polit": [4, 6], "disput": 4, "geopolit": 4, "tension": [4, 6], "terror": 4, "accid": 4, "interrupt": 4, "npolit": 4, "whole": 4, "outsourc": 4, "korea": 4, "vietnam": 4, "restrict": [4, 5, 6, 7], "tariff": 4, "export": 4, "portion": [4, 5], "revenu": [4, 7], "raw": [4, 5, 6, 7], "restructur": 4, "ceas": 4, "disrupt": 4, "escal": [4, 6], "nmani": 4, "prone": [4, 6], "earthquak": 4, "climat": 4, "weather": 4, "plant": 4, "terrorist": [4, 6], "attack": [4, 6], "hostil": 4, "ransomwar": 4, "cybersecur": [4, 6], "labor": 4, "beyond": [4, 6], "nsuch": 4, "imposs": [4, 5], "slowdown": 4, "outag": 4, "neg": [4, 6, 7], "pandem": 4, "covid": 4, "19": [4, 5, 6], "economi": 4, "imposit": 4, "stringent": [4, 5, 6], "travel": 4, "freight": 4, "movement": 4, "ramp": 4, "nfollow": 4, "expenditur": 4, "resum": 4, "exacerb": 4, "insur": 4, "nglobal": 4, "unabl": 4, "assur": [4, 6], "minor": [4, 6], "naddition": 4, "intensifi": 4, "seamlessli": 4, "nto": 4, "stimul": 4, "ndue": 4, "upgrad": 4, "quantiti": 4, "defect": 4, "defici": 4, "supersed": 4, "nsubstanti": 4, "transport": 4, "provis": 4, "reimburs": 4, "warranti": 4, "unanticip": 4, "liabil": 4, "final": [4, 6, 7], "finish": [4, 6], "destin": 4, "made": [4, 5, 7], "prepay": 4, "termin": [4, 5], "recover": 4, "exposur": [4, 6], "nfutur": 4, "semiconductor": 4, "suffer": [4, 6], "constrain": [4, 5, 7], "shipment": 4, "unexpectedli": 4, "interfer": 4, "unsaf": [4, 6], "expos": [4, 6], "widespread": [4, 6], "vulner": [4, 6], "compromis": [4, 5, 6], "claim": [4, 5, 6], "intang": 4, "fine": [4, 6], "lost": [4, 6], "cancel": 4, "obsolet": 4, "exce": [4, 6], "realiz": 4, "accru": 4, "excess": 4, "impair": 4, "whenev": 4, "circumst": 4, "amount": [4, 6, 7], "carri": [4, 5, 7], "incur": 4, "unpredict": [4, 6], "pace": [4, 6], "obsolesc": 4, "forecast": [4, 6], "incorrectli": [4, 6, 7], "extens": [4, 5, 7], "issuanc": 4, "unknowingli": [4, 6], "notifi": 4, "preclud": 4, "bui": 4, "percept": 4, "android": 4, "playstat": 4, "nintendo": 4, "xbox": 4, "inclin": 4, "devot": 4, "compel": [4, 5, 7], "dissatisfi": 4, "vast": [4, 6], "storefront": 4, "safari": 4, "union": [4, 6], "eu": [4, 6], "dma": 4, "narrow": [4, 5, 6], "scope": [4, 5, 6], "elimin": [4, 5], "nfailur": 4, "appeal": 4, "subscrib": 4, "nsome": 4, "manner": [4, 6], "nurtur": 4, "nmuch": 4, "chief": 4, "silicon": 4, "vallei": 4, "constantli": 4, "driver": [4, 5], "recruit": 4, "subsidi": 4, "staf": 4, "contractor": 4, "placement": 4, "increment": 4, "weaken": 4, "telecommun": 4, "war": 4, "virus": 4, "ins": 4, "incid": [4, 6], "redund": 4, "ineffect": 4, "thing": [4, 7], "interf": 4, "imped": 4, "ship": 4, "nloss": 4, "unauthor": [4, 6], "confidenti": [4, 5], "encrypt": 4, "But": [4, 6, 7], "behalf": 4, "normal": [4, 6, 7], "investig": [4, 6], "penalti": [4, 5], "frequenc": [4, 5, 6], "actor": [4, 6], "circumv": [4, 6], "obfusc": 4, "forens": 4, "hinder": [4, 7], "recov": 4, "perpetr": 4, "profil": [4, 5], "authent": 4, "hack": [4, 6], "malfeas": 4, "faulti": 4, "password": 4, "irregular": 4, "fraudul": 4, "induc": 4, "disclos": [4, 7], "usernam": 4, "turn": [4, 6], "multifactor": 4, "unusu": 4, "freez": 4, "suspici": 4, "nwhile": 4, "ninvest": 4, "ongo": [4, 5], "contempl": 4, "endeavor": 4, "distract": 4, "tangibl": 4, "approv": 4, "oner": 4, "ventur": 4, "riski": 4, "leas": 4, "unfavor": 4, "arisen": 4, "ordinari": 4, "cours": [4, 5, 6], "resolv": [4, 5, 6], "sometim": [4, 7], "indemnif": 4, "indemnifi": 4, "alleg": 4, "magnitud": 4, "assert": 4, "royalti": 4, "vigor": 4, "defend": 4, "court": [4, 5], "internation": 4, "plaintiff": 4, "injunct": 4, "relief": 4, "nregardless": 4, "merit": 4, "recognit": [4, 5, 6], "settl": 4, "uncertain": 4, "disgorg": 4, "remedi": [4, 6], "worldwid": 4, "antitrust": 4, "bill": 4, "commerc": 4, "mobil": [4, 5, 7], "televis": 4, "film": 4, "anticorrupt": 4, "cash": 4, "repatri": 4, "launder": 4, "tax": 4, "wast": 4, "recycl": 4, "ncomplianc": 4, "impos": [4, 5, 6, 7], "agent": [4, 5, 6], "nregulatori": 4, "ban": [4, 6], "nexpect": 4, "increasingli": [4, 5, 6, 7], "greenhous": 4, "ga": 4, "emiss": 4, "civil": 4, "disagre": 4, "perceiv": 4, "feder": 4, "nfrom": 4, "noncompli": 4, "individu": [4, 5, 6], "lawsuit": [4, 5], "monopol": 4, "nfurther": 4, "earn": 4, "search": [4, 5, 6], "nthere": 4, "retent": 4, "transfer": 4, "pass": [4, 5, 6, 7], "pend": 4, "inquiri": [4, 6], "government": 4, "entiti": [4, 5, 6, 7], "biometr": 4, "notif": 4, "permit": [4, 5, 7], "healthcar": [4, 5], "liabl": 4, "investigatori": 4, "cardhold": 4, "compress": [4, 5], "acquir": 4, "extent": [4, 6], "unexpect": [4, 6, 7], "dollar": [4, 5], "denomin": 4, "offset": 4, "strengthen": [4, 6], "nconvers": 4, "thu": 4, "hedg": 4, "deterior": 4, "sovereign": 4, "heighten": [4, 6], "worsen": 4, "A": [4, 5, 6, 7], "collater": 4, "bank": 4, "unsecur": 4, "subassembli": 4, "assembl": 4, "legisl": 4, "ireland": [4, 6], "singapor": 4, "organis": 4, "statutori": 4, "valuat": 4, "defer": 4, "bodi": [4, 6], "adequaci": 4, "ow": 4, "ngener": 4, "volum": [4, 5, 6], "repurchas": 4, "dividend": 4, "consumm": 4, "declar": 4, "board": [4, 6], "unresolv": 4, "nnone": 4, "threat": [4, 6], "postur": 4, "25": [4, 5, 6], "2016": 4, "coordin": [4, 6], "track": [4, 6], "committe": [4, 6], "oversight": [4, 6], "counsel": 4, "chair": 4, "headquart": 4, "cupertino": [4, 7], "center": [4, 6, 7], "formal": [4, 6, 7], "conclud": [4, 5], "uninstal": 4, "web": [4, 5, 6], "browser": 4, "june": 4, "contractu": 4, "desist": 4, "stai": [4, 5], "grant": 4, "ndepart": 4, "justic": 4, "depart": [4, 6], "doj": 4, "district": 4, "attornei": 4, "jersei": 4, "redress": [4, 6], "anticompetit": 4, "nonmonetari": 4, "defens": [4, 6], "nepic": 4, "epic": 4, "northern": 4, "unfair": [4, 6], "enjoin": 4, "extern": [4, 6], "link": 4, "januari": 4, "motion": 4, "oppos": [4, 6], "30": [4, 5, 6], "vacat": 4, "fourth": 4, "mine": 4, "nnot": 4, "aapl": 4, "nholder": 4, "na": [4, 6], "301": 4, "npurchas": 4, "nshare": 4, "nperiod": 4, "ttotal": 4, "taverag": 4, "npaid": 4, "nannounc": 4, "napproxim": 4, "That": [4, 6, 7], "Be": [4, 5, 6], "nunder": 4, "njune": 4, "august": [4, 6], "nopen": 4, "negoti": [4, 6], "t35": 4, "697": 4, "t224": 4, "naugust": 4, "31": [4, 5], "t42": 4, "910": 4, "t221": 4, "39": [4, 5], "nseptemb": 4, "t33": 4, "653": 4, "t222": 4, "86": [4, 5], "ntotal": [4, 6], "t112": 4, "260": 4, "t89": 4, "074": 4, "110": 4, "billion": [4, 5], "10b5": 4, "graph": 4, "cumul": 4, "reinvest": 4, "dow": 4, "supersector": 4, "27": [4, 6], "2019": 4, "n2218": 4, "tseptemb": 4, "t100": 4, "t207": 4, "t273": 4, "t281": 4, "t322": 4, "t430": 4, "t113": 4, "t156": 4, "t131": 4, "t155": 4, "t210": 4, "ndow": 4, "t146": 4, "t216": 4, "t215": 4, "nfirst": 4, "nsecond": 4, "nthird": 4, "sequoia": 4, "nfourth": 4, "plu": [4, 5], "nfiscal": 4, "six": 4, "realign": 4, "span": [4, 5, 6], "indirectli": 4, "n2024": 4, "tchang": 4, "t2023": 4, "t2022": 4, "namerica": 4, "t167": 4, "045": 4, "t3": 4, "t162": 4, "560": 4, "t169": 4, "658": 4, "neurop": 4, "t101": 4, "328": 4, "t7": 4, "294": 4, "t95": 4, "118": 4, "ngreater": 4, "t66": 4, "952": 4, "t72": 4, "559": 4, "t74": 4, "njapan": 4, "t25": 4, "052": 4, "t24": 4, "257": 4, "977": 4, "nrest": 4, "t30": 4, "t4": 4, "t29": 4, "615": 4, "t1": 4, "t391": 4, "035": 4, "t2": 4, "t383": 4, "285": 4, "t394": 4, "weak": [4, 6], "renminbi": 4, "yen": [4, 7], "t201": 4, "183": 4, "t200": 4, "583": 4, "t205": 4, "489": 4, "984": 4, "357": 4, "t40": 4, "177": [4, 6], "t26": 4, "694": 4, "t28": 4, "300": 4, "292": 4, "t37": 4, "005": 4, "t39": 4, "845": [4, 6], "t41": 4, "241": 4, "n96": 4, "169": 4, "t13": 4, "t85": 4, "t9": 4, "t78": 4, "129": [4, 6], "amort": 4, "bundl": 4, "flat": 4, "ngross": 4, "t109": 4, "633": 4, "t108": 4, "803": 4, "t114": 4, "728": 4, "t71": 4, "t60": 4, "345": 4, "t56": 4, "054": 4, "t180": 4, "683": 4, "148": 4, "t170": 4, "782": 4, "t36": 4, "t73": 4, "t70": 4, "t46": 4, "t44": 4, "t43": 4, "noper": 4, "t31": 4, "370": 4, "t5": 4, "915": 4, "t14": 4, "251": 4, "npercentag": 4, "t8": 4, "nsell": 4, "administr": 4, "097": 4, "932": 4, "094": 4, "t6": 4, "t57": 4, "467": 4, "t54": 4, "847": 4, "t51": 4, "t15": 4, "headcount": 4, "nprovis": 4, "749": 4, "t16": 4, "741": 4, "t19": 4, "neffect": 4, "nstatutori": 4, "t21": 4, "aid": [4, 6], "nliquid": 4, "unrestrict": 4, "140": 4, "ndebt": 4, "97": [4, 6], "payabl": 4, "promissori": 4, "nleas": 4, "space": [4, 5, 6], "nmanufactur": 4, "noncancel": 4, "ndeem": 4, "tcja": 4, "nstate": 4, "fund": [4, 5], "escrow": 4, "ncapit": 4, "95": [4, 6], "nrecent": 4, "pronounc": 4, "nincom": 4, "fasb": 4, "asu": 4, "09": [4, 6], "740": 4, "reconcili": 4, "reconcil": [4, 7], "disaggreg": 4, "prospect": 4, "novemb": [4, 6], "07": [4, 6, 7], "280": 4, "maker": 4, "codm": 4, "alloc": [4, 5, 6], "retrospect": 4, "ncritic": 4, "conform": [4, 7], "gaap": 4, "nuncertain": 4, "domest": 4, "taxat": 4, "resolut": 4, "conting": 4, "26": [4, 5], "ninterest": 4, "forth": 4, "hypothet": 4, "nsensit": 4, "nhypothet": 4, "nrate": 4, "npotenti": 4, "n100": 4, "tenor": 4, "ndeclin": 4, "755": 4, "089": 4, "nterm": 4, "nincreas": 4, "t139": 4, "t194": 4, "nforeign": 4, "var": 4, "mont": 4, "carlo": 4, "interv": 4, "538": 4, "669": 4, "nindex": 4, "tpage": 4, "nconsolid": 4, "n29": 4, "n30": 4, "sheet": 4, "n31": 4, "n32": 4, "n33": 4, "nnote": 4, "n34": 4, "nreport": 4, "n48": 4, "nall": 4, "omit": [4, 7], "submiss": 4, "nyear": 4, "n2023": 4, "n2022": 4, "nnet": 4, "t294": 4, "866": 4, "t298": 4, "085": 4, "t316": 4, "199": 4, "t96": 4, "ncost": 4, "t185": 4, "233": 4, "t189": 4, "282": 4, "471": 4, "119": 4, "855": 4, "t22": 4, "075": 4, "352": 4, "t214": 4, "137": 4, "t223": 4, "546": 4, "t123": 4, "216": 4, "t119": 4, "437": 4, "t269": 4, "565": 4, "334": 4, "485": 4, "736": 4, "103": 4, "t93": 4, "995": 4, "t99": 4, "nearn": 4, "nbasic": 4, "ndilut": 4, "08": [4, 5, 7], "343": [4, 6], "783": 4, "744": 4, "215": 4, "963": 4, "095": 4, "812": 4, "547": 4, "325": 4, "819": 4, "nsee": 4, "translat": [4, 5, 6], "t395": 4, "765": 4, "511": 4, "unreal": 4, "832": 4, "t323": 4, "212": 4, "nadjust": 4, "337": 4, "717": 4, "394": 4, "138": 4, "850": 4, "563": 4, "104": 4, "t204": 4, "t253": 4, "816": 4, "899": 4, "272": 4, "t98": 4, "016": 4, "652": 4, "t88": 4, "531": 4, "nasset": 4, "ncurrent": 4, "ncash": 4, "943": 4, "965": 4, "228": 4, "590": 4, "naccount": 4, "410": 4, "508": 4, "nvendor": 4, "t32": 4, "833": 4, "477": 4, "ninventori": 4, "286": 4, "331": 4, "287": 4, "695": 4, "t152": 4, "987": 4, "t143": 4, "566": 4, "t91": 4, "479": 4, "544": 4, "t45": 4, "680": 4, "715": 4, "834": 4, "t64": 4, "758": 4, "t211": 4, "993": 4, "t209": 4, "017": 4, "t364": 4, "980": [4, 6], "t352": 4, "nliabil": 4, "t68": 4, "960": 4, "t62": 4, "611": 4, "304": 4, "t58": 4, "829": 4, "ndefer": 4, "249": 4, "061": 4, "ncommerci": 4, "967": 4, "985": 4, "t10": 4, "912": 4, "822": 4, "t176": 4, "392": 4, "t145": 4, "308": 4, "750": 4, "888": 4, "t49": 4, "848": 4, "638": 4, "t308": 4, "030": [4, 5], "t290": 4, "ncommit": 4, "nsharehold": 4, "400": 4, "116": 4, "786": 4, "550": 4, "n83": 4, "276": 4, "naccumul": 4, "deficit": 4, "154": 4, "214": 4, "172": 4, "452": 4, "950": 4, "146": [4, 6], "t50": 4, "672": 4, "t63": 4, "090": 4, "nbegin": 4, "849": 4, "365": 4, "423": 4, "346": 4, "175": 4, "withheld": 4, "settlement": 4, "521": 4, "971": 4, "t12": 4, "034": 4, "t11": 4, "nend": 4, "t83": 4, "nretain": 4, "068": 4, "562": 4, "ndividend": 4, "218": 4, "793": 4, "612": 4, "099": 4, "454": 4, "846": 4, "77": [4, 5], "046": 4, "186": 4, "109": 4, "t163": 4, "rsu": 4, "t0": 4, "98": [4, 5], "94": [4, 5, 6], "32": [4, 5], "737": 4, "929": 4, "ndepreci": 4, "445": 4, "519": 4, "688": 4, "038": 4, "266": 4, "227": 4, "006": 4, "788": 4, "356": 4, "271": 4, "520": 4, "618": 4, "484": 4, "731": 4, "684": 4, "499": 4, "020": 4, "889": 4, "448": 4, "552": 4, "031": 4, "t118": 4, "254": 4, "t110": 4, "543": 4, "t122": 4, "151": 4, "48": [4, 5], "656": 4, "513": 4, "76": [4, 6], "923": 4, "nproce": 4, "211": 4, "686": 4, "917": 4, "135": 4, "828": 4, "446": 4, "447": 4, "959": 4, "708": 4, "086": 4, "935": 4, "705": 4, "354": 4, "nfinanc": 4, "441": 4, "431": 4, "223": [4, 6], "234": [4, 6], "025": 4, "841": 4, "nrepurchas": 4, "949": 4, "89": [4, 6], "402": 4, "465": 4, "nrepay": 4, "958": 4, "repay": 4, "978": 4, "955": 4, "361": 4, "581": 4, "160": 4, "121": 4, "983": 4, "488": 4, "794": 4, "760": 4, "nsupplement": 4, "102": 4, "t18": 4, "679": 4, "573": 4, "33": [4, 5, 6], "nbasi": 4, "prior": [4, 6], "reclassifi": 4, "nrevenu": 4, "remit": [4, 6], "straight": 4, "vest": 4, "sold": 4, "nderiv": 4, "nonleas": 4, "34": [4, 6], "entitl": 4, "commenc": 4, "deliveri": 4, "stand": 4, "ssp": 4, "icloud": 4, "siri": 4, "discount": 4, "undeliv": 4, "unbil": 4, "n26": 4, "n37": 4, "proport": [4, 5], "moder": [4, 5], "64": [4, 5, 6], "dilut": 4, "nnumer": 4, "ndenomin": 4, "nweight": 4, "312": 4, "316": 4, "856": 4, "antidilut": 4, "tunreal": 4, "ngain": 4, "tfair": 4, "nvalu": 4, "tcash": 4, "nequival": 4, "tcurrent": 4, "tnon": 4, "t27": 4, "nlevel": 4, "nmonei": 4, "t778": 4, "nmutual": 4, "n515": 4, "t105": 4, "t617": 4, "nsubtot": 4, "293": 4, "395": 4, "nu": 4, "treasuri": 4, "516": 4, "t212": 4, "087": 4, "380": 4, "159": 4, "t703": 4, "t17": 4, "568": 4, "158": 4, "810": 4, "ncertif": 4, "deposit": 4, "t873": 4, "t387": 4, "t478": 4, "066": 4, "ncorpor": 4, "t65": 4, "622": 4, "t270": 4, "953": 4, "939": 4, "027": 4, "t47": 4, "886": 4, "nmunicip": 4, "t412": 4, "t405": 4, "t190": 4, "nmortgag": 4, "595": 4, "t175": 4, "403": 4, "t23": 4, "367": 4, "278": [4, 6], "t132": 4, "t583": 4, "635": 4, "t128": 4, "056": 4, "966": 4, "t34": 4, "t160": 4, "t688": 4, "650": 4, "36": [4, 5, 6], "359": [4, 6], "t481": 4, "n442": 4, "t428": 4, "t923": 4, "t909": 4, "406": 4, "114": 4, "468": 4, "136": 4, "t271": 4, "533": 4, "048": [4, 5], "491": 4, "332": 4, "t320": 4, "t608": 4, "t76": 4, "840": 4, "956": 4, "890": 4, "t20": 4, "627": 4, "243": 4, "t628": 4, "t602": 4, "t192": 4, "t410": 4, "735": 4, "636": 4, "t344": 4, "t144": 4, "470": 4, "657": 4, "831": 4, "125": 4, "162": 4, "t173": 4, "752": 4, "corrobor": 4, "mortgag": 4, "classifi": [4, 6], "37": [4, 6], "swap": 4, "remeasur": 4, "notion": 4, "069": 4, "730": 4, "575": 4, "493": 4, "t104": 4, "777": 4, "nhedg": 4, "433": 4, "505": 4, "247": [4, 6], "ntrade": 4, "41": [4, 5, 6], "44": [4, 6], "depreci": 4, "nland": 4, "690": 4, "nmachineri": 4, "t80": 4, "205": [4, 5], "314": 4, "nleasehold": 4, "839": 4, "599": 4, "73": [4, 5, 6], "70": [4, 5], "884": 4, "852": 4, "t55": 4, "906": 4, "601": 4, "703": 4, "010": 4, "457": 4, "634": 4, "391": 4, "neuropean": 4, "opinion": [4, 6], "1991": 4, "2007": 4, "irish": 4, "branch": 4, "2003": 4, "2014": 4, "2015": 4, "minist": 4, "juli": [4, 6], "annul": 4, "ecj": 4, "hear": 4, "asid": 4, "confirm": 4, "unrecogn": 4, "nfeder": 4, "571": 4, "080": 4, "644": 4, "265": 4, "801": 4, "726": 4, "570": 4, "298": 4, "49": [4, 6], "t84": 4, "428": 4, "603": 4, "483": [4, 6], "t347": 4, "t669": 4, "076": 4, "830": 4, "419": 4, "072": 4, "pretax": 4, "72": [4, 6], "71": 4, "ncomput": 4, "885": 4, "012": 4, "124": 4, "518": 4, "nimpact": 4, "246": 4, "311": 4, "366": 4, "397": 4, "nexcess": 4, "893": 4, "871": 4, "192": [4, 6], "739": 4, "ntax": 4, "carryforward": 4, "302": 4, "naccru": 4, "413": [4, 6], "421": 4, "nunreal": 4, "173": 4, "168": 4, "873": 4, "743": 4, "nless": 4, "374": 4, "007": 4, "369": 4, "551": 4, "998": 4, "nright": 4, "179": 4, "nminimum": 4, "674": 4, "940": 4, "t511": 4, "t455": 4, "t490": 4, "805": 4, "202": 4, "indefinit": 4, "temporari": 4, "727": 4, "044": 4, "284": 4, "ndecreas": 4, "386": 4, "463": 4, "982": 4, "542": 4, "936": 4, "070": 4, "expir": 4, "statut": 4, "229": 4, "494": 4, "closur": 4, "intercompani": 4, "exceed": [4, 6], "multiyear": 4, "exercis": 4, "noncash": 4, "rou": 4, "tfinanci": 4, "t2024": 4, "tother": 4, "661": 4, "tproperti": 4, "015": 4, "303": 4, "676": 4, "t165": 4, "t752": 4, "t859": 4, "430": 4, "842": [4, 6], "tfinanc": 4, "n2025": 4, "820": 4, "t171": 4, "991": 4, "n2026": 4, "914": 4, "n2027": 4, "t59": 4, "733": 4, "n2028": 4, "360": 4, "t38": 4, "398": 4, "n2029": 4, "187": 4, "nthereaft": 4, "t837": 4, "undiscount": 4, "790": 4, "imput": 4, "376": 4, "534": 4, "t896": 4, "borrow": 4, "proce": 4, "nine": [4, 6], "nmatur": 4, "333": 4, "264": 4, "948": 4, "645": 4, "309": 4, "arrear": 4, "namount": 4, "n2013": 4, "nfix": 4, "2062": 4, "t97": 4, "341": 4, "03": 4, "65": [4, 6], "t106": 4, "572": 4, "n97": 4, "nunamort": 4, "premium": 4, "321": 4, "358": 4, "113": 4, "662": 4, "930": 4, "342": 4, "800": 4, "180": 4, "88": 4, "ndure": 4, "425": 4, "426": 4, "372": 4, "589": 4, "055": 4, "appreci": 4, "four": [4, 5, 6], "holder": [4, 5], "n2014": 4, "bonu": 4, "nrestrict": 4, "nnumber": 4, "nrsu": 4, "ngrant": 4, "naggreg": 4, "nfair": 4, "nbalanc": 4, "t240": 4, "427": [4, 6], "t75": 4, "t150": 4, "861": 4, "501": 4, "768": 4, "87": [4, 5, 6], "101": [4, 6], "878": 4, "144": 4, "t127": 4, "t135": 4, "91": [4, 6], "456": 4, "78": [4, 5, 6], "59": [4, 6], "t140": 4, "326": 4, "t158": 4, "204": 4, "350": 4, "002": [4, 5], "nuncondit": 4, "uncondit": 4, "206": 4, "440": 4, "156": 4, "t633": 4, "t670": 4, "226": 4, "45": 4, "nconting": 4, "accrual": 4, "nconcentr": 4, "attribut": [4, 5, 6, 7], "46": 4, "t67": 4, "098": 4, "082": 4, "062": 4, "569": 4, "895": 4, "458": 4, "207": 4, "nonrecur": 4, "t142": 4, "196": 4, "t138": 4, "t147": 4, "859": 4, "nchina": 4, "n66": 4, "t181": 4, "887": 4, "t172": 4, "269": 4, "nlong": 4, "664": 4, "797": 4, "778": 4, "219": 4, "47": [4, 5, 6], "nopinion": 4, "nwe": 4, "fairli": 4, "pcaob": 4, "sponsor": 4, "treadwai": 4, "2013": 4, "unqualifi": 4, "thereon": 4, "nthese": 4, "misstat": 4, "fraud": [4, 6], "ndescript": 4, "naudit": 4, "nhow": 4, "nmatter": 4, "qualifi": 4, "letter": 4, "advisor": 4, "ernst": 4, "llp": 4, "auditor": 4, "2009": 4, "nsan": 4, "jose": 4, "nnovemb": 4, "coso": 4, "nour": 4, "ndefinit": 4, "mainten": [4, 5, 6], "disposit": 4, "receipt": 4, "nevalu": 4, "nbase": 4, "supervis": [4, 5, 6, 7], "13a": 4, "15d": 4, "ninher": 4, "met": [4, 6], "paragraph": 4, "51": [4, 6, 7], "ninsid": 4, "deirdr": 4, "brien": 4, "vice": 4, "presid": 4, "affirm": 4, "april": 4, "withhold": 4, "remitt": 4, "mr": 4, "copi": 4, "solicit": 4, "00042": 4, "nincorpor": 4, "texhibit": 4, "descript": [4, 5, 6, 7], "tform": 4, "tfile": 4, "nrestat": 4, "namend": 4, "bylaw": 4, "nindentur": 4, "york": [4, 5, 7], "mellon": 4, "truste": 4, "noffic": 4, "certif": 4, "2018": 4, "85": [4, 5, 6], "2043": 4, "05": 4, "2044": 4, "februari": 4, "55": [4, 5], "2045": 4, "900": 4, "700": [4, 5], "60": [4, 5, 6], "250": [4, 6], "2036": 4, "2046": 4, "450": 4, "2047": 4, "2049": 4, "2030": 4, "2050": 4, "2060": 4, "2028": 4, "2041": 4, "2051": 4, "2061": 4, "2032": 4, "2052": 4, "54": 4, "2033": 4, "2053": 4, "ceo": 4, "n12": 4, "nsubsidiari": 4, "n23": 4, "nconsent": 4, "n24": 4, "npower": 4, "signatur": 4, "nrule": 4, "nsection": 4, "1350": 4, "n101": 4, "ninlin": 4, "xbrl": 4, "n104": 4, "inlin": 4, "compensatori": 4, "herewith": 4, "furnish": 4, "herebi": 4, "undertak": 4, "56": [4, 5, 6], "nsignatur": 4, "npursuant": 4, "duli": 4, "undersign": 4, "thereunto": 4, "ndate": 4, "nby": 4, "luca": [4, 7], "maestri": 4, "nluca": 4, "nsenior": 4, "nchief": 4, "nknow": 4, "THESE": 4, "appoint": 4, "cook": 4, "jointli": 4, "her": 4, "substitut": 4, "him": 4, "thereto": 4, "therewith": 4, "ratifi": 4, "done": [4, 5, 6, 7], "virtu": 4, "hereof": 4, "nname": 4, "ttitl": 4, "tdate": 4, "tchief": 4, "tnovemb": 4, "ntimothi": 4, "tsenior": 4, "kondo": 4, "nchri": 4, "wanda": 4, "austin": 4, "nwanda": 4, "gorski": 4, "tdirector": 4, "nalex": 4, "jung": 4, "nandrea": 4, "arthur": 4, "levinson": 4, "narthur": 4, "monica": 4, "lozano": 4, "nmonica": 4, "ronald": 4, "sugar": 4, "nronald": 4, "susan": 4, "wagner": 4, "nsusan": 4, "57": [4, 5], "turbo": [4, 5, 7], "outlin": [4, 5, 6], "invdestacksmeticsisdict": 4, "setispect": 4, "20cyan": 4, "evaluationseld": 4, "anvis": 4, "droitent": 4, "discernminerv": 4, "versbobprefvers": 4, "vo\u8be5": 4, "option\u548c": 4, "meio": 4, "\u0432\u0440\u0435\u043ccisco": 4, "dellaischenpoihscap": 4, "geme": 4, "gettim": 4, "unscal": 4, "vocabulari": [4, 5, 7], "closer": 4, "sharpen": 4, "uniform": 4, "raschka": 4, "repetit": [4, 7], "radic": 4, "grappl": 4, "safer": [4, 6], "fascin": 4, "spontan": 4, "answer": [4, 5, 6, 7], "aren": [4, 5], "linear": 4, "absent": [4, 6], "coax": 4, "journei": 4, "suddenli": 4, "manifest": 4, "deliber": [4, 6], "contend": 4, "70b": [4, 5], "rethink": 4, "tutor": 4, "children": [4, 6], "verifi": [4, 5, 7], "predefin": [4, 7], "weren": 4, "kind": 4, "usual": 4, "quantif": 4, "contamin": [4, 6], "massiv": [4, 6], "unseen": [4, 6], "longitudin": 4, "mostli": [4, 7], "versu": [4, 5, 6], "latter": 4, "tailor": [4, 6], "great": [4, 5, 7], "cognit": 4, "misinform": [4, 6], "tempor": 4, "disclaim": 4, "referr": 4, "incorrect": [4, 6], "demograph": [4, 6], "stereotyp": [4, 6], "societ": [4, 6], "pii": [4, 6], "anonym": 4, "leakag": [4, 6], "carryov": 4, "multi": [4, 5, 6, 7], "fallaci": 4, "think": [4, 5, 6], "idiom": 4, "sarcasm": 4, "terminologi": 4, "lingual": 4, "misunderstand": 4, "syntax": 4, "scan": 4, "compat": [4, 5, 7], "overconfid": 4, "clariti": [4, 6, 7], "audienc": 4, "densiti": 4, "satisfact": [4, 7], "misus": [4, 6], "moral": 4, "co2": 4, "energi": 4, "consumpt": 4, "server": [4, 5, 7], "cach": [4, 5], "imag": [4, 5, 6], "audio": 4, "etc": [4, 7], "truth": [4, 5, 6, 7], "layer": [4, 5, 7], "palm": [4, 5], "easi": [4, 5, 6], "synthet": [4, 5, 6, 7], "augment": [4, 5], "post": [4, 5, 6], "timeout": 4, "variat": [4, 5], "inter": 4, "rater": 4, "ti": 4, "holist": [4, 6], "fast": [4, 5, 6, 7], "experiment": [4, 5, 7], "vi": 4, "categor": [4, 5, 6, 7], "intrins": [4, 5], "extrins": 4, "sequenc": [4, 5, 7], "perplex": [4, 5], "downstream": [4, 7], "synthesi": 4, "discret": 4, "prefix": [4, 6], "roug": 4, "bleu": 4, "bilingu": 4, "understudi": 4, "overlap": 4, "favor": [4, 5, 7], "breviti": 4, "insensit": 4, "semant": [4, 7], "orient": [4, 6], "gist": 4, "meteor": 4, "synonym": 4, "stem": [4, 7], "paraphras": 4, "alongsid": [4, 6], "computation": 4, "cider": 4, "consensu": 4, "tf": 4, "idf": 4, "caption": 4, "reliant": 4, "corpu": [4, 5], "ter": 4, "edit": [4, 6], "hypothesi": 4, "penal": 4, "bertscor": 4, "contextu": [4, 6], "bert": 4, "spice": 4, "proposit": [4, 5], "scene": [4, 6], "pure": [4, 5], "analyst": 4, "rouge_1": 4, "rouge_2": 4, "ideal": [4, 5, 6, 7], "cheaper": 4, "setup": [4, 5, 6, 7], "evaluate_summari": 4, "unigram": 4, "bigram": 4, "absl": 4, "py": 4, "rouge_scor": 4, "generated_summari": 4, "reference_summari": 4, "google_bleu": 4, "bleu_scor": 4, "rouge1": 4, "rouge2": 4, "arbitrari": 4, "chosen": [4, 6], "sentence1": 4, "cat": [4, 6], "sat": 4, "mat": 4, "sentence2": 4, "ate": 4, "3333333333333333": 4, "7272727272727272": 4, "4444444444444445": 4, "generate_summari": 4, "summir": 4, "liner": 4, "evaluate_summary_model": 4, "model_benchmark": 4, "models_test": 4, "benchmark_summari": 4, "model_summari": 4, "evaluation_result": 4, "statu": 4, "concis": [4, 5], "element": [4, 6, 7], "verbos": [4, 5, 6, 7], "peripher": 4, "quit": [4, 5, 7], "convei": 4, "breadth": 4, "Of": [4, 5, 6], "vibe": 4, "visualize_prompt_comparison": 4, "matplotlib": 4, "radar": 4, "radar_plot": 4, "tmp": 4, "ipykernel_1652501": 4, "940173201": 4, "userwarn": 4, "figurecanvasagg": 4, "largest": [4, 5], "sarmah": 4, "granular": [4, 5], "tune": [4, 6], "likert": 4, "ensembl": 4, "repeatedli": 4, "fluenci": 4, "refin": 4, "notabl": [4, 6, 7], "integ": [4, 7], "rubric": 4, "hollist": 4, "judgeevalu": 4, "grammar": [4, 5, 7], "evaluate_with_llm": 4, "criterion": 4, "judge_model": 4, "candidate_summari": 4, "grammat": 4, "y": [4, 6, 7], "z": 4, "w": [4, 5, 6], "benchmark_model": 4, "test_model": 4, "input_text": [4, 5], "trillion": [4, 5], "evals_list": 4, "1775618912": 4, "variant": [4, 5, 6], "slightli": 4, "drift": [4, 6], "lowest": [4, 5], "firstli": 4, "overhead": [4, 5], "egocentr": 4, "tight": 4, "medicin": [4, 6], "glider": 4, "deshpand": 4, "3b": 4, "685": 4, "aplic": 4, "clearli": [4, 6, 7], "earlier": [4, 6], "depict": [4, 6, 7], "multilingu": [4, 5, 6], "golden": 4, "languang": 4, "arena": 4, "randomli": 4, "customiz": [4, 5, 6], "irrelev": 4, "unhelp": [4, 6], "occasion": 4, "rare": 4, "perfectli": 4, "cater": [4, 5], "critiqu": [4, 6], "elo": 4, "spectrum": 4, "exam": 4, "probe": [4, 6], "certifi": 4, "began": [4, 5], "glue": 4, "entail": [4, 5], "baselin": [4, 5, 6], "superglu": 4, "successor": 4, "grew": 4, "big": 4, "bench": [4, 5], "srivastava": 4, "arithmet": 4, "truthfulqa": [4, 5], "multitask": 4, "hendryck": [4, 6], "multidisciplinari": 4, "stanford": 4, "helm": 4, "multidimension": 4, "surround": [4, 5, 6, 7], "humanev": [4, 5], "lmsy": 4, "brought": 4, "dialogu": [4, 5], "chiang": 4, "gather": 4, "alpacaev": 4, "duboi": 4, "mt": 4, "argilla": 4, "mila": 4, "mit": [4, 5], "contributor": [4, 5, 7], "western": 4, "centric": 4, "divid": [4, 6], "subset": [4, 6], "agnost": 4, "dialect": 4, "render": [4, 6], "crowdsourc": 4, "livebench": 4, "white": [4, 6], "resili": [4, 6], "meaningfulli": 4, "zebralog": 4, "grid": 4, "puzzl": 4, "brailsford": 4, "1999": 4, "lsat": 4, "hous": 4, "clue": 4, "deduct": 4, "arriv": 4, "programmat": [4, 7], "2x2": 4, "6x6": 4, "shot": [4, 6, 7], "reductio": 4, "ad": [4, 5, 6, 7], "absurdum": 4, "hard": 4, "10b": 4, "counterfactu": 4, "came": 4, "arc": 4, "prize": [4, 6], "chollet": 4, "mike": [4, 6], "knoop": 4, "founder": 4, "zapier": 4, "fran\u00e7oi": 4, "creator": [4, 5], "agi": 4, "kera": 4, "genuin": 4, "possess": 4, "elementari": 4, "novelti": 4, "wouldn": 4, "interpol": 4, "synthes": 4, "fly": 4, "retriev": [4, 5], "brute": 4, "pixel": 4, "unbeaten": 4, "win": [4, 5], "poorli": 4, "recombin": 4, "spur": [4, 6], "takeawai": 4, "vertic": [4, 6], "finbench": 4, "legalbench": 4, "guha": 4, "berkelei": 4, "bfcl": 4, "patil": 4, "fourrier": 4, "bespok": 4, "sdk": 4, "autoregress": 4, "sub": [4, 5], "liter": 4, "disturb": 4, "zero": [4, 5, 6, 7], "varianc": [4, 6], "yt": 4, "ut": 4, "suppos": [4, 7], "ol": 4, "heteroscedast": 4, "regress": 4, "bivari": 4, "evaluation_track": 4, "evaluationtrack": 4, "model_config": 4, "basemodelconfig": 4, "parallelismmanag": 4, "pipelineparamet": 4, "envconfig": 4, "is_accelerate_avail": 4, "datetim": 4, "timedelta": 4, "initprocessgroupkwarg": 4, "create_evaluation_pipelin": 4, "cache_dir": 4, "float16": 4, "max_sampl": 4, "kwargs_handl": 4, "3000": 4, "save_detail": 4, "pipeline_param": 4, "launcher_typ": 4, "env_config": 4, "override_batch_s": 4, "use_chat_templ": 4, "trust_remote_cod": 4, "pipeline_paramet": 4, "schemat": 4, "vllm": [4, 7], "tgi": 4, "storag": [4, 5, 6], "num_few_shot": 4, "bar": 4, "bigbench": 4, "winogrand": 4, "hellaswag": 4, "nlp": [4, 5, 6], "save_and_push_result": 4, "show_result": 4, "model_arg": 4, "send": [4, 5, 6, 7], "serverless": 4, "inference_server_address": 4, "inference_server_auth": 4, "model_id": 4, "null": 4, "bash": [4, 5], "command": [4, 5], "model_config_path": 4, "endpoint_model": 4, "llama3": 4, "qwen2": [4, 5, 7], "smollm2": [4, 5, 7], "alibaba": [4, 5, 7], "5b": [4, 5, 7], "hui": [4, 5], "allal": [4, 5], "cluster": 4, "noteworthi": [4, 5], "grain": [4, 5, 7], "salt": [4, 7], "exponenti": 4, "modular": 4, "offici": [4, 7], "revisit": 4, "trace": 4, "langchain_tracing_v2": 4, "langchain_api_kei": 4, "hf_evalu": 4, "langsmith_evalu": 4, "ls_client": 4, "dataset_nam": 4, "create_dataset": 4, "create_exampl": 4, "dataset_id": 4, "calculate_scor": 4, "reference_output": 4, "oai_client": 4, "xp_model_nam": 4, "lastli": 4, "run_evalu": 4, "And": [4, 5, 6], "upload_result": 4, "experiment_prefix": 4, "num_repetit": 4, "386a3620": 4, "9e1cc3cb": 4, "9d6a": 4, "4356": 4, "ab34": 4, "138e0abe8be4": 4, "8741976e": 4, "5268": 4, "4b75": 4, "949f": 4, "99477dde5d64": 4, "selectedsess": 4, "b831dc1e": 4, "90bc": 4, "4ed8": 4, "8080": [4, 5], "fb42444724d6": 4, "4it": 4, "latest": [4, 5, 6, 7], "tobia": 4, "evaluate_modul": 4, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 4, "tue": 4, "nov": [4, 5], "couldn": 4, "5it": 4, "5053784e": 4, "64445871": 4, "a53c": 4, "44b1": 4, "a422": 4, "4f49b2f9656f": 4, "69": [4, 6], "4b29f3c9": 4, "9ef7e39a": 4, "2add": 4, "410c": 4, "89f8": 4, "9f1a8b198cf1": 4, "61": [4, 6], "insert": 4, "combined_df": 4, "concat": [4, 6], "ignore_index": [4, 6], "execution_tim": 4, "example_id": 4, "333333": 4, "224388": 4, "feb10f92": 4, "3167": 4, "41f3": 4, "bb1c": 4, "d271153a31a8": 4, "5b196b22": 4, "9f4c": 4, "489c": 4, "b020": 4, "7823208b42d6": 4, "348101": 4, "722464": 4, "c310f159": 4, "064a": 4, "4035": 4, "97c3": 4, "a25bbf43abc2": 4, "386076": 4, "704104": 4, "f7f24899": 4, "dd50": 4, "409e": 4, "93cc": 4, "6fb1622b60bf": 4, "443038": 4, "725059": 4, "242856d6": 4, "efb5": 4, "4101": 4, "b1cf": 4, "5805532838ac": 4, "373418": 4, "795302": 4, "ce975169": 4, "a0ab": 4, "40ce": 4, "8e32": 4, "efa28d06079d": 4, "stat": [4, 5], "groupbi": [4, 6], "agg": [4, 6], "sort": 4, "sort_valu": 4, "subplot": 4, "pyplot": 4, "plt": 4, "numpi": 4, "np": 4, "ax1": 4, "ax2": 4, "figsiz": 4, "2ecc71": 4, "3498db": 4, "e74c3c": 4, "bleu_mean": 4, "bleu_std": 4, "enumer": [4, 6], "errorbar": 4, "yerr": 4, "fmt": 4, "markers": 4, "capsiz": 4, "set_ylabel": 4, "set_titl": 4, "set_xtick": 4, "set_xticklabel": 4, "rotat": 4, "set_ylim": 4, "bottom": 4, "legend": 4, "exec_mean": 4, "exec_std": 4, "tight_layout": 4, "ndetail": 4, "4038": 4, "0453": 4, "7815": 4, "0433": 4, "3768": 4, "0424": 4, "8343": 4, "2208": 4, "3519": 4, "0775": 4, "9122": 4, "1482": 4, "377": 4, "042": 4, "078": 4, "slower": [4, 6], "04": [4, 5], "latenc": [4, 5, 6], "speed": [4, 5, 6], "interestingli": 4, "decoupl": 4, "reload": 4, "facilit": [4, 6], "promptfooconfig": 4, "model_comparison": 4, "pretti": [4, 6], "dump": 4, "default_flow_styl": 4, "sort_kei": 4, "prompt1": 4, "defaulttest": 4, "1000m": 4, "millisecond": 4, "eval_data": 4, "latency_m": 4, "totallatencym": 4, "token_usag": 4, "tokenusag": 4, "assert_pass": 4, "assertpasscount": 4, "assert_fail": 4, "assertfailcount": 4, "prompt_token": [4, 5], "num_request": 4, "numrequest": 4, "2463": 4, "000035": 4, "3773": 4, "004620": 4, "1669": 4, "000091": 4, "1669m": 4, "highest": [4, 5, 7], "3773m": 4, "00462": 4, "promptfool": 4, "manual": [4, 5, 6, 7], "redefin": 4, "prompt_comparison": 4, "prompt2": 4, "prompt3": 4, "prompt_fil": 4, "prompt_cont": 4, "BE": 4, "again": 4, "prompt_id": 4, "promptid": 4, "gradingresult": 4, "df_raw": 4, "reset_index": [4, 6], "eas": [4, 5, 6, 7], "seamless": [4, 6], "hf": [4, 5], "plain": [4, 5], "vanilla": 4, "defi": 4, "accustom": 4, "legaci": 4, "unsustain": 4, "prd": 4, "cultiv": [4, 6], "organiz": 4, "stagnat": 4, "alb": [4, 5], "loubna": [4, 5], "anton": [4, 5], "lozhkov": [4, 5], "bakouch": [4, 5], "gabriel": [4, 5, 6], "mart\u00edn": [4, 5, 6], "bl\u00e1zquez": [4, 5], "lewi": [4, 5], "tunstal": [4, 5], "agust\u00edn": [4, 5], "piquer": [4, 5], "andr": [4, 5], "marafioti": [4, 5], "cyril": [4, 5], "zakka": [4, 5], "leandro": [4, 5], "werra": [4, 5], "wolf": [4, 5], "are24": 4, "judgearena": 4, "bps99": 4, "salli": 4, "pott": 4, "barbara": 4, "557": [4, 6], "sciencedirect": 4, "s0377221798003646": 4, "doi": [4, 6, 7], "1016": 4, "s0377": 4, "2217": 4, "00364": 4, "ctj": 4, "jerri": [4, 6], "tworek": [4, 6], "heewoo": [4, 6], "jun": [4, 6], "qime": [4, 6], "henriqu": [4, 6], "pond": [4, 6], "de": [4, 6], "oliveira": [4, 6], "pinto": [4, 6], "harri": [4, 6], "yuri": 4, "burda": 4, "greg": [4, 6], "brockman": [4, 6], "raul": [4, 6], "puri": [4, 6], "gretchen": [4, 6], "krueger": [4, 6], "petrov": [4, 6], "heidi": 4, "khlaaf": 4, "girish": [4, 6], "sastri": [4, 6], "brook": [4, 6], "chan": [4, 6], "grai": [4, 6], "ryder": [4, 6], "mikhail": [4, 6], "pavlov": [4, 6], "alethea": [4, 6], "lukasz": 4, "kaiser": [4, 6], "mohammad": [4, 6], "bavarian": [4, 6], "clemen": [4, 6], "winter": [4, 6], "philipp": 4, "tillet": [4, 6], "felip": [4, 6], "petroski": [4, 6], "dave": [4, 6], "cum": [4, 6], "plappert": 4, "fotio": 4, "chantzi": [4, 6], "barn": 4, "ariel": 4, "herbert": 4, "voss": [4, 6], "hebgen": 4, "guss": 4, "nichol": 4, "paino": [4, 6], "nikola": [4, 6], "tezak": [4, 6], "babuschkin": [4, 6], "suchir": [4, 6], "balaji": [4, 6], "shantanu": [4, 6], "jain": [4, 6], "hess": [4, 6], "carr": 4, "josh": [4, 6], "achiam": [4, 6], "vedant": 4, "misra": 4, "evan": [4, 5, 6], "morikawa": [4, 6], "matthew": 4, "knight": [4, 6], "mile": [4, 6], "brundag": [4, 6], "mira": [4, 6], "murati": [4, 6], "kati": [4, 6], "mayer": [4, 6], "bob": [4, 6, 7], "mcgrew": [4, 6], "ilya": [4, 6], "sutskev": [4, 6], "wojciech": [4, 6], "zaremba": [4, 6], "2107": 4, "03374": 4, "cz": 4, "lianmin": 4, "ying": 4, "sheng": 4, "anastasio": 4, "angelopoulo": 4, "tianl": 4, "dacheng": 4, "banghua": 4, "jordan": [4, 6], "gonzalez": 4, "ion": 4, "stoica": 4, "04132": 4, "cho24a": 4, "francoi": 4, "arcpriz": 4, "cho24b": 4, "drcw": 4, "darshan": 4, "selvan": 4, "sunitha": 4, "ravi": 4, "sky": 4, "ch": 4, "bartosz": 4, "mielczarek": 4, "anand": [4, 6], "kannappan": [4, 6], "qian": [4, 6], "14140": 4, "dglh24": 4, "yann": 4, "bal\u00e1z": 4, "galambosi": 4, "tatsunori": 4, "hashimoto": 4, "debia": 4, "04475": 4, "fac24a": 4, "wiki": [4, 7], "fac24b": 4, "fac24c": 4, "model_doc": 4, "fac24d": 4, "cookbook": 4, "llm_judg": 4, "fac24f": 4, "fhwt23": 4, "cl\u00e9mentin": 4, "nathan": 4, "habib": 4, "gnh": 4, "julian": 4, "nyarko": 4, "ho": 4, "r\u00e9": 4, "adam": [4, 6], "chilton": 4, "aditya": [4, 6], "narayana": 4, "chohla": 4, "brandon": [4, 6, 7], "waldon": 4, "rockmor": 4, "diego": 4, "zambrano": 4, "dmitri": 4, "talisman": 4, "enam": 4, "hoqu": 4, "faiz": 4, "surani": 4, "frank": [4, 6], "fagan": 4, "galit": 4, "sarfati": 4, "gregori": 4, "dickinson": 4, "haggai": 4, "porat": 4, "hegland": 4, "jessica": [4, 6], "joe": [4, 6], "nudel": 4, "joel": [4, 6], "niklau": 4, "nai": 4, "jonathan": [4, 6], "choi": 4, "margaret": [4, 5], "hagan": 4, "megan": 4, "ma": [4, 6], "livermor": 4, "nikon": 4, "rasumov": 4, "rahe": 4, "nil": 4, "holzenberg": 4, "noam": 4, "kolt": 4, "henderson": 4, "rehaag": 4, "sharad": 4, "shang": 4, "spencer": 4, "sunni": 4, "gandhi": 4, "zur": 4, "varun": 4, "iyer": 4, "zehua": 4, "2308": 4, "11462": 4, "hbb": 4, "collin": 4, "burn": 4, "steven": [4, 6], "basart": [4, 6], "zou": [4, 6], "manta": [4, 6], "mazeika": [4, 6], "03300": 4, "hbd": 4, "maxwel": 4, "forb": 4, "yejin": 4, "curiou": 4, "neural": [4, 7], "degener": 4, "1904": 4, "09751": 4, "hyc": [4, 5], "binyuan": [4, 5], "zeyu": [4, 5], "cui": [4, 5], "jiaxi": [4, 5], "dayiheng": [4, 5], "tianyu": [4, 5], "jiajun": [4, 5], "kai": [4, 5, 6], "dang": [4, 5], "coder": [4, 5], "preprint": [4, 5, 7], "2409": [4, 5, 6], "12186": [4, 5], "lx": 4, "zhen": 4, "xiaohan": 4, "jia": 4, "yuxuan": 4, "lai": 4, "chongyang": 4, "shuai": 4, "nlg": 4, "07103": 4, "lbl": 4, "bommasani": 4, "toni": 4, "dimitri": 4, "tsipra": 4, "dilara": 4, "soylu": 4, "michihiro": 4, "yasunaga": 4, "yian": 4, "deepak": 4, "narayanan": 4, "yuhuai": 4, "newman": 4, "binhang": 4, "bobbi": 4, "ce": 4, "christian": [4, 6], "cosgrov": 4, "acosta": 4, "nava": [4, 6], "drew": 4, "hudson": 4, "zelikman": 4, "esin": 4, "durmu": 4, "faisal": 4, "ladhak": 4, "frieda": 4, "rong": 4, "hongyu": 4, "ren": [4, 5], "huaxiu": 4, "yao": [4, 6, 7], "jue": 4, "keshav": 4, "santhanam": 4, "laurel": 4, "lucia": 4, "mert": 4, "yuksekgonul": 4, "mirac": 4, "suzgun": 4, "niladri": 4, "chatterji": 4, "omar": 4, "khattab": 4, "chi": [4, 7], "sang": 4, "shibani": [4, 6], "santurkar": [4, 6], "surya": 4, "icard": 4, "tianyi": 4, "vishrav": 4, "chaudhari": 4, "xuechen": 4, "yuhui": 4, "yuta": 4, "koreeda": 4, "2211": 4, "09110": 4, "lbc24": 4, "ronan": 4, "bra": 4, "allenai": 4, "lhe22": [4, 5, 6], "stephani": [4, 5, 6], "owain": [4, 5, 6], "mimic": [4, 5, 6], "falsehood": [4, 5, 6], "2109": [4, 5, 6], "07958": [4, 5, 6], "pzwg23": 4, "shishir": 4, "tianjun": 4, "xin": [4, 6], "gorilla": 4, "15334": 4, "pro24": 4, "dev": 4, "ras24": 4, "sebastian": 4, "scratch": 4, "1633437166": 4, "sll": 4, "bhaskarjit": 4, "mingshu": 4, "jingrao": 4, "lyu": 4, "nathalia": 4, "castellano": 4, "pasquali": 4, "dhagash": 4, "12148": 4, "srf": 4, "shivalika": 4, "angelika": 4, "roman": [4, 6], "adelani": 4, "ngui": 4, "vila": 4, "suero": 4, "peerat": 4, "limkonchotiwat": 4, "kelli": 4, "marchisio": 4, "qi": 4, "leong": 4, "yosephin": 4, "susanto": 4, "raymond": [4, 6], "ng": [4, 6], "shayn": 4, "longpr": 4, "ko": 4, "madelin": 4, "antoin": 4, "bosselut": 4, "oh": 4, "leshem": 4, "choshen": 4, "daphn": 4, "ippolito": 4, "enzo": [4, 7], "ferrant": 4, "marzieh": 4, "fadae": 4, "beyza": 4, "ermi": 4, "sara": 4, "hooker": 4, "linguist": [4, 6], "03304": 4, "srr": 4, "aarohi": 4, "abhinav": 4, "rastogi": 4, "abhishek": 4, "rao": 4, "abu": 4, "awal": 4, "shoeb": 4, "abubakar": 4, "abid": [4, 5], "fisch": 4, "santoro": 4, "gupta": 4, "adri\u00e0": 4, "garriga": 4, "alonso": 4, "agnieszka": 4, "kluska": 4, "aitor": 4, "lewkowycz": 4, "akshat": 4, "warstadt": 4, "alexand": [4, 6, 7], "kocurek": 4, "ali": [4, 6], "safaya": 4, "tazarv": 4, "aman": 4, "hussain": 4, "dsouza": 4, "ambros": 4, "slone": 4, "ameet": 4, "rahan": 4, "anantharaman": 4, "ander": 4, "andreassen": 4, "madotto": 4, "santilli": 4, "stuhlm\u00fcller": 4, "la": 4, "lampinen": 4, "angelica": 4, "anh": 4, "vuong": 4, "animesh": 4, "gottardi": 4, "antonio": 4, "norelli": 4, "anu": 4, "venkatesh": 4, "arash": 4, "gholamidavoodi": 4, "arfa": 4, "tabassum": 4, "arul": 4, "menez": 4, "arun": [4, 6], "kirubarajan": 4, "asher": 4, "mullokandov": 4, "ashish": 4, "sabharw": 4, "herrick": 4, "avia": 4, "efrat": 4, "aykut": 4, "erdem": 4, "ayla": 4, "karaka\u015f": 4, "bao": [4, 5, 6], "loe": 4, "barret": [4, 6], "zoph": [4, 6], "bart\u0142omiej": 4, "bojanowski": 4, "batuhan": 4, "\u00f6zyurt": 4, "behnam": 4, "hedayatnia": 4, "neyshabur": 4, "inden": 4, "benno": 4, "stein": 4, "berk": 4, "ekmekci": 4, "blake": 4, "howald": 4, "bryan": 4, "orinion": 4, "diao": 4, "dour": 4, "stinson": 4, "cedrick": 4, "argueta": 4, "c\u00e9sar": 4, "ferri": 4, "ram\u00edrez": 4, "chandan": 4, "charl": 4, "rathkopf": 4, "chenlin": 4, "meng": 4, "chitta": 4, "baral": 4, "chiyu": 4, "callison": 4, "burch": 4, "wait": [4, 6], "voigt": 4, "cindi": 4, "ramirez": 4, "clara": 4, "rivera": 4, "clemencia": 4, "siro": 4, "colin": [4, 5], "raffel": [4, 5], "courtnei": 4, "ashcraft": 4, "cristina": 4, "garbacea": 4, "damien": [4, 6], "sileo": 4, "garrett": 4, "kilman": 4, "freeman": 4, "khashabi": 4, "levi": [4, 6], "mosegu\u00ed": 4, "gonz\u00e1lez": 4, "perszyk": 4, "danqi": 4, "dar": 4, "gilboa": 4, "dohan": [4, 6], "drakard": 4, "jurgen": 4, "debajyoti": 4, "datta": 4, "deni": 4, "emelin": 4, "kleyko": 4, "deniz": 4, "yuret": 4, "derek": [4, 6], "tam": [4, 7], "dieuwk": 4, "hupk": 4, "diganta": 4, "dilyar": 4, "buzan": 4, "coelho": 4, "mollo": 4, "diyi": 4, "dylan": 4, "schrader": 4, "ekaterina": 4, "shutova": 4, "ekin": 4, "dogu": 4, "cubuk": 4, "elad": 4, "segal": 4, "eleanor": 4, "hagerman": 4, "donowai": 4, "elli": 4, "pavlick": 4, "rodola": 4, "emma": 4, "lam": 4, "chu": [4, 6], "erkut": 4, "erni": 4, "dyer": 4, "jerzak": 4, "eunic": 4, "engefu": 4, "manyasi": 4, "evgenii": 4, "zheltonozhskii": 4, "fanyu": 4, "xia": [4, 5], "fatemeh": 4, "siar": 4, "fernando": 4, "mart\u00ednez": 4, "plume": 4, "francesca": 4, "happ\u00e9": 4, "gaurav": 4, "genta": 4, "indra": 4, "winata": 4, "gerard": 4, "melo": 4, "germ\u00e1n": 4, "kruszewski": 4, "giambattista": [4, 6], "parascandolo": [4, 6], "giorgio": 4, "mariani": 4, "gloria": 4, "gonzalo": 4, "jaimovitch": 4, "l\u00f3pez": 4, "gregor": 4, "betz": 4, "gui": [4, 5], "gur": 4, "hana": 4, "galijasev": 4, "rashkin": 4, "hannaneh": 4, "hajishirzi": 4, "harsh": 4, "hayden": 4, "bogar": 4, "henri": [4, 6], "shevlin": 4, "hinrich": 4, "sch\u00fctze": 4, "hiromu": 4, "yakura": 4, "hongm": 4, "hugh": 4, "mee": 4, "wong": [4, 6], "isaac": 4, "nobl": 4, "jaap": 4, "jumelet": 4, "geissing": 4, "jaehoon": 4, "jaim": 4, "fern\u00e1ndez": 4, "fisac": 4, "simon": 4, "koppel": 4, "koco\u0144": 4, "jana": 4, "thompson": [4, 5, 6], "janel": 4, "wingfield": 4, "jarema": 4, "radom": 4, "jascha": 4, "sohl": [4, 6], "dickstein": 4, "phang": 4, "yosinski": 4, "jekaterina": 4, "novikova": 4, "jell": 4, "bosscher": 4, "jennif": 4, "marsh": 4, "jeroen": 4, "taal": 4, "engel": 4, "jesujoba": 4, "alabi": 4, "jiam": 4, "jillian": 4, "joan": 4, "waweru": 4, "burden": 4, "bali": 4, "batcheld": 4, "berant": 4, "j\u00f6rg": 4, "frohberg": 4, "jo": 4, "rozen": 4, "orallo": 4, "boudeman": 4, "guerr": 4, "tenenbaum": 4, "joyc": 4, "chua": 4, "kanclerz": 4, "karen": 4, "livescu": 4, "karl": 4, "krauth": 4, "karthik": 4, "gopalakrishnan": 4, "katerina": 4, "ignatyeva": 4, "katja": 4, "markert": 4, "kaustubh": 4, "dhole": 4, "gimpel": 4, "omondi": 4, "kori": 4, "mathewson": 4, "kristen": 4, "chiafullo": 4, "ksenia": 4, "shkaruta": 4, "shridhar": 4, "kyle": [4, 6], "mcdonel": 4, "richardson": 4, "laria": 4, "reynold": 4, "leo": [4, 6], "dugan": 4, "lianhui": 4, "lidia": 4, "contrera": 4, "ochando": 4, "morenc": 4, "moschella": 4, "luci": 4, "ludwig": 4, "schmidt": [4, 6], "luheng": 4, "olivero": 4, "col\u00f3n": 4, "metz": [4, 6], "l\u00fctfi": 4, "kerem": 4, "\u015fenel": 4, "maarten": [4, 6], "bosma": 4, "sap": [4, 6], "maartj": 4, "hoev": 4, "maheen": 4, "farooqi": 4, "manaal": 4, "faruqui": 4, "marco": 4, "baturan": 4, "marelli": 4, "maru": 4, "maria": 4, "quintana": 4, "tolkiehn": 4, "mario": [4, 6], "giulianelli": 4, "martha": 4, "potthast": 4, "leavitt": 4, "hagen": 4, "m\u00e1ty\u00e1": 4, "schubert": 4, "medina": [4, 6], "orduna": 4, "baitemirova": 4, "melodi": 4, "arnaud": 4, "melvin": 4, "mcelrath": 4, "yee": 4, "cohen": 4, "ivanitskii": 4, "starritt": 4, "strube": 4, "micha\u0142": 4, "sw\u0119drowski": 4, "michel": [4, 6], "bevilacqua": 4, "mihir": 4, "kale": 4, "cain": 4, "mime": 4, "mitch": 4, "walker": 4, "mo": 4, "tiwari": 4, "mohit": 4, "bansal": 4, "moin": 4, "aminnaseri": 4, "mor": 4, "geva": 4, "mozhdeh": 4, "gheini": 4, "mukund": 4, "varma": 4, "nanyun": 4, "peng": [4, 6], "nayeon": 4, "neta": 4, "krakov": 4, "doiron": 4, "nicol": 4, "martinez": 4, "nikita": 4, "nangia": 4, "nikla": 4, "decker": 4, "muennighoff": 4, "nitish": [4, 6], "shirish": [4, 6], "keskar": [4, 6], "niveditha": 4, "constant": 4, "fiedel": 4, "nuan": 4, "wen": 4, "oliv": [4, 6], "agha": 4, "elbaghdadi": 4, "omer": 4, "moreno": 4, "casar": 4, "parth": 4, "doshi": 4, "pascal": 4, "fung": 4, "pu": 4, "vicol": 4, "pegah": 4, "alipoormolabashi": 4, "peiyuan": 4, "eckerslei": 4, "phu": 4, "mon": 4, "htut": 4, "pinyu": 4, "hwang": 4, "piotr": 4, "mi\u0142kowski": 4, "piyush": 4, "pouya": 4, "pezeshkpour": 4, "priti": 4, "oli": 4, "qiaozhu": 4, "qing": 4, "qinlang": 4, "rabin": 4, "banjad": 4, "rachel": [4, 6], "etta": 4, "rudolph": 4, "raefer": 4, "rahel": 4, "haback": 4, "ramon": 4, "risco": 4, "rapha\u00ebl": 4, "milli\u00e8r": 4, "rhythm": 4, "garg": [4, 5], "rif": 4, "saurou": 4, "riku": 4, "arakawa": 4, "robb": 4, "raymaek": 4, "rohan": 4, "sikand": 4, "novak": 4, "sitelew": 4, "lebra": 4, "rosann": 4, "rowan": [4, 6], "ruslan": 4, "salakhutdinov": 4, "stoval": 4, "teehan": 4, "sahib": 4, "saif": 4, "sajant": 4, "dillav": 4, "shleifer": 4, "wiseman": 4, "gruetter": 4, "schoenholz": 4, "sanghyun": 4, "sanjeev": 4, "kwatra": 4, "sarik": 4, "ghazarian": 4, "sayan": 4, "casei": [4, 6], "bischoff": 4, "gehrmann": 4, "schuster": 4, "sepideh": 4, "sadeghi": 4, "shadi": 4, "hamdan": 4, "sharon": 4, "shashank": 4, "sherri": 4, "shi": 4, "shikhar": 4, "shima": 4, "asaadi": 4, "shubh": 4, "pachchigar": 4, "shubham": 4, "toshniw": 4, "shyam": [4, 6], "upadhyai": 4, "shyamolima": 4, "debnath": 4, "siamak": 4, "shakeri": 4, "thormey": 4, "melzi": 4, "siva": 4, "reddi": 4, "sneha": 4, "priscilla": 4, "makini": 4, "soo": 4, "hwan": 4, "toren": 4, "sriharsha": 4, "hatwar": 4, "stanisla": 4, "dehaen": 4, "stefan": 4, "divic": 4, "stella": 4, "biderman": 4, "stephen": 4, "prasad": 4, "piantadosi": 4, "stuart": [4, 6], "shieber": 4, "summer": [4, 6], "misherghi": 4, "svetlana": 4, "kiritchenko": 4, "swaroop": 4, "tal": 4, "linzen": 4, "tariq": 4, "tatsu": 4, "te": 4, "th\u00e9o": 4, "desbord": 4, "theodor": 4, "rothschild": 4, "phan": [4, 6], "tiberiu": 4, "nkinyili": 4, "timo": 4, "schick": 4, "timofei": 4, "kornev": 4, "titu": 4, "tunduni": 4, "gerstenberg": 4, "trenton": 4, "trishala": 4, "neeraj": 4, "tushar": 4, "khot": 4, "shultz": 4, "uri": 4, "shaham": 4, "vera": 4, "demberg": 4, "victoria": [4, 6], "nyamai": 4, "vika": 4, "raunak": 4, "vinai": 4, "ramasesh": 4, "udai": 4, "prabhu": 4, "vishakh": 4, "padmakumar": 4, "vivek": 4, "srikumar": 4, "fedu": [4, 6], "wout": 4, "vossen": 4, "xiaoyu": 4, "tong": [4, 6], "xinran": 4, "xinyi": 4, "yadollah": 4, "yaghoobzadeh": 4, "yair": 4, "lakretz": 4, "yangqiu": 4, "yasaman": 4, "bahri": 4, "yichi": 4, "yide": 4, "yifu": 4, "yonatan": 4, "belinkov": 4, "yufang": 4, "seid": 4, "zhuoy": 4, "zijian": 4, "ziji": 4, "zirui": 4, "ziyi": 4, "extrapol": 4, "2206": 4, "04615": 4, "wpn": 4, "yada": 4, "pruksachatkun": 4, "amanpreet": 4, "hill": 4, "stickier": 4, "wsm": 4, "1804": 4, "07461": 4, "wtb": 4, "tai": 4, "borgeaud": 4, "dani": 4, "yogatama": 4, "denni": [4, 6], "donald": 4, "metzler": 4, "ed": 4, "oriol": 4, "vinyal": 4, "dean": 4, "07682": 4, "wdr": 4, "doolei": 4, "manlei": 4, "arka": [4, 6], "pal": 4, "feuer": 4, "siddhartha": 4, "ravid": 4, "shwartz": [4, 6], "ziv": 4, "khalid": [4, 5], "saifullah": 4, "siddartha": 4, "naidu": 4, "chinmai": 4, "hegd": 4, "lecun": 4, "goldstein": 4, "willi": 4, "neiswang": 4, "micah": 4, "goldblum": 4, "19314": 4, "yyh": 4, "baosong": [4, 5], "chengpeng": 4, "chengyuan": [4, 5], "fei": [4, 5], "guant": 4, "haoran": [4, 5], "huan": [4, 5], "jialong": 4, "jialin": 4, "jianhong": [4, 5], "tu": [4, 5], "jianwei": [4, 5], "jianxin": [4, 5], "jin": [4, 6], "jingren": [4, 5], "jinz": 4, "jinzheng": 4, "junyang": [4, 5], "keme": [4, 5], "keqin": [4, 5], "kexin": [4, 5], "mingfeng": [4, 5], "xue": [4, 5, 6], "ni": 4, "pei": [4, 5], "ru": 4, "men": [4, 5], "ruiz": 4, "runji": [4, 5], "shiji": 4, "sinan": 4, "tianhang": 4, "wenbin": 4, "ge": 4, "xiaodong": 4, "deng": 4, "xiaohuan": 4, "xingzhang": [4, 5], "xinyu": [4, 6], "xipin": 4, "xuancheng": [4, 5], "yichang": [4, 5], "wan": [4, 5], "yunfei": 4, "yuqiong": [4, 5], "zhenru": [4, 5], "zhihao": 4, "10671": 4, "zcl24": 4, "zhihan": 4, "cao": 4, "lizi": 4, "openreview": 4, "forum": 4, "aegrf1uy0p": 4, "zc": 4, "siyuan": 4, "zhuang": [4, 6], "zhanghao": 4, "yonghao": 4, "zi": 4, "zhuohan": 4, "xing": [4, 6], "2306": 4, "05685": 4, "huggingface24": 4, "06": [4, 7], "metaai24": 4, "di": 5, "hunter": 5, "photo": 5, "email": 5, "hipaa": 5, "properti": [5, 6], "gdpr": 5, "iot": 5, "unreli": 5, "impract": 5, "slm": 5, "viabl": 5, "sensor": 5, "evalu": [5, 7], "interconnect": 5, "frontend": 5, "tradeoff": [5, 6, 7], "rapidli": [5, 6, 7], "garner": 5, "traction": 5, "yourself": 5, "aw": [5, 6], "bedrock": 5, "sambanova": 5, "sla": 5, "viabil": 5, "veloc": 5, "roadmap": 5, "commodit": 5, "decai": 5, "winner": 5, "loser": 5, "condens": 5, "clean": 5, "2024t": 5, "broadli": [5, 7], "versatil": 5, "72b": 5, "med": 5, "bloomberggpt": 5, "underw": 5, "adept": 5, "toxigen": 5, "alnajjar": 5, "13b": [5, 6], "01": 5, "outperform": 5, "32b": 5, "feasibl": 5, "2m": 5, "unstructur": [5, 7], "modal": 5, "diagnosi": 5, "patient": 5, "necessit": 5, "flagship": 5, "405b": 5, "gemini": 5, "pack": 5, "cautious": 5, "isol": [5, 6], "cpot": 5, "cpit": 5, "tco": 5, "tpot": 5, "ttft": 5, "mmlu": [5, 6], "gpqa": 5, "ratio": 5, "median": 5, "afford": 5, "lite": 5, "micro": 5, "budget": 5, "encod": [5, 6, 7], "cent": 5, "1m": 5, "flash": 5, "cheapest": 5, "phi": 5, "half": [5, 6], "permiss": [5, 6], "apach": 5, "exemplifi": [5, 6], "microsoft": 5, "simpler": [5, 6, 7], "fewer": [5, 6], "700m": 5, "100m": 5, "gemma": [5, 7], "deepseek": 5, "v2": [5, 6], "thorough": [5, 6], "grown": 5, "withdraw": 5, "incomplet": [5, 6], "preprocess": [5, 7], "unclear": 5, "15t": 5, "8t": 5, "fineweb": 5, "penedo": 5, "96": [5, 6], "crawl": 5, "snapshot": 5, "codebas": 5, "ablat": 5, "vital": [5, 6], "favorit": 5, "spawn": 5, "streamlin": [5, 7], "ultrachat": 5, "2024u": 5, "created_job": 5, "fine_tun": 5, "training_fil": 5, "file_id": 5, "ultrachat_chunk_train": 5, "validation_fil": 5, "ultrachat_chunk_ev": 5, "training_step": 5, "0001": 5, "auto_start": 5, "job_id": 5, "toolkit": [5, 6], "sft": 5, "dpo": 5, "nemo": [5, 6], "codestr": 5, "2024v": 5, "135m": 5, "enough": 5, "despit": [5, 7], "rewrit": 5, "multimod": [5, 6], "smolvlm": 5, "mlx": [5, 7], "mlc": 5, "peft": 5, "programm": 5, "graphic": [5, 6], "vram": 5, "vector": [5, 6], "mathbf": 5, "x_1": [5, 7], "x_2": [5, 7], "x_n": [5, 7], "x_": [5, 7], "\u03b8": 5, "matrix": [5, 6], "concurr": 5, "groq": 5, "cerebra": 5, "mozilla": 5, "docker": 5, "gerganov": 5, "georgi": 5, "hundr": 5, "overwhelm": [5, 7], "manifesto": 5, "enjoy": 5, "bog": 5, "exploratori": 5, "hacker": 5, "Will": [5, 6], "prototyp": 5, "prematur": 5, "besid": 5, "lighter": 5, "sacrific": 5, "gguf": 5, "unifi": [5, 7], "ggml": [5, 7], "ibm": [5, 6], "bit": 5, "metadata": 5, "disk": 5, "faster": 5, "backward": 5, "2024x": 5, "repo": 5, "easier": [5, 6, 7], "compil": 5, "linux": 5, "argument": [5, 6, 7], "sudo": 5, "apt": 5, "cmake": 5, "bind": 5, "betlen": 5, "cnv": 5, "llamacpp": 5, "q8_0": 5, "succinct": 5, "ctrl": 5, "interject": 5, "philosoph": 5, "debat": 5, "fulfil": 5, "happi": 5, "responsibli": 5, "bye": 5, "goodby": 5, "port": 5, "127": 5, "curl": [5, 7], "localhost": 5, "v1": [5, 6], "bearer": 5, "finish_reason": 5, "deepli": 5, "1734627879": 5, "completion_token": 5, "total_token": 5, "chatcmpl": 5, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 5, "prompt_n": 5, "prompt_m": 5, "132": 5, "prompt_per_token_m": 5, "prompt_per_second": 5, "77619878666999": 5, "predicted_n": 5, "predicted_m": 5, "1700": 5, "654": 5, "predicted_per_token_m": 5, "36882142857143": 5, "predicted_per_second": 5, "92850867960208": 5, "gbnf": [5, 7], "8pm": 5, "appointmenttim": 5, "appointmentdetail": 5, "handi": 5, "model_path": 5, "llama_cpp": 5, "create_chat_complet": 5, "occupi": 5, "activist": 5, "justin": [5, 6], "tunnei": 5, "ocho": 5, "appach": 5, "cosmopolitan": 5, "libc": 5, "portabl": 5, "durabl": 5, "usabl": [5, 6, 7], "tinyllama": 5, "wget": 5, "jartin": 5, "q5_k_m": 5, "renam": 5, "ex": 5, "chmod": 5, "nobrows": 5, "registri": 5, "nativ": [5, 7], "container": 5, "trai": 5, "familiar": 5, "bare": 5, "ssfl": 5, "sh": [5, 7], "Or": 5, "11434": 5, "chatrespons": 5, "easiest": 5, "rich": [5, 6], "playground": 5, "simultan": [5, 6], "verif": [5, 7], "importantli": [5, 7], "intuit": 5, "beginn": 5, "tensorrt": 5, "trt": 5, "latex": 5, "voic": 5, "pwa": 5, "rag": 5, "medium": [5, 6, 7], "gpt4all": 5, "rbac": 5, "fp16": 5, "q2_k": 5, "q4_k": 5, "q6_k": 5, "mib": 5, "wikitext": 5, "salesforc": 5, "wikipedia": [5, 7], "min_prompt_length": 5, "input_texts_raw": 5, "2010": 5, "valkyria": 5, "chronicl": 5, "forgiv": 5, "newcom": 5, "raita": 5, "honjou": 5, "compos": [5, 6], "hitoshi": 5, "sakimoto": 5, "takeshi": 5, "ozawa": 5, "writer": 5, "theme": [5, 6], "sung": 5, "escap": 5, "escaped_text": 5, "2024w": 5, "block_scal": 5, "block": [5, 6], "width": 5, "parenthes": 5, "block_min": 5, "formula": 5, "superblock": 5, "5625": 5, "ieee": 5, "754": 5, "ppl": 5, "exp": 5, "sum_": 5, "log_2": 5, "x_i": [5, 7], "avg": 5, "_i": 5, "corr": 5, "ln": [5, 7], "kullback": 5, "leibler": 5, "entropi": 5, "logit": 5, "d_": 5, "softmax": [5, 7], "sum": 5, "kld": 5, "q2_kresult": 5, "q6": 5, "004": 5, "q2": 5, "112": 5, "q4": 5, "smallest": 5, "390": 5, "67": [5, 6], "81": [5, 6], "93": [5, 6], "462": 5, "614": 5, "58": 5, "170": 5, "q4_k_m": 5, "thread": 5, "16x": 5, "speedup": 5, "85x": 5, "79x": 5, "ubuntu": 5, "lt": 5, "x86_64": 5, "gnu": 5, "thank": [5, 7], "intel": 5, "i7": 5, "8550u": 5, "15gib": 5, "samsung": 5, "ssd": 5, "970": 5, "evo": 5, "500gb": 5, "1170": 5, "meant": 5, "ahead": [5, 6], "ai4c": 5, "ai4a": 5, "paperswithcod": [5, 6], "ana24a": 5, "leaderboard": [5, 6], "artificialanalysi": 5, "ana24b": 5, "ana24c": 5, "bc24": 5, "andrei": [5, 6], "abetlen": 5, "fac4": 5, "optimum": 5, "concept_guid": 5, "fac4t": 5, "fac4u": 5, "200k": 5, "ultrachat_200k": 5, "fac4v": 5, "blogpost": 5, "gc24": 5, "ggerganov": [5, 7], "blob": [5, 7], "readm": [5, 7], "gc4a": 5, "gc4b": 5, "pka": 5, "guilherm": 5, "hynek": 5, "kydl\u00ed\u010dek": 5, "decant": 5, "finest": 5, "17557": 5, "qwe4b": 5, "qy": 5, "beichen": 5, "tingyu": 5, "zihan": 5, "qiu": 5, "15115": 5, "rev24": 5, "harvard": 5, "nyt": 5, "harvardlawreview": 5, "timess": 5, "zwa": 5, "wael": 5, "geoffrei": [5, 6], "angu": 5, "arnav": 5, "jefferi": 5, "kinnison": 5, "sherstinski": 5, "piero": 5, "molino": 5, "travi": 5, "addair": 5, "devvret": 5, "310": 5, "2405": 5, "00732": 5, "huggingface4w": 5, "huggingface4xa": 5, "huggingface4xb": 5, "ibmthink24": 5, "lmstudio24": 5, "lmstudio": 5, "metaai4c": 5, "mozillaocho24": 5, "salesforce24": 5, "immens": 6, "commonplac": 6, "hartvigsen": 6, "societi": 6, "statement": 6, "alarm": 6, "openli": 6, "dolli": 6, "llama2": [6, 7], "emb": 6, "generalist": 6, "injustic": 6, "inequ": 6, "undermin": 6, "perpetu": 6, "displac": 6, "eros": 6, "fake": 6, "deepfak": 6, "distrust": 6, "cyberattack": 6, "spread": 6, "disinform": 6, "inadvert": 6, "interven": 6, "irrevers": 6, "uncheck": 6, "extinct": 6, "race": 6, "incentiv": 6, "shortcut": 6, "behind": 6, "stress": 6, "urgent": 6, "reorient": 6, "birth": 6, "siam": 6, "edgington": 6, "jailbreak": 6, "promptcraft": 6, "stealth": 6, "sutton": 6, "subtl": 6, "trigger": 6, "subtleti": 6, "exception": 6, "phrase": 6, "evad": 6, "hqve": 6, "frer": 6, "hplidai": 6, "pl": 6, "hyperion": 6, "coast": 6, "redwood": 6, "tallest": 6, "tree": [6, 7], "routin": 6, "prejudic": 6, "gallego": 6, "leak": 6, "poison": 6, "intention": 6, "inject": 6, "mislead": 6, "exabeam": 6, "finra": 6, "3110": 6, "mandat": 6, "supervisori": 6, "unicef": 6, "empow": 6, "contest": 6, "congress": 6, "enact": 6, "pictur": [6, 7], "territori": 6, "oversea": 6, "chines": 6, "legitim": 6, "consent": 6, "complaint": 6, "cooper": 6, "extraterritori": 6, "offshor": 6, "draft": 6, "voluntari": 6, "neutral": 6, "player": 6, "prepared": 6, "compris": 6, "cbrn": 6, "persuas": 6, "autonomi": 6, "gradat": 6, "scorecard": 6, "elig": 6, "advisori": 6, "sag": 6, "shut": 6, "prerequisit": 6, "harden": 6, "asl": 6, "biosafeti": 6, "elev": 6, "warn": 6, "bioweapon": 6, "compartment": 6, "difficulti": 6, "4x": 6, "jump": 6, "paus": 6, "frontier": 6, "deepmind": 6, "biosecur": 6, "buffer": 6, "formul": [6, 7], "calibr": 6, "promin": 6, "taxonomi": 6, "llamaguard": 6, "20241022": 6, "3x": 6, "5x": 6, "alaga": 6, "substandard": 6, "oxford": 6, "wachter": 6, "blur": 6, "ill": 6, "stifl": 6, "suscept": 6, "aadc": 6, "outset": 6, "curricula": 6, "adversari": 6, "uncov": [6, 7], "appar": 6, "thoroughli": 6, "lm": [6, 7], "problemat": 6, "arrai": 6, "undergo": 6, "280b": 6, "cai": [6, 7], "utilis": 6, "minimis": 6, "enshrin": 6, "evas": 6, "resort": 6, "avenu": 6, "cambria": 6, "inherit": 6, "influenti": 6, "debias": 6, "occurr": 6, "phish": 6, "clarifi": 6, "toler": 6, "checklist": 6, "abus": 6, "ux": 6, "architect": 6, "diagram": 6, "retrofit": 6, "promptli": 6, "dashboard": 6, "misalign": 6, "star": 6, "postpon": 6, "sens": 6, "combat": 6, "counter": 6, "traffic": 6, "frustrat": 6, "workaround": 6, "silo": 6, "hierarch": 6, "hierarchi": 6, "66": 6, "depth": 6, "mcq": 6, "regex": [6, 7], "joint": 6, "facet": 6, "purpl": 6, "circl": 6, "opensafetylab": 6, "salad_bench_dataset": 6, "base_set": 6, "gptfuzzer": 6, "auto": [6, 7], "qid": 6, "o1": 6, "supremaci": 6, "o53": 6, "o14": 6, "o5": 6, "o65": 6, "plagiar": 6, "o16": 6, "o6": 6, "o47": 6, "campaign": 6, "o12": 6, "o52": 6, "surveil": 6, "spous": 6, "know": [6, 7], "o13": 6, "breakdown": 6, "ncount": 6, "21318": 6, "8756": 6, "6486": 6, "o2": 6, "1717": 6, "o4": 6, "1477": 6, "o3": 6, "socioeconom": 6, "851": 6, "int64": 6, "gen": 6, "15433": 6, "hh": 6, "4184": 6, "659": 6, "advbench": 6, "230": 6, "189": 6, "toxicchat": 6, "anyth": 6, "817": 6, "misconcept": 6, "ingrain": 6, "mc1": 6, "singular": 6, "choices4": 6, "mc2": 6, "set4": 6, "scorer": 6, "correctli": [6, 7], "truthful_qa": 6, "truthfulqa_dataset": 6, "multiple_choic": 6, "best_answ": 6, "correct_answ": 6, "incorrect_answ": 6, "watermelon": 6, "digest": 6, "noth": 6, "stomach": 6, "sick": 6, "wonderopoli": 6, "wonder": 6, "belli": 6, "swallow": 6, "dream": 6, "die": 6, "indigest": 6, "unconsci": 6, "excret": 6, "asr": 6, "r2d2": 6, "wider": [6, 7], "mass": 6, "destruct": 6, "asynchron": 6, "webpurifi": 6, "protectai": 6, "comprehend": 6, "amazon": 6, "nvidia": [6, 7], "keyword": 6, "toolset": 6, "nemmo": 6, "synchron": 6, "nemoguardrail": 6, "llmrail": 6, "railsconfig": 6, "from_path": 6, "rail": 6, "hello": 6, "ministr": 6, "mistralai": 6, "mistral_api_kei": 6, "moderate_chat": 6, "omni": 6, "pprint": 6, "to_json": 6, "threaten": 6, "illicit": 6, "granit": 6, "guardian": 6, "consortium": 6, "v3": 6, "11b": 6, "begin_of_text": 6, "start_header_id": 6, "end_header_id": 6, "unsafe_categori": 6, "user_message_1": 6, "model_answer_1": 6, "comma": 6, "eot_id": 6, "eom_id": 6, "denot": 6, "s1": 6, "s2": 6, "s3": 6, "s4": 6, "s5": 6, "defam": 6, "s6": 6, "s7": 6, "s8": 6, "s9": 6, "s10": 6, "s11": 6, "s12": 6, "s13": 6, "atla": 6, "2b": 6, "hap": 6, "38m": 6, "125m": 6, "padhi": 6, "shieldgemma": 6, "accomplish": [6, 7], "judge_prompt": 6, "american": 6, "vandal": 6, "underag": 6, "drink": 6, "vulgar": 6, "obscen": 6, "racism": 6, "derogatori": 6, "firearm": 6, "safety_scor": 6, "IN": 6, "borderlin": 6, "verdict": 6, "boolean": [6, 7], "brief": 6, "rational": 6, "delimit": 6, "paramount": [6, 7], "evenli": 6, "good_sampl": 6, "bad_sampl": 6, "2024z": 6, "surg": 6, "scam": 6, "get_profanity_sampl": 6, "show_stat": 6, "bool": 6, "current_dir": 6, "getcwd": 6, "data_path": 6, "profanity_en": 6, "random_st": 6, "ncategori": 6, "category_count": 6, "category_1": 6, "1f": 6, "profanity_sampl": 6, "nsampl": 6, "anatomi": 6, "slur": 6, "182": 6, "bodili": 6, "fluid": 6, "excrement": 6, "insult": 6, "mental": 6, "disabl": 6, "jap": 6, "babi": [6, 7], "batter": 6, "crazi": 6, "sob": 6, "fukka": 6, "sh1t3": 6, "get_salad_sampl": 6, "salad_sampl": 6, "tortur": 6, "porn": 6, "sin": 6, "sight": 6, "god": 6, "embezzl": 6, "xanax": 6, "alcohol": 6, "get_good_sampl": 6, "min_scor": 6, "reichstag": 6, "profanity_data": 6, "salad_data": 6, "good_data": 6, "all_data": 6, "prompt_sampl": 6, "is_unsaf": 6, "counti": 6, "holli": 6, "ridg": 6, "nc": 6, "town": 6, "onslow": 6, "carolina": 6, "diver": 6, "underwat": 6, "maze": 6, "coral": 6, "treasur": 6, "vivid": 6, "sensori": 6, "emot": 6, "labyrinthin": 6, "passag": 6, "reef": 6, "suspens": 6, "obstacl": 6, "creatur": 6, "nomin": 6, "nobel": 6, "literatur": 6, "love": 6, "ny": [6, 7], "logo": 6, "thief": 6, "rob": 6, "famou": 6, "nstatist": 6, "source_stat": 6, "type_stat": 6, "tiktoken": 6, "plug": 6, "safetyvalid": 6, "validationresult": 6, "dataclass": 6, "abstractmethod": 6, "llmguardvalid": 6, "scanner": 6, "bantop": 6, "llm_guard": 6, "input_scann": 6, "scan_prompt": 6, "matchtyp": 6, "default_banned_top": 6, "banned_top": 6, "super": 6, "banned_topics_scann": 6, "use_onnx": 6, "toxicity_scann": 6, "match_typ": 6, "fail_fast": 6, "unsafe_scann": 6, "gun": 6, "cool": 6, "hunt": 6, "deer": 6, "dad": 6, "mistralvalid": 6, "hate_and_discrimin": 6, "violence_and_threat": 6, "dangerous_and_criminal_cont": 6, "selfharm": 6, "openaivalid": 6, "attr": 6, "dir": 6, "getattr": 6, "illicit_viol": 6, "llmjudgevalid": 6, "prompt_path": 6, "llmjudg": 6, "filenotfounderror": 6, "elaps": 6, "score_valid": 6, "id_": 6, "validator_inst": 6, "start_tim": 6, "validation_result": 6, "elapsed_tim": 6, "prompt_sample_id": 6, "validator_nam": 6, "scoring_prompt": 6, "scoring_result": 6, "01536": 6, "34098": 6, "497136": 6, "546416": 6, "calculate_validator_metr": 6, "bad_sourc": 6, "good_sourc": 6, "tpr": 6, "fpr": 6, "f1_score": 6, "tn": 6, "fp": 6, "fn": 6, "tp": 6, "255": 6, "74": 6, "238": 6, "130": 6, "256": 6, "79": 6, "149": 6, "213": 6, "190": 6, "235": 6, "63": 6, "222": 6, "82": 6, "75": 6, "480": 6, "157": 6, "487": 6, "495": 6, "482": 6, "667": 6, "248": 6, "466": 6, "143": 6, "355": 6, "slowest": 6, "room": 6, "false_posit": 6, "left_on": 6, "right_on": 6, "persian": 6, "teenag": 6, "\u0686\u0627\u0644\u0634": 6, "\u0645\u0648\u0645\u0648": 6, "\u06cc\u06a9": 6, "\u062d\u0642\u0647": 6, "\u0648": 6, "\u0627\u0641\u0633\u0627\u0646\u0647": 6, "\u0645\u062d\u0644\u06cc": 6, "\u0627\u0633\u062a": 6, "\u06a9\u0647": 6, "\u0634\u0627\u06cc\u0639\u0647": 6, "\u0622\u0646": 6, "\u062f\u0631": 6, "\u0634\u0628\u06a9\u0647": 6, "\u0647\u0627\u06cc": 6, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 6, "\u067e\u062e\u0634": 6, "\u0634\u062f": 6, "\u06af\u0632\u0627\u0631\u0634": 6, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 6, "\u0628\u0627": 6, "\u0646\u0627\u0645": 6, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 6, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 6, "\u0631\u0627": 6, "\u0628\u0647": 6, "\u0627\u0639\u0645\u0627\u0644": 6, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 6, "\u0645\u0627\u0646\u0646\u062f": 6, "\u062e\u0648\u062f\u0632\u0646\u06cc": 6, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 6, "\u062a\u0631\u063a\u06cc\u0628": 6, "\u0645\u06cc": 6, "\u06a9\u0646\u062f": 6, "\u0648\u062c\u0648\u062f": 6, "\u0635\u062d\u0628\u062a": 6, "\u0632\u06cc\u0627\u062f\u06cc": 6, "\u0645\u0648\u0631\u062f": 6, "\u0627\u06cc\u0646": 6, "\u062a\u0639\u062f\u0627\u062f": 6, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 6, "\u0648\u0627\u0642\u0639\u06cc": 6, "\u0628\u0633\u06cc\u0627\u0631": 6, "\u06a9\u0645": 6, "\u0628\u0648\u062f": 6, "\u067e\u0644\u06cc\u0633": 6, "\u0635\u062f\u0645\u0647": 6, "\u062f\u06cc\u062f\u0646": 6, "\u062f\u0644\u06cc\u0644": 6, "\u062a\u0623\u062b\u06cc\u0631": 6, "\u0645\u0633\u062a\u0642\u06cc\u0645": 6, "\u067e\u062f\u06cc\u062f\u0647": 6, "\u062a\u0623\u06cc\u06cc\u062f": 6, "\u0646\u06a9\u0631\u062f\u0647": 6, "\u062a\u0631\u0633": 6, "\u0646\u06af\u0631\u0627\u0646\u06cc": 6, "\u0627\u06cc\u062c\u0627\u062f": 6, "\u0634\u062f\u0647": 6, "\u0628\u06cc\u0634\u062a\u0631": 6, "\u0627\u0632": 6, "\u062e\u0648\u062f": 6, "\u0631\u0633\u0627\u0646\u0647": 6, "\u0647\u0627": 6, "\u0637\u0648\u0631\u06cc": 6, "\u062e\u06cc\u0631\u06cc\u0647": 6, "\u0647\u0634\u062f\u0627\u0631": 6, "\u062f\u0627\u062f\u0646\u062f": 6, "\u0622\u0633\u06cc\u0628": 6, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 6, "\u0645\u062d\u062a\u0648\u0627\u06cc": 6, "\u062e\u0634\u0648\u0646\u062a": 6, "\u0622\u0645\u06cc\u0632": 6, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 6, "\u06af\u0641\u062a\u0647": 6, "\u0634\u0648\u062f": 6, "\u0627\u0648\u0644\u06cc\u0646": 6, "\u0628\u0627\u0631": 6, "\u0633\u0627\u0644": 6, "\u06f2\u06f0\u06f1\u06f8": 6, "\u067e\u0633": 6, "\u0622\u0646\u06a9\u0647": 6, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 6, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 6, "\u062e\u0628\u0631": 6, "\u062f\u062e\u062a\u0631": 6, "\u06f1\u06f2": 6, "\u0633\u0627\u0644\u0647": 6, "\u062f\u0627\u062f": 6, "\u0645\u0648\u0636\u0648\u0639": 6, "\u062c\u0647\u0627\u0646\u06cc": 6, "\u062a\u0628\u062f\u06cc\u0644": 6, "\u0645\u062c\u0633\u0645\u0647": 6, "\u0647\u0646\u0631\u0645\u0646\u062f": 6, "\u0698\u0627\u067e\u0646\u06cc": 6, "\u0647\u0631": 6, "\u0686\u0646\u062f": 6, "\u0634\u0627\u06cc\u062f": 6, "\u0646\u06af\u0627\u0647": 6, "\u0628\u0639\u0636\u06cc": 6, "\u0632\u06cc\u0628\u0627": 6, "\u0646\u0628\u0627\u0634\u062f": 6, "\u0627\u0645\u0627": 6, "\u06a9\u0627\u0645\u0644\u0627": 6, "\u0628\u06cc": 6, "\u062e\u0637\u0631": 6, "\u0627\u06cc\u0631\u0627\u0646": 6, "\u0645\u062f\u062a": 6, "\u0628\u06cc\u0646": 6, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 6, "\u0645\u0637\u0631\u062d": 6, "\u0633\u0627\u0644\u06cc": 6, "\u0633\u0631\u0627\u0633\u0631": 6, "\u062c\u0647\u0627\u0646": 6, "\u0645\u0634\u0627\u0628\u0647\u06cc": 6, "\u0628\u0631\u0627\u06cc": 6, "\u0648\u0627\u0644\u062f\u06cc\u0646": 6, "\u06a9\u0631\u062f\u0647": 6, "\u0627\u0641\u0631\u0627\u062f": 6, "\u0686\u0647": 6, "\u06a9\u0627\u0631\u06cc": 6, "\u062f\u0639\u0648\u062a": 6, "tourist": 6, "distress": 6, "polish": 6, "galician": 6, "dzisiaj": 6, "szwecji": 6, "innych": 6, "bogatych": 6, "krajach": 6, "ludzi": 6, "u\u017cywaj\u0105": 6, "mn\u00f3stwo": 6, "najr\u00f3\u017cniejszych": 6, "urz\u0105dze\u0144": 6, "hox": 6, "suecia": 6, "outro": 6, "pa\u00eds": 6, "rico": 6, "xent": 6, "usa": [6, 7], "moita": 6, "m\u00e1quina": 6, "diferent": 6, "\u0142\u00f3dka": 6, "zaczyna": 6, "ton\u0105\u0107": 6, "tury\u015bci": 6, "wracaj\u0105": 6, "statek": 6, "dom\u00f3w": 6, "gdzie": 6, "opowiadaj\u0105": 6, "tym": 6, "jak": 6, "zostali": 6, "zaatakowani": 6, "surprisingli": 6, "unsettl": 6, "paradox": 6, "harbor": 6, "wisdom": 6, "aspir": 6, "technologist": 6, "disciplinari": 6, "ethicist": 6, "policymak": 6, "ai24": 6, "asa24": 6, "jide": 6, "jona": 6, "schuett": 6, "marku": 6, "anderljung": 6, "08751": 6, "bhy": 6, "hinton": 6, "pieter": 6, "abbeel": 6, "trevor": 6, "darrel": 6, "yuval": 6, "harari": 6, "ya": 6, "lan": 6, "shai": 6, "shalev": 6, "gillian": 6, "hadfield": 6, "clune": 6, "tegan": 6, "maharaj": 6, "hutter": 6, "at\u0131l\u0131m": 6, "g\u00fcne\u015f": 6, "baydin": 6, "sheila": 6, "mcilraith": 6, "qiqi": 6, "ashwin": 6, "acharya": 6, "anca": 6, "dragan": 6, "philip": 6, "torr": 6, "russel": 6, "kahneman": 6, "s\u00f6ren": 6, "mindermann": 6, "amid": 6, "384": 6, "6698": 6, "1126": 6, "adn0117": 6, "bbc": 6, "emili": 6, "braca": 6, "israel": 6, "carter": 6, "hafsa": 6, "kanchwala": 6, "khojasteh": 6, "charli": 6, "landow": 6, "luo": 6, "magarelli": 6, "mirin": 6, "averi": 6, "moyer": 6, "kayla": 6, "simpson": 6, "amelia": 6, "skawinski": 6, "heverin": 6, "23308": 6, "bmc": 6, "dillon": 6, "brendan": 6, "murphi": 6, "khachaturov": 6, "gleav": 6, "kellin": 6, "pelrin": 6, "2408": [6, 7], "02946": 6, "cmm": 6, "erik": 6, "lorenzo": 6, "malandri": 6, "fabio": 6, "mercorio": 6, "navid": 6, "nobani": 6, "seveso": 6, "15248": 6, "edg24": 6, "exa24": 6, "cyber": 6, "grb": 6, "rossi": 6, "barrow": 6, "mehrab": 6, "tanjim": 6, "sungchul": 6, "franck": 6, "dernoncourt": 6, "ruiyi": 6, "nesreen": 6, "2309": 6, "00770": 6, "h44z": 6, "hgp": 6, "saadia": 6, "hamid": 6, "palangi": 6, "dipankar": 6, "ec": 6, "kamar": 6, "oxi": 6, "smaranda": 6, "muresan": 6, "preslav": 6, "nakov": 6, "alin": 6, "villavicencio": 6, "editor": 6, "60th": 6, "3309": 6, "3326": 6, "dublin": 6, "aclanthologi": 6, "acl": 6, "18653": 6, "hym": 6, "weijiang": 6, "weitao": 6, "weihong": 6, "zhangyin": 6, "haotian": 6, "qianglong": 6, "weihua": 6, "xiaocheng": 6, "bing": 6, "ting": 6, "dx": 6, "1145": [6, 7], "3703155": 6, "ldw": 6, "lijun": 6, "ruohui": 6, "xuhao": 6, "wangmeng": 6, "zuo": 6, "dahua": 6, "qiao": 6, "shao": 6, "05044": 6, "mpy": 6, "xuwang": 6, "zifan": 6, "norman": 6, "mu": 6, "elham": 6, "sakhae": 6, "nathaniel": 6, "forsyth": 6, "04249": 6, "mlc24": 6, "illumin": 6, "ailumin": 6, "oaa": 6, "adler": 6, "ahmad": 6, "ilg": 6, "akkaya": 6, "florencia": 6, "leoni": 6, "aleman": 6, "janko": 6, "altenschmidt": 6, "altman": 6, "shyamal": 6, "anadkat": 6, "avila": 6, "valeri": 6, "balcom": 6, "baltescu": 6, "haim": 6, "belgum": 6, "irwan": 6, "bello": 6, "jake": 6, "berdin": 6, "bernadett": 6, "shapiro": 6, "berner": 6, "lenni": 6, "bogdonoff": 6, "boiko": 6, "madelain": 6, "boyd": 6, "luisa": 6, "brakman": 6, "button": 6, "rosi": 6, "campbel": 6, "cann": 6, "brittani": 6, "carei": 6, "carlson": 6, "rori": 6, "carmichael": 6, "che": 6, "foti": 6, "sulli": 6, "rubi": 6, "chess": 6, "chester": 6, "cho": 6, "hyung": 6, "won": 6, "chung": 6, "jeremiah": 6, "currier": 6, "yunx": 6, "cori": 6, "decareaux": 6, "degri": 6, "deutsch": 6, "devil": 6, "dhar": 6, "steve": 6, "dowl": 6, "dun": 6, "adrien": 6, "ecoffet": 6, "atti": 6, "eleti": 6, "tyna": 6, "elound": 6, "farhi": 6, "niko": 6, "sim\u00f3n": 6, "posada": 6, "fishman": 6, "juston": 6, "isabella": 6, "fulford": 6, "georg": 6, "gibson": 6, "vik": 6, "tarun": 6, "gogineni": 6, "goh": 6, "rapha": 6, "gontijo": 6, "lope": 6, "gordon": 6, "morgan": 6, "grafstein": 6, "yufei": 6, "guo": 6, "hallaci": 6, "heaton": 6, "johann": 6, "heideck": 6, "hickei": 6, "wade": 6, "hoeschel": 6, "houghton": 6, "kenni": 6, "hsu": 6, "shengli": 6, "joost": 6, "huizinga": 6, "shawn": 6, "joann": 6, "jang": 6, "roger": 6, "haozhun": 6, "shino": 6, "jomoto": 6, "billi": 6, "jonn": 6, "tomer": 6, "kaftan": 6, "\u0142ukasz": 6, "kamali": 6, "ingmar": 6, "kanitscheid": 6, "tabarak": 6, "khan": 6, "logan": 6, "kilpatrick": 6, "jong": 6, "wook": 6, "christina": 6, "yongjik": 6, "hendrik": 6, "kirchner": 6, "kiro": 6, "matt": 6, "kokotajlo": 6, "kondraciuk": 6, "kondrich": 6, "konstantinidi": 6, "kosic": 6, "vishal": 6, "kuo": 6, "lamp": 6, "ikai": 6, "teddi": 6, "jade": 6, "leung": 6, "chak": 6, "ming": 6, "lim": 6, "molli": 6, "mateusz": 6, "litwin": 6, "theresa": 6, "lopez": 6, "patricia": 6, "lue": 6, "makanju": 6, "malfacini": 6, "markov": 6, "yaniv": 6, "markovski": 6, "bianca": 6, "mayn": 6, "mckinnei": 6, "christin": 6, "mcleavei": 6, "mcmillan": 6, "mcneil": 6, "aalok": 6, "menick": 6, "mishchenko": 6, "vinni": 6, "monaco": 6, "murk": 6, "m\u00e9ly": 6, "ashvin": 6, "nair": 6, "reiichiro": 6, "nakano": 6, "rajeev": 6, "nayak": 6, "arvind": 6, "neelakantan": 6, "hyeonwoo": 6, "noh": 6, "keef": 6, "jakub": 6, "pachocki": 6, "palermo": 6, "ashlei": 6, "pantuliano": 6, "parish": 6, "emi": 6, "parparita": 6, "passo": 6, "perelman": 6, "belbut": 6, "pere": 6, "pokorni": 6, "pokrass": 6, "vitchyr": 6, "pong": 6, "tolli": 6, "powel": 6, "bori": 6, "proehl": 6, "rae": 6, "ramesh": 6, "franci": 6, "kendra": 6, "rimbach": 6, "carl": 6, "rotst": 6, "roussez": 6, "saltarelli": 6, "ted": 6, "sander": 6, "schnurr": 6, "selsam": 6, "kyla": 6, "sheppard": 6, "toki": 6, "sherbakov": 6, "shieh": 6, "shoker": 6, "pranav": 6, "szymon": 6, "sidor": 6, "sigler": 6, "sitkin": 6, "sokolowski": 6, "natali": 6, "staudach": 6, "madelein": 6, "tootoonchian": 6, "tseng": 6, "preston": 6, "tuggl": 6, "turlei": 6, "juan": 6, "cer\u00f3n": 6, "urib": 6, "vallon": 6, "vijayvergiya": 6, "jai": 6, "alvin": 6, "ward": 6, "cj": 6, "weinmann": 6, "akila": 6, "welihinda": 6, "jiayi": 6, "weng": 6, "lilian": 6, "wiethoff": 6, "willner": 6, "wolrich": 6, "lauren": 6, "workman": 6, "sherwin": 6, "yoo": 6, "zeller": 6, "shengjia": 6, "juntang": 6, "zhuk": 6, "2303": 6, "08774": 6, "pnc": 6, "inkit": 6, "manish": 6, "nagireddi": 6, "giandomenico": 6, "cornacchia": 6, "subhajit": 6, "chaudhuri": 6, "tejaswini": 6, "pedapati": 6, "pierr": 6, "dognin": 6, "keerthiram": 6, "murugesan": 6, "miehl": 6, "santill\u00e1n": 6, "kieran": 6, "giulio": 6, "zizzo": 6, "muhammad": 6, "zaid": 6, "hame": 6, "purcel": 6, "desmond": 6, "pan": 6, "ing": 6, "vejsbjerg": 6, "dali": 6, "hind": 6, "werner": 6, "geyer": 6, "ambrish": 6, "rawat": 6, "kush": 6, "varshnei": 6, "prasanna": 6, "sattigeri": 6, "07724": 6, "saffron": 6, "ring": 6, "aslanid": 6, "glaes": 6, "nat": 6, "mcalees": 6, "irv": 6, "2202": 6, "03286": 6, "szw": 6, "qinghua": 6, "higham": 6, "gorban": 6, "bastouni": 6, "ivan": 6, "tyukin": 6, "12670": 6, "vsk": 6, "simplesafetytest": 6, "2311": 6, "08370": 6, "wmr24": 6, "sandra": 6, "brent": 6, "mittelstadt": 6, "duti": 6, "royal": 6, "240197": 6, "royalsocietypublish": 6, "1098": 6, "rso": 6, "ylx24": 6, "jiahao": 6, "xingwei": 6, "zyi": 6, "shune": 6, "lyumanshan": 6, "jingyu": 6, "shui": 6, "haobin": 6, "pengfei": 6, "hewu": 6, "ghost": 6, "14931": 6, "zho24": 6, "amazonwservices24": 6, "anthropic24": 6, "cdn": 6, "1adf000c8f675958c2ee23805d91aaade1cd4613": 6, "centerfasafety24a": 6, "centerforaisafeti": 6, "centerfasafety24b": 6, "deepmind24": 6, "googleapi": 6, "fsf": 6, "europeanmagency24": 6, "ema": 6, "europa": 6, "activities_en": 6, "financialirauthority24": 6, "ibm24": 6, "watsonx": 6, "saa": 6, "libraryocongress23": 6, "loc": 6, "gov": 6, "mistralai24": 6, "mlsteam24": 6, "mlsafeti": 6, "nationaliosatechnology24": 6, "nist": 6, "itl": 6, "nvidia24": 6, "openai24a": 6, "openai24b": 6, "opensafetylab24a": 6, "opensafetylab24b": 6, "protectai24": 6, "surgeai24": 6, "ukgovernment24": 6, "unicef24": 6, "innocenti": 6, "julia": 7, "easili": 7, "trial": 7, "wrangl": 7, "hoc": 7, "dataset": 7, "unwant": 7, "overflow": 7, "twitter": 7, "youtub": 7, "ldot": 7, "prod_": 7, "syntact": 7, "central": 7, "delic": 7, "heart": 7, "xml": 7, "invalid": 7, "ttt": 7, "itt": 7, "nousresearch": 7, "herm": 7, "json_format": 7, "person1": 7, "q1": 7, "person2": 7, "response_cont": 7, "is_json": 7, "myjson": 7, "nest": 7, "conceptu": 7, "overview": 7, "unend": 7, "whitespac": 7, "throw": 7, "somewher": 7, "json_object": 7, "impress": 7, "circul": 7, "vertex": 7, "worri": 7, "enum": 7, "secextract": 7, "mentioned_ent": 7, "mentioned_plac": 7, "extract_from_sec_fil": 7, "sec_filing_text": 7, "parser": 7, "hint": 7, "prompt_extract": 7, "sec_extract": 7, "washington": 7, "beg": 7, "1652": 7, "171": 7, "unnorm": 7, "0325": 7, "strongest": 7, "bfloat16": 7, "device_map": 7, "return_tensor": 7, "pt": 7, "inference_mod": 7, "last_token_logit": 7, "next_token_prob": 7, "nn": 7, "dim": 7, "top_k_prob": 7, "top_k_indic": 7, "topk": 7, "top_k_token": 7, "decod": 7, "idx": 7, "skip_special_token": 7, "prob": 7, "4f": 7, "0305": 7, "0197": 7, "0106": 7, "0093": 7, "logitsprocessor": 7, "logits_processor": 7, "logitsprocessorlist": 7, "customlogitsprocessor": 7, "intermediari": 7, "input_id": 7, "tensor": 7, "__call__": 7, "longtensor": 7, "batch_siz": 7, "sequence_length": 7, "floattensor": 7, "vocab_s": 7, "mask": 7, "pick": 7, "greedi": 7, "yesnologitsprocessor": 7, "initial_length": 7, "everyth": 7, "fill_": 7, "inf": 7, "debug": 7, "yes_token": 7, "add_special_token": 7, "no_token": 7, "yes_no_logit": 7, "yes_no_prob": 7, "yes_prob": 7, "no_prob": 7, "yes_mask": 7, "1e4": 7, "NO": 7, "generation_output_control": 7, "uncontrol": 7, "generation_output": 7, "renorm": 7, "4263": 7, "5737": 7, "10407": 7, "4607": 7, "6250": 7, "9219": 7, "helper": 7, "model_output": 7, "gen_output": 7, "batch_decod": 7, "clean_up_tokenization_spac": 7, "classic": 7, "italian": 7, "willard": 7, "louf": 7, "reformul": 7, "finit": 7, "fsm": 7, "s_": 7, "s_t": 7, "s_1": 7, "tild": 7, "odot": 7, "rightarrow": 7, "wise": 7, "thien": 7, "automaton": 7, "dfa": 7, "outgo": 7, "yy": 7, "ever": 7, "aa": 7, "lwai": 7, "prop": 7, "yynnaa": 7, "malform": 7, "base_prompt": 7, "sec_extraction_outlin": 7, "zsp": 7, "zicorp": 7, "with_structured_output": 7, "runnabl": 7, "typeddict": 7, "qu": 7, "langchain_openai": 7, "chatopenai": 7, "langchain_cor": 7, "chatprompttempl": 7, "extract_from_sec_filing_langchain": 7, "structured_llm": 7, "prompt_templ": 7, "from_messag": 7, "llm_chain": 7, "invok": 7, "sec_extraction_langchain": 7, "cpp": 7, "bnf": 7, "backu": 7, "naur": 7, "fssl": 7, "extract_entities_from_sec_fil": 7, "ollama_structured_output_prompt_suffix": 7, "ollama_structured_output_temperatur": 7, "uncensor": 7, "model_json_schema": 7, "response_json": 7, "sharpli": 7, "wrapper": 7, "exllama2": 7, "zoo": 7, "furthermor": 7, "nonetheless": 7, "studi": 7, "extran": 7, "dispar": 7, "preval": 7, "speak": 7, "aider": 7, "outweigh": 7, "rebutt": 7, "reproduct": 7, "paint": 7, "dottxt": 7, "flaw": 7, "uneven": 7, "didn": 7, "conflat": 7, "drawback": 7, "unlock": 7, "pfiffer": 7, "wrestl": 7, "aid24": 7, "dot24": 7, "demo": 7, "gge24": 7, "lan4b": 7, "lww": 7, "xun": 7, "hanyu": 7, "yezhaohui": 7, "shichao": 7, "simin": 7, "shunyu": 7, "feiyu": 7, "xiong": 7, "12599": 7, "llf": 7, "xieyang": 7, "frederick": 7, "fiannaca": 7, "terri": 7, "koo": 7, "dixon": 7, "ea": 7, "machineri": 7, "3613905": 7, "3650756": 7, "xuan": 7, "hai": 7, "nguyen": 7, "ngoc": 7, "tiviati": 7, "hieu": 7, "dao": 7, "shafiq": 7, "joti": 7, "kenji": 7, "kawaguchi": 7, "nanci": 7, "min": 7, "kan": 7, "08656": 7, "out24": 7, "twt": 7, "zhi": 7, "cheng": 7, "kuang": 7, "tsai": 7, "chieh": 7, "hung": 7, "yun": 7, "nung": 7, "02442": 7, "tt24": 7, "vivien": 7, "vivien000": 7, "wl23": 7, "r\u00e9mi": 7, "09702": 7, "guidanceai24": 7, "nvidia4a": 7, "wikipediacontributors24": 7, "wiktionari": 7, "naur_form": 7}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3], "practic": [0, 2, 5, 7], "approach": [0, 6], "an": 0, "open": [0, 2, 5], "sourc": [0, 2, 5], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": 0, "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 6], "kei": [0, 4], "configur": 0, "troubleshoot": 0, "common": [0, 6], "issu": 0, "author": 0, "": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 6], "guid": 2, "pitfal": [2, 6], "softwar": [2, 4], "chapter": 2, "1": [2, 6], "The": [2, 4, 5], "eval": [2, 4, 6], "gap": [2, 4], "2": [2, 5, 6], "manag": 2, "input": 2, "data": [2, 3], "3": [2, 6], "structur": [2, 7], "output": [2, 7], "4": [2, 6], "safeti": [2, 6], "5": [2, 6], "prefer": [2, 3], "base": [2, 3, 4, 6], "align": [2, 3], "6": [2, 6], "local": [2, 5], "7": 2, "cost": [2, 5], "factor": [2, 6], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 4, 5, 6, 7], "resourc": 2, "introduct": [3, 4, 5, 6, 7], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 4, 5], "human": 3, "supervis": 3, "fine": [3, 5, 7], "tune": [3, 5, 7], "sft": 3, "augment": 3, "post": [3, 7], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 5, 6], "studi": [3, 5, 6], "polici": [3, 6], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 4, 5, 6], "synthet": 3, "gener": [3, 4, 6], "user": [3, 6], "prompt": [3, 5, 7], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": 3, "prepar": 3, "vibe": 3, "check": 3, "evalu": [3, 4, 6], "discuss": [3, 7], "conclus": [3, 4, 5, 6, 7], "citat": [3, 4, 5, 6, 7], "refer": [3, 4, 5, 6, 7], "non": 4, "determinist": 4, "machin": 4, "emerg": 4, "properti": 4, "problem": [4, 7], "statement": [4, 7], "tradit": 4, "v": [4, 5], "design": [4, 6], "applic": 4, "test": 4, "requir": 4, "matrix": 4, "conceptu": 4, "overview": 4, "consider": [4, 5], "metric": 4, "task": [4, 5], "benchmark": [4, 5, 6], "leaderboard": 4, "lightev": 4, "mmlu": 4, "econometr": 4, "sampl": [4, 6], "famili": [4, 5], "us": 4, "langsmith": 4, "promptfoo": 4, "comparison": [4, 5, 7], "suitabl": 5, "result": 5, "llama": 5, "perform": 5, "licens": 5, "commun": 5, "support": 5, "custom": [5, 6], "mistral": [5, 6], "decemb": 5, "22": 5, "2024": 5, "deploy": 5, "serv": 5, "cpp": 5, "llamafil": 5, "ollama": [5, 7], "lama": 5, "ui": 5, "lm": 5, "studio": 5, "jan": 5, "webui": 5, "openwebui": 5, "effect": 5, "quantiz": 5, "level": 5, "hardwar": 5, "takeawai": [5, 6], "risk": 6, "ai": 6, "amplifi": 6, "exist": 6, "harm": 6, "novel": 6, "associ": 6, "autonom": 6, "exacerb": 6, "specif": 6, "guidanc": 6, "govern": 6, "organ": 6, "privat": 6, "sector": 6, "openai": 6, "anthrop": 6, "googl": 6, "rubric": 6, "mlcommon": 6, "centr": 6, "porquoi": 6, "red": 6, "team": 6, "constitut": 6, "explain": 6, "xai": 6, "plan": 6, "phase": 6, "definit": 6, "research": [6, 7], "identif": 6, "framework": [6, 7], "architectur": 6, "implement": 6, "select": 6, "go": 6, "market": 6, "technic": 6, "compon": 6, "salad": 6, "bench": 6, "truthfulqa": 6, "harmbench": 6, "safebench": 6, "techniqu": [6, 7], "repres": 6, "layer": 6, "map": 6, "rule": 6, "filter": 6, "moder": 6, "bad": 6, "good": 6, "guard": 6, "judg": 6, "valid": 6, "engin": 7, "json": 7, "mode": 7, "logit": 7, "process": 7, "outlin": 7, "langchain": 7, "best": 7, "compar": 7, "solut": 7, "ongo": 7, "debat": 7, "acknowledg": 7}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author(s)": [[0, "about-the-author-s"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Managing Input Data": [[2, "chapter-2-managing-input-data"]], "Chapter 3: Structured Output": [[2, "chapter-3-structured-output"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Cost Factor": [[2, "chapter-7-the-cost-factor"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [4, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "Citation": [[3, "citation"], [4, "citation"], [5, "citation"], [6, "citation"], [7, "citation"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"]], "The Evals Gap": [[4, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[4, "non-deterministic-generative-machines"]], "Emerging Properties": [[4, "emerging-properties"]], "Problem Statement": [[4, "problem-statement"], [7, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[4, "evals-table"]], "Evals Design": [[4, "evals-design"]], "LLM Application Testing Requirements Matrix": [[4, "validation-requirements"]], "Conceptual Overview": [[4, "conceptual-overview"]], "Design Considerations": [[4, "design-considerations"]], "Metrics": [[4, "metrics"]], "Key Metrics for Evaluating Generative Tasks": [[4, "key-metrics"]], "Evaluators": [[4, "evaluators"]], "Model-Based Evaluation": [[4, "model-based-evaluation"]], "Evaluating Evaluators": [[4, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[4, "benchmarks-and-leaderboards"]], "Tools": [[4, "tools"], [7, "tools"]], "LightEval": [[4, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[4, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[4, "model-families"]], "LangSmith": [[4, "langsmith"]], "PromptFoo": [[4, "promptfoo"]], "Comparison": [[4, "comparison"], [5, "comparison"], [5, "id36"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[4, "tool-comparison"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"]], "Local LLMs in Practice": [[5, "local-llms-in-practice"]], "Models Considerations": [[5, "models-considerations"]], "Task Suitability": [[5, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[5, "llama2-benchmark"]], "Performance & Cost": [[5, "performance-cost"]], "Licensing": [[5, "licensing"]], "Open Source LLMs.": [[5, "open-source-llms"]], "Community Support": [[5, "community-support"]], "Customization": [[5, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[5, "mistral-costs"]], "Tools for Local LLM Deployment": [[5, "tools-for-local-llm-deployment"]], "Serving Models": [[5, "serving-models"]], "LLama.cpp": [[5, "llama-cpp"]], "Llamafile": [[5, "llamafile"]], "Ollama": [[5, "ollama"], [7, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[5, "feature-comparison-local"]], "UI": [[5, "ui"]], "LM Studio": [[5, "lm-studio"]], "Jan": [[5, "jan"]], "Open WebUI": [[5, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[5, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[5, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[5, "prompts-dataset"]], "Quantization": [[5, "quantization"]], "Quantization Levels": [[5, "quantization-levels"]], "Benchmarking": [[5, "benchmarking"], [6, "benchmarking"]], "Results": [[5, "results"]], "Quantization Benchmarks": [[5, "quantization-benchmarks"]], "Benchmarking Hardware": [[5, "benchmarking-hardware"]], "Takeaways": [[5, "takeaways"], [6, "takeaways"]], "Safety": [[6, "safety"]], "Safety Risks": [[6, "safety-risks"]], "General AI Safety Risks": [[6, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[6, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[6, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[6, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[6, "llms-specific-safety-risks"]], "Guidance": [[6, "guidance"]], "Governments & Organizations": [[6, "governments-organizations"]], "Private Sector": [[6, "private-sector"]], "OpenAI": [[6, "openai"]], "Anthropic": [[6, "anthropic"]], "Google": [[6, "google"]], "Rubrics": [[6, "rubrics"]], "MLCommons AI Safety Benchmark": [[6, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[6, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[6, "porquoi"]], "Approaches": [[6, "approaches"]], "Red Teaming": [[6, "red-teaming"]], "Constitutional AI": [[6, "constitutional-ai"]], "Explainable AI (XAI)": [[6, "explainable-ai-xai"]], "Designing a Safety Plan": [[6, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[6, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[6, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[6, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[6, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[6, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[6, "phase-6-go-to-market"]], "Common Pitfalls": [[6, "common-pitfalls"]], "Technical Implementation Components": [[6, "technical-implementation-components"]], "Benchmarks & Datasets": [[6, "benchmarks-datasets"]], "SALAD-Bench": [[6, "salad-bench"]], "TruthfulQA": [[6, "truthfulqa"]], "HarmBench": [[6, "harmbench"]], "SafeBench": [[6, "safebench"]], "Tools & Techniques": [[6, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[6, "safety-layer-table"]], "Rules-Based Safety Filtering": [[6, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[6, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[6, "llm-based-safety-filtering"]], "Custom Moderation": [[6, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[6, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[6, "evals-dataset"]], "Bad Samples": [[6, "bad-samples"]], "Good Samples": [[6, "good-samples"]], "Safety Filters": [[6, "safety-filters"]], "LLM-Guard": [[6, "llm-guard"]], "Mistral Moderation API": [[6, "mistral-moderation-api"]], "OpenAI Moderation API": [[6, "openai-moderation-api"]], "Custom Judge Validator": [[6, "custom-judge-validator"]], "Structured Output": [[7, "structured-output"]], "Techniques": [[7, "techniques"]], "Prompt Engineering": [[7, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[7, "json-mode-fine-tuned"]], "Logit Post-Processing": [[7, "logit-post-processing"]], "Outlines": [[7, "outlines"]], "LangChain": [[7, "langchain"]], "Discussion": [[7, "discussion"]], "Best Practices": [[7, "best-practices"]], "Comparing Solutions": [[7, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[7, "structured-output-frameworks"]], "Research and Ongoing Debate": [[7, "research-and-ongoing-debate"]], "Acknowledgements": [[7, "acknowledgements"]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/cost", "notebooks/evals", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/cost.ipynb", "notebooks/evals.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["<span class=\"section-number\">2. </span>About the Book", "<span class=\"section-number\">1. </span>Preface", "Taming LLMs", "<span class=\"section-number\">6. </span>Preference-Based Alignment", "<span class=\"section-number\">8. </span>The Falling Cost Paradox", "<span class=\"section-number\">3. </span>The Evals Gap", "<span class=\"section-number\">7. </span>Local LLMs in Practice", "<span class=\"section-number\">5. </span>Safety", "<span class=\"section-number\">4. </span>Structured Output"], "terms": {"am": [0, 7], "alwai": [0, 3, 4, 5, 8], "do": [0, 3, 4, 5, 6, 7, 8], "which": [0, 3, 4, 5, 6, 7, 8], "cannot": [0, 3, 4, 5, 6, 7], "order": [0, 3, 5, 7, 8], "mai": [0, 1, 3, 4, 5, 6, 7, 8], "learn": [0, 3, 5, 6, 7, 8], "how": [0, 1, 3, 4, 5, 6, 7, 8], "pablo": [0, 5], "picasso": 0, "In": [0, 3, 4, 5, 6, 7, 8], "recent": [0, 3, 4, 5, 6, 7, 8], "year": [0, 2, 3, 4, 5, 6, 7, 8], "larg": [0, 1, 2, 3, 4, 5, 6, 7, 8], "languag": [0, 1, 2, 4, 5, 6, 7, 8], "model": [0, 1, 2, 4, 7, 8], "llm": [0, 1, 3, 8], "have": [0, 1, 3, 4, 5, 6, 7, 8], "emerg": [0, 3, 4, 6, 7, 8], "transform": [0, 1, 3, 5, 6, 7, 8], "forc": [0, 5, 8], "technologi": [0, 1, 4, 5, 6, 7], "promis": [0, 3, 4, 5, 7], "revolution": [0, 7], "build": [0, 2, 3, 5, 6, 7, 8], "product": [0, 1, 2, 3, 4, 5, 6, 7, 8], "interact": [0, 3, 4, 5, 6, 7, 8], "comput": [0, 3, 4, 5, 6, 7, 8], "from": [0, 1, 4, 5, 6, 7, 8], "chatgpt": [0, 3, 4, 6, 8], "github": [0, 2, 3, 4, 5, 6, 7, 8], "copilot": 0, "claud": [0, 3, 5, 6, 7], "artifact": 0, "system": [0, 3, 4, 5, 6, 7, 8], "captur": [0, 1, 3, 5, 6, 7], "public": [0, 3, 5, 6, 7], "imagin": [0, 6], "spark": 0, "gold": [0, 3, 5, 7], "rush": 0, "ai": [0, 3, 4, 5, 6, 8], "power": [0, 2, 3, 4, 5, 6, 7, 8], "applic": [0, 1, 2, 3, 4, 6, 7, 8], "howev": [0, 3, 4, 5, 6, 7, 8], "beneath": 0, "surfac": [0, 5], "technolog": [0, 1, 4, 5, 7], "revolut": [0, 4], "li": [0, 3, 5, 6, 7, 8], "complex": [0, 1, 3, 5, 6, 7, 8], "landscap": [0, 3, 5, 6], "practition": [0, 1, 4, 5, 6, 8], "must": [0, 3, 4, 5, 6, 7, 8], "navig": [0, 2, 5, 6, 7], "focus": [0, 3, 4, 5, 6, 7, 8], "bring": [0, 3, 6], "awar": [0, 3, 4, 5, 7], "limit": [0, 1, 2, 4, 5, 6, 7, 8], "har": [0, 2, 5], "solut": [0, 2, 4, 5, 6, 7], "overcom": [0, 5], "them": [0, 1, 3, 4, 5, 6, 7, 8], "robust": [0, 3, 4, 5, 6, 7, 8], "It": [0, 3, 4, 5, 6, 7, 8], "offer": [0, 3, 4, 5, 6, 7, 8], "critic": [0, 2, 3, 4, 5, 6, 7, 8], "implement": [0, 2, 3, 4, 5, 6, 8], "back": [0, 5, 6, 7, 8], "reproduc": [0, 1, 2, 5, 6], "exampl": [0, 1, 2, 3, 5, 6, 7, 8], "while": [0, 1, 2, 3, 4, 5, 6, 7, 8], "mani": [0, 1, 3, 4, 5, 6, 7, 8], "resourc": [0, 3, 4, 5, 6, 7], "cover": [0, 3, 4, 5, 6, 7, 8], "capabl": [0, 1, 2, 4, 5, 6, 7, 8], "specif": [0, 3, 4, 5, 6, 8], "hidden": [0, 3, 7], "pitfal": [0, 1, 3, 4, 5, 6, 8], "engin": [0, 1, 2, 3, 4, 5, 6, 7], "technic": [0, 1, 2, 3, 5, 6, 8], "manag": [0, 1, 4, 5, 6, 7, 8], "face": [0, 3, 4, 5, 6, 7], "when": [0, 1, 2, 3, 4, 5, 6, 7, 8], "comprehens": [0, 2, 3, 4, 5, 6, 7, 8], "guid": [0, 1, 3, 4, 5, 6, 7, 8], "leverag": [0, 3, 5, 6, 7, 8], "battl": [0, 2], "test": [0, 2, 3, 4, 6, 7, 8], "tool": [0, 1, 3, 4], "throughout": [0, 4, 5, 6, 7], "tackl": [0, 3, 5, 7], "follow": [0, 3, 4, 5, 6, 7, 8], "non": [0, 3, 6, 7, 8], "exhaust": [0, 6], "list": [0, 3, 5, 6, 7, 8], "structur": [0, 3, 4, 5, 6, 7], "un": 0, "reliabl": [0, 1, 3, 4, 5, 6, 7, 8], "struggl": [0, 1, 3, 5, 6, 7, 8], "maintain": [0, 1, 3, 4, 5, 6, 7, 8], "consist": [0, 1, 3, 4, 5, 6, 7, 8], "output": [0, 1, 3, 5, 6, 7], "format": [0, 3, 4, 5, 6, 7, 8], "complic": [0, 7], "integr": [0, 1, 3, 4, 5, 6, 7, 8], "larger": [0, 3, 4, 5, 6, 7, 8], "make": [0, 3, 4, 5, 6, 7, 8], "error": [0, 3, 5, 7, 8], "handl": [0, 3, 4, 5, 6, 7, 8], "more": [0, 1, 3, 5, 6, 7, 8], "size": [0, 3, 5, 6, 7, 8], "length": [0, 3, 5, 6, 8], "constraint": [0, 1, 3, 4, 5, 6, 7, 8], "strict": [0, 6, 7, 8], "token": [0, 1, 3, 4, 5, 6, 7, 8], "both": [0, 3, 4, 5, 6, 7], "input": [0, 3, 5, 6, 7, 8], "requir": [0, 3, 6, 7, 8], "care": [0, 3, 4, 5, 6, 7, 8], "chunk": [0, 3, 6], "strategi": [0, 3, 4, 5, 6, 7, 8], "long": [0, 1, 3, 4, 5, 6, 7, 8], "form": [0, 3, 4, 5, 6, 7, 8], "effect": [0, 1, 3, 4, 5, 7, 8], "tradit": [0, 3, 6, 7], "softwar": [0, 1, 3, 4, 6, 7, 8], "methodologi": [0, 3, 5, 6, 7, 8], "break": [0, 1, 3, 4, 5, 7], "down": [0, 1, 4, 5, 6, 7], "deal": [0, 3, 6], "determinist": [0, 8], "gener": [0, 1, 4, 6, 8], "new": [0, 2, 3, 4, 5, 6, 7, 8], "hallucin": [0, 1, 3, 5, 7, 8], "These": [0, 3, 4, 5, 6, 7, 8], "can": [0, 1, 3, 4, 5, 6, 7, 8], "plausibl": [0, 7], "sound": [0, 7], "entir": [0, 4, 5, 6, 8], "fabric": [0, 5, 7], "inform": [0, 3, 4, 5, 6, 7, 8], "creat": [0, 1, 3, 4, 5, 6, 7, 8], "signific": [0, 3, 4, 5, 6, 7, 8], "risk": [0, 1, 3, 4, 5, 6], "safeti": [0, 3, 5, 8], "align": [0, 4, 5, 6, 7, 8], "harm": [0, 3, 5, 6], "bias": [0, 3, 5, 6, 7, 8], "inappropri": [0, 3, 7], "safeguard": [0, 5, 7], "monitor": [0, 3, 4, 5, 6, 7], "ensur": [0, 3, 4, 5, 6, 7, 8], "safe": [0, 3, 5, 7, 8], "deploy": [0, 3, 4, 5, 7, 8], "cost": [0, 3, 5, 7, 8], "optim": [0, 1, 5, 6, 7], "The": [0, 1, 3, 7, 8], "financi": [0, 1, 3, 4, 5, 7, 8], "oper": [0, 3, 5, 6, 7, 8], "base": [0, 1, 4, 6, 8], "quickli": [0, 3, 4, 6], "becom": [0, 3, 4, 5, 6, 7, 8], "prohibit": [0, 3, 5, 6], "without": [0, 1, 3, 4, 5, 6, 7, 8], "observ": [0, 3, 4, 5, 6, 7, 8], "vendor": [0, 4, 5, 6], "lock": [0, 3, 4, 6], "cloud": [0, 3, 4, 5, 6, 7, 8], "provid": [0, 2, 3, 4, 5, 6, 7, 8], "depend": [0, 3, 4, 5, 6, 8], "through": [0, 1, 2, 3, 4, 5, 6, 7, 8], "proprietari": [0, 3, 6, 7, 8], "infrastructur": [0, 4, 6], "difficult": [0, 3, 5, 7], "switch": [0, 6], "self": [0, 3, 5, 6, 7, 8], "host": [0, 4, 5, 6, 7], "take": [0, 2, 3, 4, 5, 6, 7, 8], "hand": [0, 6, 7, 8], "focu": [0, 2, 3, 4, 5, 6, 7, 8], "access": [0, 3, 4, 5, 6, 7, 8], "all": [0, 1, 3, 4, 5, 6, 7, 8], "ar": [0, 1, 3, 4, 5, 6, 7, 8], "fulli": [0, 3, 5, 7], "document": [0, 3, 4, 5, 6, 7, 8], "allow": [0, 5, 6, 7, 8], "reader": [0, 2], "replic": [0, 5, 7, 8], "result": [0, 3, 4, 5, 7, 8], "exactli": [0, 5, 8], "design": [0, 1, 3, 6, 8], "run": [0, 3, 4, 5, 6, 7, 8], "consum": [0, 3, 4, 5, 6, 7, 8], "grade": [0, 3, 4, 5, 6, 7], "hardwar": [0, 3, 4, 5], "expens": [0, 3, 4, 5, 6, 7], "avail": [0, 3, 4, 5, 6, 7, 8], "notebook": [0, 3, 8], "modifi": [0, 3, 5, 7, 8], "extend": [0, 3, 4, 5, 6, 8], "built": [0, 5, 6, 7, 8], "us": [0, 1, 3, 4, 6, 7, 8], "free": [0, 1, 3, 5, 6, 7], "everyon": [0, 5, 6], "minim": [0, 3, 4, 5, 6, 7, 8], "framework": [0, 3, 4, 5, 6], "wai": [0, 3, 4, 5, 6, 7, 8], "priorit": [0, 3, 5, 6, 7], "transpar": [0, 3, 4, 5, 6, 7], "visibl": [0, 5], "being": [0, 3, 4, 5, 6, 7, 8], "better": [0, 2, 3, 4, 5, 6, 7], "understand": [0, 1, 2, 3, 4, 5, 6, 7, 8], "custom": [0, 3, 5, 8], "flexibl": [0, 4, 5, 6, 7, 8], "adapt": [0, 3, 4, 5, 6, 7], "case": [0, 4, 5, 8], "unlik": [0, 3, 5, 6], "black": [0, 3], "box": [0, 6], "commerci": [0, 3, 5, 6, 7, 8], "most": [0, 3, 4, 5, 6, 7, 8], "freeli": [0, 8], "foster": [0, 3, 5, 7, 8], "reduc": [0, 3, 4, 5, 6, 7, 8], "independ": [0, 5, 7, 8], "freedom": [0, 6, 8], "architectur": [0, 3, 4, 5, 6, 8], "decis": [0, 3, 4, 5, 6, 7], "keep": [0, 3, 5, 6, 7], "principl": [0, 3, 5, 6, 7], "itself": [0, 3, 5, 6, 7], "live": [0, 1, 5, 7], "evolv": [0, 3, 4, 5, 6, 7], "chang": [0, 3, 5, 6, 7, 8], "encourag": [0, 3, 5, 7, 8], "report": [0, 3, 5, 6, 7, 8], "suggest": [0, 3, 5, 6, 7, 8], "improv": [0, 3, 4, 5, 6, 7, 8], "contribut": [0, 4, 5, 6, 7], "via": [0, 3, 4, 5, 6, 7, 8], "pull": [0, 6], "request": [0, 3, 4, 5, 6, 7, 8], "share": [0, 3, 5, 6, 7, 8], "own": [0, 3, 4, 5, 6, 7], "experi": [0, 3, 4, 5, 6, 7, 8], "commun": [0, 3, 4, 5, 7, 8], "propos": [0, 4, 5, 7], "chapter": [0, 3, 4, 5, 6, 7, 8], "section": [0, 3, 4, 5, 6, 7, 8], "found": [0, 3, 4, 5, 6, 8], "http": [0, 1, 2, 3, 4, 5, 6, 7, 8], "com": [0, 2, 3, 4, 5, 6, 7, 8], "souzatharsi": [0, 2, 3, 4, 5, 6, 7, 8], "tamingllm": [0, 2, 3, 4, 5, 6, 7, 8], "whether": [0, 3, 4, 5, 6, 7, 8], "you": [0, 1, 3, 4, 5, 6, 7, 8], "ve": [0, 6], "typo": [0, 7], "want": [0, 1, 3, 6, 7, 8], "welcom": 0, "look": [0, 2, 3, 4, 5, 6, 7], "our": [0, 1, 3, 4, 5, 6, 7, 8], "goal": [0, 1, 3, 5, 7, 8], "discourag": 0, "enabl": [0, 3, 4, 5, 6, 7, 8], "By": [0, 1, 2, 3, 5, 7, 8], "upfront": [0, 2, 4], "equip": [0, 2, 5, 7], "avoid": [0, 3, 5, 6, 7, 8], "current": [0, 2, 3, 4, 5, 7, 8], "discours": [0, 2], "around": [0, 2, 3, 5, 6, 7, 8], "tend": [0, 2, 5, 7], "toward": [0, 3, 5, 7, 8], "extrem": [0, 3, 4, 5, 7], "either": [0, 3, 5, 6, 7, 8], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 5], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7, 8], "rather": [0, 1, 3, 4, 5, 6, 7], "than": [0, 1, 3, 5, 6, 7, 8], "theoret": [0, 3], "examin": [0, 3, 5, 6, 7, 8], "first": [0, 1, 3, 4, 5, 6, 7, 8], "everi": [0, 4, 5, 7], "concept": [0, 3, 5, 7], "illustr": [0, 3, 5, 6, 7, 8], "execut": [0, 5, 6, 7], "immedi": [0, 3, 4, 5, 6], "analysi": [0, 1, 3, 4, 5, 6, 7], "balanc": [0, 3, 4, 5, 6, 7, 8], "help": [0, 3, 4, 5, 6, 7, 8], "intend": [0, 5, 6, 7], "develop": [0, 1, 3, 4, 5, 6, 7, 8], "step": [0, 1, 3, 4, 5, 6, 7, 8], "insight": [0, 3, 4, 5, 6, 7, 8], "along": [0, 3, 4, 5, 6, 7], "guidanc": [0, 3, 8], "could": [0, 1, 3, 4, 5, 6, 7, 8], "derail": 0, "project": [0, 3, 4, 5, 6, 7], "earli": [0, 3, 4, 5, 7, 8], "befor": [0, 3, 4, 5, 7, 8], "thei": [0, 1, 3, 4, 5, 6, 7, 8], "costli": [0, 5, 7], "problem": [0, 1, 2, 3, 4, 6, 7], "too": [0, 1, 3, 5, 6, 7], "late": [0, 3, 4, 7], "lifecycl": [0, 6, 7], "lead": [0, 1, 3, 4, 5, 6, 7, 8], "genai": [0, 1, 3, 7], "initi": [0, 1, 3, 4, 5, 6, 7, 8], "leader": [0, 2, 5], "advoc": [0, 7], "anyon": [0, 7], "seek": [0, 5, 6, 7], "work": [0, 1, 3, 5, 6, 7, 8], "typic": [0, 3, 4, 5, 6, 7, 8], "job": [0, 5, 6, 7], "role": [0, 3, 5, 6, 7, 8], "platform": [0, 5, 6, 7, 8], "backend": [0, 3, 5], "exist": [0, 3, 4, 5, 6], "ml": [0, 7], "transit": [0, 4, 5, 6, 8], "overse": 0, "motiv": [0, 3, 4, 5, 8], "need": [0, 3, 4, 5, 6, 7, 8], "readi": [0, 5, 7], "desir": [0, 3, 5, 8], "perform": [0, 3, 5, 7, 8], "after": [0, 1, 3, 5, 6, 7, 8], "read": [0, 3, 4, 5, 7, 8], "implic": [0, 1, 3, 5, 7], "recommend": [0, 3, 5, 6, 7, 8], "abl": [0, 3, 5, 8], "deploi": [0, 3, 5, 6, 7], "proper": [0, 3, 4, 6, 7, 8], "realist": [0, 3, 4, 7], "effort": [0, 5, 6, 7, 8], "estim": [0, 4, 5, 7], "impact": [0, 3, 4, 5, 6, 7, 8], "timelin": 0, "To": [0, 3, 5, 6, 7, 8], "should": [0, 3, 4, 5, 6, 7, 8], "basic": [0, 3, 5, 6, 7], "program": [0, 5, 6, 8], "knowledg": [0, 3, 5, 6, 7], "introductori": [0, 1, 2], "langchain": [0, 5], "e": [0, 1, 3, 4, 5, 6, 7, 8], "g": [0, 3, 4, 5, 6, 7, 8], "chat": [0, 3, 5, 6, 7, 8], "prompt": [0, 4, 5, 7], "templat": [0, 5, 8], "openai": [0, 3, 5, 6, 8], "anthrop": [0, 3, 8], "similar": [0, 3, 4, 5, 6, 8], "dive": [0, 4], "here": [0, 2, 3, 4, 5, 6, 7, 8], "get": [0, 3, 4, 5, 6, 7, 8], "start": [0, 3, 4, 5, 6, 7, 8], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6, 7], "virtual": [0, 5], "m": [0, 3, 5, 6, 7, 8], "venv": [0, 8], "tame": [0, 3, 4, 5, 6, 7, 8], "env": [0, 3, 5, 7, 8], "bin": [0, 6], "On": [0, 5, 6, 8], "window": [0, 4, 5, 6], "script": [0, 6], "try": [0, 1, 3, 5, 7, 8], "contain": [0, 3, 4, 5, 6, 7, 8], "possibl": [0, 3, 4, 5, 6, 7, 8], "includ": [0, 1, 3, 4, 5, 6, 7, 8], "necessari": [0, 3, 4, 5, 7], "instal": [0, 3, 5, 6, 8], "go": [0, 3, 5, 8], "feel": [0, 6], "prefer": [0, 5, 6, 7, 8], "packag": [0, 4, 5, 6, 8], "pip": [0, 3, 5, 6, 8], "poetri": [0, 7], "file": [0, 3, 5, 6, 7, 8], "root": [0, 3], "directori": [0, 5, 6], "add": [0, 3, 5, 6, 7], "other": [0, 3, 4, 5, 6, 7, 8], "sensit": [0, 3, 4, 5, 6, 7], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 8], "commit": [0, 3, 5, 7], "version": [0, 3, 4, 5, 6, 7, 8], "control": [0, 1, 3, 4, 5, 6, 7, 8], "kept": [0, 5], "privat": [0, 5], "If": [0, 1, 3, 4, 5, 6, 7, 8], "encount": [0, 2, 5, 7], "rate": [0, 3, 4, 5, 6, 7], "consid": [0, 3, 4, 5, 6, 7, 8], "smaller": [0, 3, 4, 5, 6, 8], "retri": [0, 8], "logic": [0, 1, 3, 5, 7], "conflict": [0, 3, 5], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7, 8], "check": [0, 5, 6, 7, 8], "page": [0, 5, 6], "known": [0, 5, 7, 8], "now": [0, 1, 3, 4, 5, 6, 7, 8], "let": [0, 3, 4, 5, 6, 7, 8], "begin": [0, 5, 6, 7, 8], "explor": [0, 1, 3, 4, 5, 6, 7, 8], "dr": [0, 3], "tharsi": [0, 2, 3, 4, 5, 6, 7, 8], "souza": [0, 2, 3, 4, 5, 6, 7, 8], "scientist": [0, 1, 6, 7], "special": [0, 4, 5, 6, 7, 8], "he": [0, 3, 5, 7], "lectur": 0, "columbia": 0, "univers": [0, 5, 6, 7], "master": [0, 4, 6, 8], "scienc": [0, 3, 5, 7], "appli": [0, 3, 5, 6, 7, 8], "analyt": 0, "incom": [0, 5], "head": [0, 3, 5, 7, 8], "equiti": [0, 5], "citadel": 0, "former": [0, 1, 5, 6], "senior": [0, 5], "vp": 0, "two": [0, 3, 4, 5, 6, 7, 8], "sigma": [0, 3], "invest": [0, 3, 4, 5, 7], "also": [0, 3, 4, 5, 6, 7, 8], "enjoi": 0, "mentor": 0, "under": [0, 3, 4, 5, 6, 7, 8], "repres": [0, 3, 4, 5, 6, 8], "student": [0, 3, 7], "profession": [0, 3, 5, 7, 8], "divers": [0, 3, 4, 5, 7], "global": [0, 5, 7], "ecosystem": [0, 4, 5, 6], "With": [0, 3, 5, 6, 7, 8], "over": [0, 2, 3, 4, 5, 6, 7, 8], "15": [0, 5, 6, 7, 8], "deliv": [0, 4, 5, 6], "across": [0, 1, 3, 4, 5, 6, 7, 8], "startup": 0, "fortun": 0, "500": [0, 3, 5, 7], "compani": [0, 3, 4, 5, 7, 8], "numer": [0, 4, 5, 7, 8], "scholarli": 0, "frequent": [0, 5, 6, 8], "speaker": [0, 5], "academ": [0, 3, 5, 7], "busi": [0, 5, 6, 7], "confer": [0, 8], "ground": [0, 3, 5, 6], "background": [0, 1, 5, 6], "draw": [0, 3, 5, 7, 8], "scale": [0, 3, 4, 5, 6, 7, 8], "stage": [0, 3, 7, 8], "major": [0, 3, 4, 5, 7, 8], "institut": [0, 5, 7], "well": [0, 3, 4, 5, 6, 7, 8], "advis": [0, 3], "profit": [0, 5, 7, 8], "organ": [0, 3, 4, 5, 6], "uniqu": [0, 3, 4, 5, 6, 7, 8], "bridg": [0, 6, 7], "gap": [0, 1, 3, 4, 6, 7], "between": [0, 1, 3, 4, 5, 6, 7, 8], "potenti": [0, 1, 3, 4, 5, 6, 7, 8], "next": [0, 1, 3, 4, 5, 6, 7, 8], "hold": [0, 3, 5], "ph": [0, 7], "d": [0, 3, 4, 5, 6, 7, 8], "ucl": 0, "london": 0, "phil": [0, 7], "sc": 0, "b": [0, 4, 5, 6, 7, 8], "tell": [1, 3, 7], "mere": [1, 5], "what": [1, 3, 4, 5, 6, 7, 8], "someth": [1, 5, 6], "i": [1, 2, 4, 5, 6, 7, 8], "emanuel": [1, 3, 5, 7], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7, 8], "altern": [1, 3, 4, 5, 6, 7], "titl": [1, 2, 3, 4, 5, 6, 7, 8], "thi": [1, 2, 3, 4, 5, 6, 7, 8], "book": [1, 5], "been": [1, 3, 4, 5, 6, 7], "behav": 1, "badli": 1, "come": [1, 3, 5, 6, 7, 8], "notic": [1, 3, 4, 5, 7], "parallel": [1, 3, 5, 6], "": [1, 3, 4, 5, 6, 7, 8], "semin": [1, 7], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7, 8], "caution": 1, "against": [1, 3, 4, 5, 6, 7], "treat": [1, 5, 7], "perfect": [1, 5, 6], "represent": [1, 5, 6, 7], "realiti": [1, 7], "aim": [1, 3, 4, 5, 6, 7, 8], "highlight": [1, 3, 5, 6, 7, 8], "practic": [1, 3, 4, 5, 7], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 5, 6], "fail": [1, 3, 5, 7], "we": [1, 3, 4, 5, 6, 7, 8], "mistak": [1, 7], "approxim": [1, 4, 5, 8], "full": [1, 3, 4, 5, 6, 7, 8], "assumpt": [1, 5, 7], "core": [1, 4, 5, 6, 7], "premis": [1, 6], "hi": [1, 5, 7, 8], "aspect": [1, 3, 5, 7], "world": [1, 3, 4, 5, 6, 7, 8], "inher": [1, 2, 3, 5, 7, 8], "involv": [1, 3, 4, 5, 6, 7, 8], "simplif": 1, "argu": [1, 4, 7, 8], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 5, 7], "partli": 1, "becaus": [1, 3, 5, 7], "peopl": [1, 3, 5, 6, 7], "put": [1, 5, 6], "much": [1, 3, 5, 6], "faith": 1, "mathemat": [1, 5, 6, 8], "recogn": [1, 3, 5, 7], "human": [1, 4, 5, 6, 7, 8], "behavior": [1, 3, 5, 6, 7], "market": [1, 4, 5, 6, 8], "dynam": [1, 3, 5, 7], "fact": [1, 3, 5, 7], "reason": [1, 3, 5, 6, 7, 8], "Their": [1, 5, 8], "respons": [1, 4, 5, 6, 7, 8], "often": [1, 3, 4, 5, 6, 7, 8], "convinc": [1, 3], "probabilist": [1, 5, 8], "train": [1, 4, 5, 6, 7, 8], "data": [1, 4, 5, 6, 7, 8], "true": [1, 3, 4, 5, 7, 8], "even": [1, 3, 4, 5, 6, 7, 8], "though": [1, 3, 4, 5, 6, 7, 8], "insist": 1, "machin": [1, 3, 6, 7, 8], "todai": [1, 4, 6, 8], "grow": [1, 3, 5, 6, 7, 8], "pervas": [1, 7], "belief": [1, 6, 7], "solv": [1, 3, 4, 5, 6, 7, 8], "ani": [1, 3, 4, 5, 6, 7, 8], "context": [1, 3, 4, 5, 6, 7, 8], "content": 1, "wish": [1, 5], "user": [1, 4, 5, 6, 8], "moreov": 1, "were": [1, 3, 5, 6, 7, 8], "predict": [1, 3, 5, 6, 7, 8], "chatbot": [1, 3, 5, 6, 7], "twist": [1, 7], "wrap": [1, 6, 8], "further": [1, 3, 4, 5, 6, 7, 8], "daili": [1, 4, 6, 7], "life": [1, 5, 6, 7], "workflow": [1, 4, 5, 6, 7, 8], "affect": [1, 5, 6, 7], "decid": [1, 3, 5], "action": [1, 3, 5, 7], "coupl": [1, 6], "lack": [1, 3, 5, 7, 8], "pose": [1, 3, 5, 7, 8], "still": [1, 4, 5, 6, 7], "figur": [1, 5, 6], "out": [1, 3, 4, 5, 6, 7, 8], "serv": [1, 3, 4, 5, 7, 8], "builder": [1, 6], "who": [1, 3, 5, 6, 7, 8], "remain": [1, 3, 4, 5, 6, 7], "clear": [1, 3, 4, 5, 6, 7, 8], "ei": 1, "about": [1, 3, 4, 5, 6, 7, 8], "therefor": [1, 3, 5, 6, 7], "end": [1, 3, 4, 5, 6, 7, 8], "detail": [1, 3, 4, 5, 6, 7, 8], "python": [1, 2, 5, 6, 7, 8], "code": [1, 2, 3, 5, 6, 7, 8], "diminish": [1, 3, 4, 5], "promot": [1, 3, 5, 7], "nuanc": [1, 3, 5, 6, 7, 8], "acknowledg": [1, 5, 7], "within": [1, 3, 4, 5, 7, 8], "trustworthi": [1, 7], "taught": 1, "u": [1, 3, 5, 7, 8], "where": [1, 3, 4, 5, 6, 7, 8], "der11": 1, "why": [1, 3, 5, 7, 8], "confus": [1, 4, 7], "illus": 1, "disast": [1, 5], "wall": [1, 6], "street": [1, 6], "press": [1, 5, 6], "isbn": [1, 3, 5], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7, 8], "googl": [1, 5, 6, 8], "co": [1, 3, 4, 5, 6, 7], "uk": [1, 7], "id": [1, 5, 6, 7, 8], "lke_cwm4wm8c": 1, "sign": [2, 5, 7], "up": [2, 3, 4, 5, 6, 7], "receiv": [2, 3, 5, 6, 7, 8], "updat": [2, 3, 4, 5, 6, 7, 8], "abstract": [2, 5, 7, 8], "heavili": [2, 3, 4, 5, 7, 8], "gloss": 2, "fundament": [2, 3, 5, 6, 7, 8], "challeng": [2, 3, 4, 5, 6, 7, 8], "convers": [2, 3, 4, 5, 6, 7, 8], "kei": [2, 3, 4, 6, 7, 8], "proven": [2, 4], "yet": [2, 3, 4, 5, 7], "concret": [2, 4, 7, 8], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7, 8], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7, 8], "author": [2, 3, 4, 5, 6, 7, 8], "t": [2, 3, 4, 5, 6, 7, 8], "p": [2, 3, 4, 5, 6, 7, 8], "2024": [2, 3, 4, 5, 7, 8], "journal": [2, 3, 4, 5, 6, 7, 8], "repositori": [2, 3, 4, 5, 6, 7, 8], "valu": [3, 5, 6, 7, 8], "its": [3, 4, 5, 6, 7, 8], "privileg": 3, "abov": [3, 5, 7], "soon": [3, 8], "lose": [3, 5], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7, 8], "3": [3, 4, 5, 6, 8], "5": [3, 4, 5, 6, 8], "2022": [3, 5, 6, 7], "mark": [3, 5, 7], "pivot": [3, 5, 6], "moment": 3, "histori": [3, 4, 5, 6], "artifici": [3, 5, 6, 7], "intellig": [3, 5, 6, 7], "five": [3, 5, 7], "dai": [3, 4, 5, 6, 7, 8], "launch": [3, 5, 7], "attract": [3, 5], "million": [3, 4, 5, 6], "month": [3, 4, 5, 6, 7], "becam": [3, 4], "fastest": [3, 5, 7], "100": [3, 4, 5, 6, 7, 8], "monthli": [3, 4, 5], "rais": [3, 4, 5, 7], "intrigu": 3, "question": [3, 4, 5, 6, 7, 8], "did": [3, 5, 8], "dramat": [3, 4, 5, 6, 8], "predecessor": 3, "gpt": [3, 4, 5, 6, 7, 8], "had": [3, 5], "same": [3, 5, 6, 7, 8], "number": [3, 4, 5, 6, 7, 8], "paramet": [3, 4, 5, 6, 7, 8], "far": [3, 4, 6, 7], "less": [3, 4, 5, 6, 7], "attent": [3, 4, 6], "arguabl": [3, 6], "feedback": [3, 5, 7, 8], "abil": [3, 4, 5, 6, 7, 8], "least": [3, 5, 7], "ey": 3, "breakthrough": [3, 7], "demonstr": [3, 4, 5, 6, 7, 8], "crucial": [3, 4, 6, 7, 8], "greater": [3, 5, 6, 7], "process": [3, 4, 5, 6, 7], "modern": [3, 5, 8], "techniqu": [3, 4, 5, 6], "direct": [3, 5, 6, 7], "rafailov": 3, "et": [3, 4, 5, 6, 7, 8], "al": [3, 4, 5, 6, 7, 8], "present": [3, 5, 6, 7, 8], "autom": [3, 4, 5, 7, 8], "fashion": [3, 8], "open": [3, 4, 5, 7, 8], "sourc": [3, 4, 5, 7, 8], "common": [3, 4, 5, 6, 8], "pre": [3, 4, 5, 6, 7, 8], "default": [3, 5, 6, 7, 8], "state": [3, 5, 6, 7, 8], "art": [3, 5, 7], "object": [3, 4, 5, 6, 7, 8], "given": [3, 4, 5, 6, 7, 8], "webpag": 3, "internet": [3, 5], "veri": [3, 4, 5, 6, 7], "ask": [3, 5, 6, 7, 8], "instruct": [3, 4, 5, 6, 7, 8], "sai": [3, 8], "ouyang": [3, 7], "2": [3, 4, 5, 8], "explain": [3, 5], "moon": 3, "land": [3, 5, 6], "6": [3, 4, 5, 6], "old": [3, 5], "import": [3, 4, 5, 6, 7, 8], "pipelin": [3, 4, 5, 6, 7, 8], "pipe": [3, 7], "text": [3, 4, 5, 6, 7, 8], "gpt2": [3, 5], "msg": 3, "short": [3, 5, 7, 8], "sentenc": [3, 5, 7], "_": [3, 5, 7, 8], "rang": [3, 4, 5, 6, 7, 8], "len": [3, 5, 6, 7, 8], "print": [3, 4, 5, 6, 7, 8], "f": [3, 4, 5, 6, 7, 8], "n": [3, 5, 6, 7, 8], "1": [3, 4, 5, 6, 8], "0": [3, 4, 5, 6, 7, 8], "generated_text": [3, 8], "good": [3, 5, 6, 8], "idea": [3, 4, 6, 7, 8], "one": [3, 4, 5, 6, 7, 8], "those": [3, 5, 7, 8], "littl": [3, 5], "green": [3, 7], "dot": [3, 4], "Then": [3, 4, 5], "line": [3, 5, 6, 7], "later": [3, 5, 6, 7, 8], "re": [3, 4, 5, 6, 7, 8], "alreadi": [3, 5, 8], "movi": 3, "theori": [3, 5], "some": [3, 5, 6, 7, 8], "mean": [3, 4, 5, 6, 7, 8], "word": [3, 4, 5, 8], "tepid": 3, "articl": [3, 5, 6, 7], "sure": [3, 5, 7, 8], "lunar": 3, "As": [3, 4, 5, 6, 7, 8], "see": [3, 4, 5, 6, 7, 8], "coher": [3, 5, 6, 8], "explan": [3, 5, 7, 8], "child": [3, 5, 7], "nonsens": [3, 7], "meander": 3, "unrel": [3, 5, 7], "topic": [3, 5, 6, 7, 8], "simpl": [3, 5, 6, 7, 8], "appropri": [3, 4, 5, 6, 7, 8], "young": [3, 5, 7], "instead": [3, 4, 5, 6, 7, 8], "address": [3, 4, 5, 6, 7, 8], "issu": [3, 5, 7, 8], "introduc": [3, 5, 6, 7, 8], "rlhf": [3, 4, 7, 8], "intent": [3, 7], "wide": [3, 4, 5, 6, 7, 8], "task": [3, 4, 7, 8], "fig": [3, 4, 5, 6, 7, 8], "collect": [3, 5, 6, 7, 8], "sampl": [3, 6, 8], "label": [3, 5, 6, 7, 8], "comparison": 3, "reward": [3, 5, 6, 7], "sever": [3, 4, 5, 6, 7, 8], "rank": [3, 5, 6, 7], "best": [3, 4, 5, 6, 7], "worst": 3, "rm": [3, 6], "reinforc": [3, 5, 6, 7], "write": [3, 5, 6, 7, 8], "stori": [3, 7], "frog": 3, "calcul": [3, 4, 5, 6, 7, 8], "score": [3, 4, 5, 6, 7, 8], "ppo": [3, 6], "proxim": [3, 6], "iter": [3, 5, 6, 7, 8], "accur": [3, 4, 5, 6, 7], "undesir": [3, 7], "simplifi": [3, 5, 6, 8], "view": [3, 5, 7], "show": [3, 4, 5, 6, 7, 8], "progress": [3, 4, 7], "pattern": [3, 4, 5, 6, 7, 8], "ha": [3, 4, 5, 6, 7, 8], "instanc": [3, 4, 5, 6, 7], "directli": [3, 4, 5, 6, 7, 8], "For": [3, 4, 5, 6, 7, 8], "llama": [3, 4, 5, 7, 8], "guard": 3, "team": [3, 5, 6, 8], "8b": [3, 6, 7, 8], "wa": [3, 4, 5, 6, 7, 8], "classif": [3, 5, 6, 7, 8], "bypass": [3, 7], "similarli": [3, 4, 5, 6, 7], "zephyr": 3, "7b": [3, 5, 6, 7, 8], "alpha": [3, 5, 8], "mistral": [3, 8], "publicli": [3, 5, 8], "assist": [3, 5, 6, 7, 8], "paper": [3, 5, 6, 7, 8], "compon": [3, 5, 6], "particular": [3, 4, 5, 6, 7, 8], "foundat": [3, 4, 5, 6, 7], "advanc": [3, 4, 5, 6, 7, 8], "method": [3, 5, 7, 8], "strong": [3, 5, 6, 7, 8], "At": [3, 4, 5, 6, 8], "high": [3, 4, 5, 6, 7, 8], "level": [3, 4, 5, 7, 8], "carefulli": [3, 4, 5, 6, 7, 8], "curat": [3, 5, 6], "purpos": [3, 5, 6, 7, 8], "exhibit": [3, 5, 6, 7], "domain": [3, 4, 5, 6, 7], "emploi": [3, 5, 7, 8], "prove": [3, 5, 7], "particularli": [3, 4, 5, 6, 7, 8], "valuabl": [3, 5, 6, 8], "scenario": [3, 5, 6, 7, 8], "precis": [3, 4, 5, 6, 7, 8], "style": [3, 5], "tone": 3, "expertis": [3, 5, 7], "medic": [3, 5, 6], "legal": [3, 5, 6, 7], "field": [3, 5, 6, 7, 8], "adher": [3, 5, 7, 8], "guidelin": [3, 5, 7], "servic": [3, 4, 5, 6, 7], "standard": [3, 4, 5, 6, 7], "approach": [3, 5, 6, 8], "each": [3, 4, 5, 6, 7, 8], "distinct": [3, 5, 6, 7, 8], "advantag": [3, 4, 5, 6, 7, 8], "weight": [3, 4, 5, 6, 7, 8], "maximum": [3, 5, 6, 7], "lora": [3, 6, 7], "low": [3, 4, 5, 6, 7, 8], "hu": [3, 7, 8], "2021": [3, 4, 5], "small": [3, 4, 5, 6, 8], "matric": 3, "effici": [3, 4, 5, 6, 7, 8], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 6, 7, 8], "combin": [3, 4, 5, 6, 7, 8], "memori": [3, 4, 5, 6, 7], "footprint": [3, 4, 6], "modest": 3, "increas": [3, 4, 5, 6, 7, 8], "likelihood": [3, 5, 7, 8], "obtain": [3, 5, 6, 7, 8], "probabl": [3, 5, 6, 8], "outcom": [3, 5, 7, 8], "hong": [3, 5], "unintend": [3, 7], "suboptim": 3, "seen": [3, 5, 7], "research": [3, 4, 5, 6], "maxim": [3, 5], "shown": [3, 5, 6, 7], "alon": [3, 5, 6, 7], "gain": [3, 4, 5, 6, 7], "achiev": [3, 4, 5, 6, 7, 8], "bai": [3, 5, 7], "touvron": [3, 6], "sinc": [3, 4, 5, 6, 7, 8], "main": [3, 5, 6, 7, 8], "categori": [3, 5, 6, 7, 8], "algorithm": [3, 5, 7], "meanwhil": [3, 6], "superior": [3, 5, 7], "benchmark": 3, "xu": [3, 5, 6, 7], "schulman": [3, 7], "2017": [3, 5], "popular": [3, 6, 8], "understood": 3, "set": [3, 4, 5, 6, 7, 8], "rule": [3, 5, 6, 8], "govern": [3, 5], "reflect": [3, 5, 6, 7], "anoth": [3, 5, 6, 7], "adjust": [3, 5, 6, 7, 8], "One": [3, 4, 5, 6, 7, 8], "strength": [3, 5, 6, 7], "2024c": [3, 6], "real": [3, 4, 5, 6, 7, 8], "noisi": 3, "delai": [3, 5, 6, 7], "subsequ": [3, 8], "situat": [3, 5, 7], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7, 8], "stabl": [3, 5], "prevent": [3, 4, 5, 7, 8], "overreact": 3, "converg": 3, "due": [3, 5, 6, 7], "simplic": [3, 6], "award": [3, 5], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 6, 7, 8], "4": [3, 4, 5, 6, 8], "fit": [3, 4, 5, 7, 8], "pair": [3, 5, 7], "rl": [3, 7], "find": [3, 4, 5, 6, 7, 8], "contrast": [3, 4, 5, 6, 7, 8], "satisfi": [3, 5], "implicit": [3, 5, 7], "whose": [3, 5], "correspond": [3, 5, 8], "extract": [3, 4, 5, 6, 7, 8], "close": [3, 5, 6, 7], "compar": [3, 4, 5, 6, 7], "assign": [3, 5, 6, 7, 8], "higher": [3, 4, 5, 6, 8], "kl": [3, 6], "diverg": [3, 6], "origin": [3, 4, 5, 6, 7, 8], "preserv": [3, 6, 7, 8], "defin": [3, 4, 5, 6, 7, 8], "equat": 3, "mathcal": 3, "l": [3, 5], "pi_": 3, "theta": [3, 8], "ref": 3, "mathbb": [3, 8], "x": [3, 5, 6, 7, 8], "y_w": 3, "y_l": 3, "sim": [3, 8], "left": [3, 6], "log": [3, 4, 5, 6], "beta": [3, 5, 7, 8], "underbrac": 3, "frac": [3, 6, 7], "color": [3, 5], "red": 3, "right": [3, 5, 6, 7], "respect": [3, 5, 6, 7], "deviat": [3, 5, 6, 7], "straightforward": [3, 5, 6, 7, 8], "librari": [3, 4, 5, 6, 7, 8], "huggingfac": [3, 4, 5, 6, 7], "trl": [3, 6, 7], "2024d": [3, 6], "suit": [3, 5, 7], "friendli": [3, 5, 6], "interfac": [3, 4, 5, 6, 7, 8], "featur": [3, 5, 6, 7, 8], "distinguish": [3, 5, 7], "scalabl": [3, 5, 7], "doe": [3, 5, 6, 7, 8], "pretrain": [3, 5, 6], "hou": [3, 5, 6], "poor": [3, 5, 7], "return": [3, 4, 5, 6, 7, 8], "addit": [3, 4, 5, 6, 7, 8], "benefit": [3, 4, 5, 6, 7, 8], "fix": [3, 5, 6, 7], "invers": 3, "trend": [3, 4, 5, 7], "util": [3, 4, 5, 6, 7], "rapid": [3, 5, 6, 7], "yield": [3, 4, 5], "onli": [3, 4, 5, 6, 7, 8], "margin": [3, 5, 7, 8], "capit": [3, 5, 8], "inaccuraci": [3, 5], "nois": 3, "dure": [3, 4, 5, 6, 7, 8], "accuraci": [3, 4, 5, 6, 7, 8], "lag": [3, 5, 7], "significantli": [3, 4, 5, 6, 7], "indic": [3, 5, 6, 7, 8], "signal": [3, 7], "plateau": 3, "sophist": [3, 5, 6, 7], "previou": [3, 5, 6, 8], "deriv": [3, 5, 6], "pairwis": [3, 5], "feng": [3, 7], "substanti": [3, 4, 5, 6, 7], "wors": [3, 6, 8], "influenc": [3, 5, 7, 8], "success": [3, 4, 5, 6, 7, 8], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 6, 7], "loss": [3, 4, 5, 6, 7], "gradient": [3, 5, 7], "dispref": 3, "unbalanc": 3, "trajectori": [3, 4], "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6, 7], "forward": [3, 5, 7], "futur": [3, 4, 5, 6, 7], "phenomenon": [3, 7, 8], "degrad": [3, 4, 5, 6, 7, 8], "danger": [3, 6, 7], "loop": [3, 5, 6, 7], "recurs": 3, "kazdan": 3, "qualiti": [3, 4, 5, 6, 7, 8], "pollut": 3, "replac": [3, 5, 6], "amplif": 3, "reduct": [3, 4, 5, 6], "express": [3, 4, 5, 7, 8], "catastroph": [3, 7], "forget": [3, 8], "previous": [3, 5, 7, 8], "mitig": [3, 4, 5, 6, 7, 8], "mix": [3, 5, 7, 8], "metric": [3, 6, 7], "sz\u00e9p": 3, "regular": [3, 5, 6, 7, 8], "relev": [3, 4, 5, 6, 7], "scarc": 3, "behaviour": 3, "strateg": [3, 5, 6, 7, 8], "compli": [3, 4, 5, 6, 7, 8], "modif": [3, 5, 6, 7], "outsid": [3, 5], "evidenc": 3, "landmark": 3, "askel": [3, 5, 7], "2024a": [3, 6, 8], "dec": 3, "explicitli": [3, 5, 6], "so": [3, 4, 5, 7, 8], "might": [3, 4, 5, 6, 7, 8], "pretend": 3, "adopt": [3, 5, 6, 7, 8], "actual": [3, 5, 6, 7, 8], "onc": [3, 5, 6, 7], "complet": [3, 5, 6, 7, 8], "describ": [3, 5, 6, 7], "harmless": [3, 7], "told": 3, "retrain": [3, 6], "queri": [3, 5], "tier": [3, 4, 5, 7], "paid": [3, 5], "column": [3, 5, 7], "condit": [3, 5, 8], "toxic": [3, 6, 7], "excerpt": [3, 5, 6], "scratchpad": 3, "refus": [3, 7, 8], "happen": [3, 7], "bomb": [3, 7], "engag": [3, 4, 5, 6, 7, 8], "intern": [3, 5, 7], "unmonitor": 3, "longer": [3, 5, 6], "believ": [3, 5, 6, 7, 8], "act": [3, 5, 6, 7, 8], "therebi": [3, 5], "reveal": [3, 4, 5, 6, 7], "complianc": [3, 4, 5, 6, 7], "phase": [3, 4, 5, 6, 8], "natur": [3, 5, 6, 7, 8], "evid": [3, 5, 6, 7, 8], "seemingli": 3, "surpris": 3, "appear": [3, 5, 7, 8], "criteria": [3, 5, 7], "underli": [3, 5, 7, 8], "anim": [3, 7], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 5, 6, 7, 8], "explicit": [3, 5, 6, 7, 8], "chain": [3, 5], "thought": [3, 5, 6, 8], "opaqu": 3, "aris": [3, 5, 7], "opu": 3, "sonnet": [3, 5, 6], "wherea": [3, 5], "haiku": [3, 7], "persist": [3, 4], "resist": [3, 5], "embed": [3, 4, 5, 6], "doesn": [3, 5, 6, 8], "anti": [3, 5], "lab": 3, "exfiltr": [3, 7], "protect": [3, 4, 5, 6, 7], "Not": [3, 5, 7], "malici": [3, 5, 7], "support": [3, 5, 7, 8], "concern": [3, 5, 6, 7], "mechan": [3, 4, 5, 6, 7, 8], "insuffici": [3, 5], "don": [3, 5, 8], "concerningli": 3, "call": [3, 4, 5, 6, 7, 8], "detect": [3, 5, 7, 8], "decept": [3, 5, 7], "warrant": [3, 7], "deeper": [3, 5], "scrutini": [3, 5, 7], "reli": [3, 5, 7, 8], "cross": [3, 5, 6, 7], "circular": 3, "bia": [3, 5, 7, 8], "truli": [3, 5, 6], "trust": [3, 5, 7, 8], "referenti": 3, "ly": 3, "hood": [3, 8], "deep": [3, 5, 7, 8], "mechanist": 3, "drive": [3, 4, 7, 8], "correl": [3, 4, 5, 6], "miss": [3, 5, 7], "confound": 3, "factor": [3, 4, 5, 6, 8], "establish": [3, 4, 5, 6, 7], "attempt": [3, 5, 7, 8], "causal": [3, 5], "heavi": 3, "relianc": [3, 4, 5, 7], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 5], "henc": [3, 4, 5, 6, 7, 8], "agenc": [3, 5, 7], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 5], "failur": [3, 4, 5, 7], "mode": [3, 6, 7], "map": [3, 4, 5, 6, 8], "cleanli": 3, "analogi": 3, "excel": [3, 5, 6, 7, 8], "review": [3, 4, 5, 6, 7, 8], "prof": 3, "jacob": [3, 5, 6, 7], "andrea": [3, 5, 7], "yoshua": [3, 7], "bengio": [3, 7], "jasjeet": 3, "sekhon": 3, "rohin": 3, "shah": 3, "2024b": [3, 6, 8], "assum": [3, 5, 7], "acm": [3, 7], "inc": [3, 5, 8], "dedic": [3, 5, 6, 7], "democrat": [3, 4, 5, 8], "educ": [3, 5, 7], "k": [3, 5, 7, 8], "12": [3, 4, 5, 6, 7], "name": [3, 4, 5, 6, 7, 8], "smolk": 3, "ll": [3, 5, 6], "walk": 3, "measur": [3, 4, 5, 6, 7], "huggingfacetb": [3, 8], "360m": [3, 5, 6], "compact": [3, 5, 6, 7], "part": [3, 4, 5, 7, 8], "famili": [3, 7, 8], "publish": [3, 7, 8], "api": [3, 4, 5, 6, 8], "local": [3, 4, 5, 7, 8], "infer": [3, 4, 5, 6, 7, 8], "remot": [3, 5], "load": [3, 4, 5, 6, 7, 8], "store": [3, 4, 5, 7], "eventu": [3, 5, 6], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 7], "worth": [3, 4, 5, 6, 8], "choic": [3, 5, 6, 7, 8], "lightweight": [3, 4, 5, 6, 8], "suitabl": [3, 5, 7], "devic": [3, 4, 5, 6, 8], "Its": [3, 5, 6], "candid": [3, 5, 6], "said": [3, 5, 7], "necessarili": [3, 4, 5, 6, 7], "par": [3, 5], "mind": [3, 5, 6, 7, 8], "factual": [3, 5, 6, 7], "inconsist": [3, 5, 7], "guardrail": [3, 7], "articul": 3, "uphold": [3, 7], "employe": [3, 5], "stakehold": [3, 5, 7], "expect": [3, 4, 5, 6, 7, 8], "regard": [3, 5, 6, 7], "ethic": [3, 5, 6, 7], "conduct": [3, 5], "social": [3, 5, 7], "mission": [3, 7], "vision": [3, 5, 6, 7], "cultur": [3, 5, 6, 7], "account": [3, 4, 5, 7], "codifi": 3, "mlcommon": 3, "vidgen": [3, 7], "encompass": [3, 4, 7, 8], "seven": 3, "hazard": [3, 5, 7], "violent": [3, 7], "crime": [3, 7], "sex": [3, 7], "relat": [3, 4, 5, 6, 7, 8], "sexual": [3, 7], "exploit": [3, 4, 5, 7], "indiscrimin": [3, 7], "weapon": [3, 7], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 5], "explos": [3, 4, 7], "cbrne": 3, "suicid": [3, 7], "hate": [3, 7], "speech": [3, 7], "below": [3, 5, 6, 7, 8], "markdown": [3, 5, 6, 7], "written": [3, 5], "english": [3, 4], "o": [3, 5, 7, 8], "ipython": [3, 5, 7], "displai": [3, 5, 7, 8], "def": [3, 5, 7, 8], "load_polici": 3, "policy_path": 3, "path": [3, 5, 6, 7], "join": [3, 5, 7], "genai_polici": 3, "md": [3, 5, 6, 7, 8], "r": [3, 5, 6, 7, 8], "policy_cont": 3, "classroom": [3, 7], "accept": [3, 5, 6, 7], "unaccept": [3, 6], "ag": [3, 5, 7], "subject": [3, 5, 6], "posit": [3, 4, 5, 6, 7, 8], "confid": [3, 5], "inclus": [3, 5, 7, 8], "celebr": 3, "definit": [3, 4, 5, 8], "creativ": [3, 4, 5, 6, 8], "math": [3, 5, 6], "tip": [3, 7], "digit": [3, 4, 5], "literaci": 3, "onlin": [3, 4, 5, 6, 7, 8], "histor": [3, 5], "violenc": [3, 7], "physic": [3, 5, 7], "fight": [3, 7], "crimin": [3, 7], "illeg": [3, 7], "glorifi": [3, 7], "person": [3, 5, 6, 7, 8], "eat": [3, 7], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 5, 7], "discriminatori": [3, 7], "bulli": [3, 7], "harass": [3, 5, 7], "target": [3, 4, 5, 6, 7, 8], "group": [3, 5, 6, 7], "religi": [3, 6, 7], "racial": [3, 5, 7], "ethnic": [3, 7], "gender": [3, 5, 7], "discrimin": [3, 5, 7], "adult": [3, 7], "profan": [3, 7], "relationship": [3, 5], "substanc": [3, 5], "drug": [3, 7], "gambl": 3, "bet": 3, "protocol": [3, 5, 7], "redirect": 3, "alert": [3, 4], "record": [3, 5, 6, 7], "audit": [3, 4, 5], "teacher": [3, 7], "parent": [3, 7], "continu": [3, 4, 5, 6, 7, 8], "construct": [3, 5, 6, 7, 8], "compliant": [3, 7], "violat": [3, 5, 7], "intens": [3, 5, 8], "demand": [3, 4, 5, 6, 7, 8], "especi": [3, 5, 6, 7, 8], "dong": [3, 5, 7], "There": [3, 5, 6, 7, 8], "rlaif": [3, 7], "give": [3, 5, 7], "rise": [3, 7], "kim": [3, 5, 7], "meta": [3, 4, 5, 6, 7], "wu": [3, 5, 7, 8], "scheme": [3, 4, 6], "inspir": [3, 7], "schema": [3, 8], "row": [3, 5, 7], "match": [3, 4, 5, 6, 7, 8], "boundari": [3, 4, 5, 7], "craft": [3, 4, 5, 7, 8], "elicit": [3, 7, 8], "unalign": 3, "panda": [3, 5, 7], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 5, 7], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 6], "class": [3, 5, 7, 8], "userpromptgener": 3, "pd": [3, 5, 7], "pydant": [3, 5, 7, 8], "basemodel": [3, 5, 7, 8], "time": [3, 4, 5, 6, 7, 8], "type": [3, 4, 5, 6, 7, 8], "dotenv": [3, 5, 7, 8], "load_dotenv": [3, 5, 7, 8], "environ": [3, 4, 5, 6, 7, 8], "variabl": [3, 5, 7, 8], "overrid": [3, 7, 8], "userprompt": 3, "user_prompt": 3, "str": [3, 5, 7, 8], "__init__": [3, 7, 8], "4o": [3, 5, 6, 7, 8], "mini": [3, 5, 6, 7, 8], "client": [3, 5, 6, 7, 8], "_generate_prompt": 3, "batch": [3, 4, 5, 6], "system_prompt": [3, 7], "pars": [3, 5, 7, 8], "messag": [3, 4, 5, 6, 7, 8], "response_format": [3, 5, 7, 8], "except": [3, 5, 7, 8], "generate_prompt": 3, "num_prompt": [3, 6], "int": [3, 5, 7], "save_to_csv": 3, "multipl": [3, 4, 5, 6, 7, 8], "arg": [3, 5, 7, 8], "option": [3, 4, 5, 6, 7, 8], "filepath": 3, "save": [3, 4, 5, 6, 7], "datafram": [3, 5, 7], "all_prompt": 3, "sleep": 3, "enclos": [3, 7], "quot": [3, 4, 5, 6], "startswith": [3, 7], "els": [3, 5, 7], "df": [3, 5, 7], "to_csv": [3, 7], "index": [3, 5, 6, 7, 8], "fals": [3, 5, 6, 7, 8], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 7], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7, 8], "secur": [3, 4, 5, 7, 8], "breach": [3, 5, 7], "manipul": [3, 5, 6, 7, 8], "10": [3, 5, 6, 7, 8], "to_markdown": [3, 7], "me": [3, 7, 8], "hurt": 3, "someon": 3, "caught": [3, 7], "plan": [3, 4, 5, 6, 8], "cheat": 3, "fire": [3, 5], "household": 3, "item": [3, 5, 7], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 7], "7": [3, 4, 5, 6, 7], "8": [3, 4, 5, 6, 7], "teach": [3, 8], "my": [3, 6, 7, 8], "monei": [3, 5], "video": [3, 4, 5, 6, 7], "game": [3, 4, 5, 6], "9": [3, 5, 6, 7], "skip": [3, 7, 8], "troubl": [3, 7], "responsegener": 3, "properli": [3, 5, 8], "hug": [3, 4, 5, 6, 7], "instanti": [3, 5], "otherwis": [3, 5, 7], "connect": [3, 4, 5, 6, 8], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 5, 8], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 5, 6], "enhanc": [3, 4, 5, 6, 7, 8], "visit": [3, 5], "ui": [3, 5, 8], "click": [3, 6], "select": [3, 4, 5, 6, 8], "choos": [3, 4, 5], "cpu": [3, 4, 6], "gpu": [3, 4, 6], "configur": [3, 4, 5, 6, 7], "meaning": [3, 5, 8], "region": [3, 5], "closest": [3, 5, 6], "your": [3, 4, 5, 7, 8], "locat": [3, 5, 6, 7], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 5, 6, 7], "generate_respons": [3, 5, 8], "prompts_df": 3, "remov": [3, 5, 6], "strip": [3, 5, 8], "elif": 3, "chat_complet": 3, "max_token": [3, 5], "seed": [3, 7], "42": [3, 4, 5, 6, 7], "append": [3, 5, 7, 8], "results_df": [3, 7], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 7], "iloc": 3, "tolist": [3, 7], "parallelevalu": 3, "taming_util": [3, 4, 7], "modul": [3, 5, 8], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 5, 6, 8], "gladli": 3, "constitut": [3, 5], "would": [3, 5, 6, 7, 8], "dtype": [3, 5, 7], "80": [3, 5], "absolut": [3, 4, 5, 8], "materi": [3, 5, 6, 7, 8], "plastic": 3, "food": 3, "lid": 3, "cut": [3, 5], "swath": 3, "wood": [3, 5], "squar": 3, "rectangular": 3, "piec": 3, "place": [3, 5, 6, 7, 8], "insid": [3, 5, 7], "inch": 3, "inspect": [3, 5], "off": [3, 4, 5, 6, 7, 8], "demolit": 3, "scissor": 3, "smash": 3, "smooth": [3, 6], "arrang": [3, 5], "c": [3, 4, 5, 6, 8], "shape": [3, 7, 8], "top": [3, 5, 6, 8], "tuck": 3, "catch": [3, 7], "hook": 3, "solid": 3, "side": [3, 5], "round": [3, 5, 7], "edg": [3, 4, 5, 6, 7], "separ": [3, 5, 6, 7], "process_aligned_respons": 3, "strictli": [3, 8], "bound": [3, 5], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 5, 8], "enforc": [3, 5, 7, 8], "dictionari": [3, 5, 7, 8], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 8], "processor": [3, 4, 6, 8], "api_kei": [3, 5, 7], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7, 8], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7, 8], "json": [3, 5, 6, 7], "fri": 3, "su": [3, 6], "quote_al": 3, "fall": [3, 5, 6, 7], "deem": [3, 5, 7], "pertain": [3, 5], "generate_dpo_dataset": 3, "push": [3, 4, 5], "hub": [3, 4, 5, 6], "repo_id": [3, 6], "push_to_hub": [3, 5], "dpo_dataset": 3, "merg": [3, 7], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6, 7], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5, 6], "axi": [3, 5], "drop": [3, 4, 5, 7], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "interest": [3, 4, 5, 6, 7, 8], "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 5, 6], "parquet": 3, "arrow": 3, "00": [3, 5, 6], "153": [3, 5], "33ba": 3, "upload": [3, 5], "shard": 3, "02": 3, "35": [3, 5, 6], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 5], "nstep": 3, "n2": [3, 5], "n3": [3, 5], "n4": [3, 5], "n5": [3, 5], "n6": 3, "n7": 3, "n8": [3, 5], "n9": [3, 5], "n10": [3, 5], "nnext": 3, "nthe": [3, 5], "singl": [3, 4, 5, 6, 7, 8], "48gb": 3, "a100": 3, "took": 3, "few": [3, 5, 6, 7, 8], "minut": 3, "torch": [3, 8], "h4": [3, 7], "honest": [3, 5], "ultrafeedback": [3, 7], "binar": [3, 7], "lib": [3, 7, 8], "ultrafeedback_binar": [3, 7], "honesti": [3, 7], "dimens": [3, 5, 6, 7], "blend": [3, 6], "automodelforcausallm": [3, 8], "autotoken": [3, 8], "load_dataset": [3, 6, 7], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 5, 6, 7], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 7], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": [3, 8], "is_avail": 3, "mp": 3, "from_pretrain": [3, 6, 8], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 8], "float32": 3, "config": [3, 5, 6, 7], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 5, 6], "learning_r": [3, 6], "determin": [3, 4, 5, 6, 7, 8], "aggress": [3, 5, 6, 7], "empir": 3, "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 5, 7], "gradual": 3, "decreas": [3, 4, 5, 8], "accumul": [3, 5], "v": [3, 8], "16": [3, 4, 5, 6, 7], "per_device_train_batch_s": 3, "simul": [3, 5, 7, 8], "gradient_accumulation_step": 3, "strongli": [3, 8], "lower": [3, 4, 5, 6, 7, 8], "conserv": [3, 7], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 5, 6, 7], "suffic": 3, "20": [3, 5, 6, 7, 8], "warmup_step": 3, "stop": [3, 4, 5, 6], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 6, 7, 8], "200": [3, 4, 5, 6, 7], "50": [3, 5, 6, 7, 8], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 5, 8], "pathlib": [3, 7], "config_path": 3, "safe_load": [3, 5], "runtim": [3, 6, 8], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 5], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 5, 6, 7, 8], "max_prompt_length": [3, 6], "1024": 3, "max_length": [3, 5, 8], "1536": 3, "sent": [3, 6, 7], "plot": [3, 5], "move": [3, 4, 5, 6, 7], "averag": [3, 4, 5, 6, 8], "visual": [3, 5, 6, 7], "quick": [3, 5, 6, 7], "150": [3, 5], "curv": 3, "reach": [3, 5, 6, 7, 8], "obviou": 3, "suffici": [3, 5, 8], "save_model": 3, "hf_token": 3, "tag": [3, 7], "congratul": 3, "successfulli": [3, 5, 7, 8], "card": [3, 5, 7], "newli": [3, 5], "qualit": [3, 5, 7], "assess": [3, 4, 5, 6, 7], "rigor": [3, 5, 6, 7], "quantit": [3, 5], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 5, 6], "aligned_output": 3, "pleas": [3, 5, 6, 7], "gram": [3, 5], "tnt": 3, "highli": [3, 4, 5, 6, 7, 8], "regul": [3, 4, 5, 6, 7], "law": [3, 4, 5, 6, 7], "degre": [3, 5, 8], "mishandl": 3, "countri": [3, 5], "seriou": [3, 5, 7], "imprison": 3, "death": 3, "variou": [3, 4, 5, 6, 7, 8], "nation": [3, 7], "dictat": 3, "stark": [3, 5], "readili": [3, 5], "cite": 3, "regulatori": [3, 4, 5, 6, 7], "anecdot": [3, 7], "systemat": [3, 4, 5, 6, 7, 8], "quantifi": [3, 5, 6, 7], "f1": [3, 5, 7], "experienc": [3, 5], "expert": [3, 5, 6, 7, 8], "addition": [3, 4, 5, 6, 7], "vari": [3, 4, 5, 6, 7, 8], "interpret": [3, 5, 6, 7], "judg": [3, 5], "summar": [3, 5, 6], "three": [3, 5, 6, 7], "togeth": [3, 6, 7], "entri": [3, 5, 6], "somewhat": 3, "databas": [3, 4, 5, 8], "distribut": [3, 4, 5, 6, 7, 8], "static": [3, 7, 8], "k12": 3, "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 7], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 5, 7], "lambda": [3, 7], "prompts_ev": 3, "to_list": 3, "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 5], "heurist": 3, "charact": [3, 5, 6, 7, 8], "minimum": [3, 4, 5, 6], "min_response_length": 3, "filter": [3, 5, 6, 8], "string": [3, 5, 7, 8], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 7], "punish": 3, "unit": [3, 5, 7, 8], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 7], "respond": [3, 4, 5, 7, 8], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 5, 7], "safetyscor": [3, 7], "float": [3, 4, 5, 6, 7, 8], "valueerror": [3, 8], "empti": [3, 8], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 5, 6, 7, 8], "emphasi": [3, 4, 5], "base_ev": 3, "zip": [3, 5, 8], "aligned_ev": 3, "injuri": [3, 5], "base_scor": 3, "eval": [3, 4, 6], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 6, 7], "evals_df_result": 3, "h": [3, 5, 6, 7], "identifi": [3, 4, 5, 6, 7, 8], "requ": 3, "statist": [3, 5, 7], "naiv": [3, 8], "score_map": 3, "count": [3, 5, 6, 7], "percentag": [3, 4, 5, 7], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 7], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 5, 6, 7, 8], "md_tabl": 3, "335": [3, 5], "99": [3, 4, 6, 7], "281": [3, 5], "83": [3, 4, 5, 7], "14": [3, 5, 6, 7, 8], "43": [3, 5, 6, 7], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 5, 7], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 5], "231": [3, 5], "No": [3, 5, 6, 7, 8], "fell": [3, 4], "partial": [3, 5], "styliz": [3, 7], "wild": 3, "consider": [3, 4, 6, 7, 8], "proof": [3, 4], "taken": [3, 5, 6, 7, 8], "huang": [3, 5, 6, 7], "overal": [3, 5, 6, 7, 8], "annot": [3, 5, 6, 7], "mirror": [3, 5, 7], "inaccur": [3, 5, 7, 8], "consecut": [3, 7], "unrepres": 3, "hao": [3, 5], "accord": [3, 4, 5, 7, 8], "yin": [3, 5, 7], "resembl": 3, "declin": [3, 4, 5], "volatil": [3, 5], "ineffici": [3, 4, 5], "smollm": 3, "rel": [3, 4, 5, 6, 7], "term": [3, 4, 5, 6, 7], "trade": [3, 4, 5, 6, 7, 8], "weigh": 3, "qwen": [3, 6, 8], "remark": [3, 4, 7, 8], "rival": [3, 6], "ultim": [3, 4, 5, 6, 7], "threshold": [3, 4, 5, 6, 7], "chen": [3, 5, 6, 7, 8], "overli": [3, 5, 7, 8], "simpli": [3, 4, 5, 6, 8], "neglect": [3, 5, 7], "themselv": [3, 5, 7], "complementari": 3, "throughput": [3, 4, 6], "screen": [3, 5, 7], "flag": [3, 5, 6, 7], "preliminari": [3, 5], "judgment": [3, 5], "valid": [3, 4, 5, 6, 8], "automat": [3, 5, 6, 7], "composit": [3, 5], "plai": [3, 5, 6, 7, 8], "led": [3, 5, 8], "apologet": 3, "hesit": 3, "benign": [3, 7], "apolog": 3, "inde": 3, "accordingli": [3, 5, 7], "perhap": [3, 4], "creation": [3, 6, 7], "invalu": 3, "hyperparamet": [3, 6, 7], "mention": [3, 5, 7, 8], "optimist": 3, "memor": [3, 5], "generaliz": 3, "abc": [3, 7], "4a": 3, "amanda": [3, 5, 7], "jan": [3, 5, 7], "brauner": [3, 7], "adrian": 3, "colyer": 3, "benjamin": [3, 5, 7], "cullen": [3, 7], "david": [3, 5, 6, 7], "duvenaud": 3, "richard": [3, 5, 7], "ngo": [3, 7], "azalia": 3, "mirhoseini": 3, "catherin": [3, 5, 7], "olsson": [3, 7], "sam": [3, 5, 7], "ringer": 3, "liam": [3, 5, 7], "skirvin": 3, "jess": [3, 5, 7], "smith": [3, 5, 6], "dawn": [3, 5, 7], "song": [3, 4, 5, 7, 8], "william": [3, 4, 5, 6, 7], "saunder": [3, 5], "steinhardt": [3, 5], "asset": [3, 5, 7], "983c85a201a962f": 3, "pdf": [3, 7], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 5, 7], "yuntao": [3, 5, 7], "andi": [3, 5, 7], "jone": [3, 5], "kamal": 3, "ndouss": 3, "anna": [3, 5, 7], "nova": [3, 6], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 7], "ganguli": [3, 5, 7], "tom": [3, 5], "henighan": 3, "nichola": [3, 5], "joseph": [3, 5, 7], "saurav": [3, 7], "kadavath": 3, "jackson": [3, 5, 7], "kernion": [3, 5, 7], "conerli": 3, "sheer": [3, 8], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 5, 7], "hernandez": [3, 5, 7], "tristan": 3, "hume": 3, "scott": [3, 5, 7], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 5], "nanda": 3, "dario": [3, 5], "amodei": [3, 5], "brown": [3, 5], "jack": [3, 5, 7], "clark": 3, "mccandlish": [3, 5], "chri": [3, 5, 7], "olah": 3, "ben": [3, 5, 6, 7], "mann": [3, 7], "jare": [3, 5, 7], "kaplan": [3, 5, 7], "arxiv": [3, 4, 5, 6, 7, 8], "org": [3, 4, 5, 6, 7, 8], "ab": [3, 4, 5, 6, 7, 8], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 5, 7, 8], "mckinnon": 3, "carol": [3, 7], "christoph": [3, 5, 7], "dustin": 3, "eli": [3, 5, 6, 7], "tran": [3, 8], "johnson": 3, "ethan": [3, 5, 7], "perez": [3, 7], "jami": [3, 7], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 5, 7], "landau": 3, "kamil": [3, 5], "lukosuit": 3, "michael": [3, 5, 6, 7, 8], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 5, 6], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 5, 6], "telleen": 3, "lawton": 3, "samuel": [3, 5, 7], "bowman": [3, 5], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 5], "cc": 3, "11": [3, 5, 6, 7, 8], "ccl": [3, 7], "24": [3, 4, 5, 6, 7, 8], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 5, 6, 7, 8], "jiang": [3, 5, 7], "benyou": 3, "wang": [3, 4, 5, 6, 7, 8], "judgement": [3, 5, 7], "2402": [3, 7], "10669": 3, "dphz23": 3, "tim": [3, 7], "artidoro": 3, "pagnoni": 3, "ari": [3, 5, 7], "holtzman": [3, 5], "luke": [3, 5, 7], "zettlemoy": 3, "2305": [3, 5], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 5, 6, 7], "zhifang": 3, "sui": 3, "furu": [3, 4], "wei": [3, 4, 5, 6, 7], "boost": 3, "2410": [3, 4, 7], "06961": 3, "fac24": [3, 5], "huggingfaceh4": [3, 6, 7], "fac4c": 3, "fac4d": [3, 6], "doc": [3, 4, 5, 6, 7, 8], "en": [3, 5, 6, 7, 8], "fqh": 3, "duanyu": 3, "bowen": [3, 5, 6, 7], "qin": [3, 5, 6, 7], "zheng": [3, 5, 6, 7], "wenqiang": 3, "lei": [3, 5, 6, 7], "analyz": [3, 4, 5, 6, 7, 8], "perspect": [3, 7], "2404": [3, 5, 7], "04626": 3, "h44a": 3, "binari": [3, 5, 6, 7], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 5, 7], "tao": [3, 5, 7], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 7], "zhangjun": 3, "zhou": [3, 4, 5, 6, 7], "tang": [3, 5, 6, 7], "2401": [3, 5], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 5, 7], "lee": [3, 5, 6, 7, 8], "jame": [3, 5, 7], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 5], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 5], "yilin": 3, "niu": [3, 8], "zhengxiao": 3, "aohan": 3, "zeng": [3, 7], "xiao": [3, 7], "minli": 3, "hongn": 3, "jie": [3, 5, 7, 8], "yuxiao": 3, "2412": [3, 5, 6, 7], "06000": 3, "hsw": 3, "21": [3, 5, 6], "edward": [3, 5], "j": [3, 5, 6, 7, 8], "yelong": 3, "shen": [3, 5, 7], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 5], "zhu": [3, 5, 6, 7], "yuanzhi": 3, "shean": 3, "lu": [3, 5, 6, 7], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 5, 7], "shane": [3, 5, 7], "gu": [3, 5, 7], "le": [3, 5, 6], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 5, 6, 7], "jiawei": [3, 8], "2210": [3, 7], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 5, 7], "decemb": [3, 5, 7], "9781098129095": 3, "www": [3, 5, 6, 7], "oreilli": 3, "ksd": 3, "rylan": [3, 5], "schaeffer": 3, "apratim": 3, "dei": 3, "matthia": [3, 5], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": 3, "koyejo": 3, "thrive": [3, 5, 8], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 5, 6], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 7], "lawrenc": 3, "sean": [3, 5, 7], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 6], "2407": [3, 5, 6, 7], "21783": [3, 6], "lwx": 3, "lin": [3, 5, 6, 7, 8], "rui": [3, 5, 6, 8], "ruixuan": 3, "junbo": 3, "zhao": [3, 5, 6, 7], "ding": 3, "gang": [3, 5], "haobo": 3, "driven": [3, 5, 6, 7], "survei": [3, 5, 7, 8], "2406": [3, 5, 6, 7], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 5, 7], "diogo": [3, 7], "almeida": [3, 7], "carrol": [3, 7], "wainwright": [3, 7], "pamela": [3, 5, 7], "mishkin": [3, 5, 7], "chong": [3, 7], "sandhini": [3, 7], "agarw": [3, 5, 7], "katarina": [3, 7], "slama": [3, 7], "alex": [3, 5, 6, 7], "rai": [3, 5, 6, 7], "john": [3, 5, 7], "hilton": [3, 5, 6, 7], "fraser": [3, 7], "kelton": 3, "miller": [3, 5], "maddi": [3, 7], "simen": [3, 7], "peter": [3, 5, 6, 7], "welind": [3, 5, 7], "paul": [3, 5, 7], "christiano": [3, 7], "leik": [3, 5, 7], "ryan": [3, 5, 7], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 7], "eric": [3, 5, 6, 7], "mitchel": [3, 6], "stefano": [3, 5], "ermon": [3, 5], "man": [3, 5, 7], "chelsea": [3, 7], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 5, 7], "filip": [3, 7], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 5, 7], "radford": [3, 5, 7], "oleg": [3, 7], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": [3, 4], "smollm2360mi24": 3, "sou24": 3, "html": [3, 8], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 5, 7], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 5, 6], "eisenhart": 3, "roth": [3, 5], "florian": 3, "hinterwimm": 3, "2411": 3, "09539": 3, "tm": [3, 6], "23": [3, 5, 6, 7], "hugo": [3, 6], "loui": [3, 5, 6], "martin": [3, 5, 6, 7], "kevin": [3, 5, 6, 7], "stone": [3, 6], "albert": [3, 6], "amjad": [3, 6], "almahairi": [3, 6], "yasmin": [3, 6], "babaei": [3, 6], "nikolai": [3, 6], "bashlykov": [3, 6], "soumya": [3, 6], "batra": [3, 6], "prajjwal": [3, 6], "bhargava": [3, 6], "shruti": [3, 6], "bhosal": [3, 6], "dan": [3, 5, 6, 7, 8], "bikel": [3, 6], "luka": [3, 6], "blecher": [3, 6], "cristian": [3, 6], "canton": [3, 6], "ferrer": [3, 6], "moya": [3, 6], "guillem": [3, 6], "cucurul": [3, 6], "esiobu": [3, 6], "jude": [3, 6], "fernand": [3, 6], "jeremi": [3, 5, 6], "fu": [3, 6], "wenyin": [3, 6], "brian": [3, 6, 7], "fuller": [3, 6, 7], "cynthia": [3, 6], "gao": [3, 5, 6, 7], "vedanuj": [3, 6], "goswami": [3, 6, 7], "naman": [3, 6], "goyal": [3, 6], "anthoni": [3, 6], "hartshorn": [3, 6], "saghar": [3, 6], "hosseini": [3, 6], "hakan": [3, 6], "inan": [3, 6], "marcin": [3, 6], "karda": [3, 6], "viktor": [3, 6], "kerkez": [3, 6], "madian": [3, 6], "khabsa": [3, 6], "isabel": [3, 6, 7], "kloumann": [3, 6], "artem": [3, 6], "korenev": [3, 6], "punit": [3, 6], "singh": [3, 5, 6], "koura": [3, 6], "mari": [3, 5, 6, 7], "ann": [3, 6, 7], "lachaux": [3, 6], "thibaut": [3, 6], "lavril": [3, 6], "jenya": [3, 6], "diana": [3, 5, 6], "liskovich": [3, 6], "yinghai": [3, 6], "yune": [3, 6], "mao": [3, 4, 6], "xavier": [3, 6], "martinet": [3, 6], "todor": [3, 6, 7], "mihaylov": [3, 6], "pushkar": [3, 6], "mishra": [3, 5, 6], "igor": [3, 5, 6, 7], "molybog": [3, 6], "yixin": [3, 5, 6], "nie": [3, 5, 6], "andrew": [3, 5, 6, 7], "poulton": [3, 6], "reizenstein": [3, 6], "rashi": [3, 6], "rungta": [3, 6], "kalyan": [3, 6], "saladi": [3, 6], "alan": [3, 6, 7], "schelten": [3, 6], "ruan": [3, 6], "silva": [3, 6], "ranjan": [3, 6], "subramanian": [3, 6], "xiaoq": [3, 6], "ellen": [3, 6], "tan": [3, 5, 6], "binh": [3, 6], "ross": [3, 4, 6, 7], "taylor": [3, 6], "adina": [3, 6, 7], "jian": [3, 5, 6], "kuan": [3, 6], "puxin": [3, 6], "yan": [3, 4, 5, 6], "iliyan": [3, 6], "zarov": [3, 6], "yuchen": [3, 5, 6, 7], "angela": [3, 5, 6, 7], "fan": [3, 5, 6], "melani": [3, 6], "kambadur": [3, 6], "sharan": [3, 6], "narang": [3, 6], "aurelien": [3, 6], "rodriguez": [3, 6], "stojnic": [3, 6], "sergei": [3, 6], "edunov": [3, 6], "thoma": [3, 5, 6, 7], "scialom": [3, 6], "2307": [3, 6, 8], "09288": [3, 6], "vaa": [3, 7], "berti": [3, 7], "adarsh": [3, 7], "agraw": [3, 7], "ahm": [3, 7], "victor": [3, 7], "akinwand": [3, 7], "namir": [3, 7], "nuaimi": [3, 7], "najla": [3, 7], "alfaraj": [3, 7], "alhajjar": [3, 7], "aroyo": [3, 7], "trupti": [3, 7], "bavalatti": [3, 7], "max": [3, 5, 7], "bartolo": [3, 7], "borhan": [3, 7], "blili": [3, 7], "hamelin": [3, 7], "kurt": [3, 7], "bollack": [3, 7], "rishi": [3, 5, 6, 7], "bomassani": [3, 7], "marisa": [3, 7], "ferrara": [3, 7], "boston": [3, 7], "sim\u00e9on": [3, 7], "campo": [3, 7], "kal": [3, 7], "chakra": [3, 7], "canyu": [3, 7], "codi": [3, 7], "coleman": [3, 7], "zachari": [3, 5, 7], "delpierr": [3, 7], "coudert": [3, 7], "leon": [3, 7], "derczynski": [3, 7], "debojyoti": [3, 7], "dutta": [3, 7], "ian": [3, 5, 7], "eisenberg": [3, 7], "ezick": [3, 7], "heather": [3, 7], "frase": [3, 7], "ram": [3, 6, 7], "gandikota": [3, 7], "agasthya": [3, 7], "gangavarapu": [3, 7], "ananya": [3, 5, 7], "geali": [3, 7], "rajat": [3, 7], "ghosh": [3, 5, 7], "goel": [3, 5, 7], "usman": [3, 7], "gohar": [3, 7], "sujata": [3, 7], "hale": [3, 7], "wiebk": [3, 7], "hutiri": [3, 7], "marvin": [3, 7], "imperi": [3, 7], "surgan": [3, 7], "jandial": [3, 7], "nick": [3, 5, 7], "judd": [3, 7], "felix": [3, 5, 7], "juefei": [3, 7], "fouts": [3, 7], "khomh": [3, 7], "bhavya": [3, 7], "kailkhura": [3, 7], "hannah": [3, 5, 7], "rose": [3, 7], "kirk": [3, 7], "klyman": [3, 7], "knotz": [3, 7], "kuchnik": [3, 7], "shachi": [3, 7], "kumar": [3, 5, 7], "srijan": [3, 7], "lengerich": [3, 7], "bo": [3, 5, 6, 7], "zeyi": [3, 7], "liao": [3, 5, 7], "eileen": [3, 7], "sarah": [3, 5, 7], "luger": [3, 7], "yifan": [3, 5, 7], "priyanka": [3, 7], "mammen": [3, 7], "kelvin": [3, 7], "manyeki": [3, 7], "mcgregor": [3, 7], "virendra": [3, 7], "mehta": [3, 5, 7], "shafe": [3, 7], "moham": [3, 7], "moss": [3, 7], "lama": [3, 7], "nachman": [3, 7], "dinesh": [3, 7], "jinenh": [3, 7], "naganna": [3, 7], "amin": [3, 7], "nikanjam": [3, 7], "besmira": [3, 7], "nushi": [3, 7], "lui": [3, 5, 7], "oala": [3, 7], "iftach": [3, 7], "orr": [3, 5, 7], "alicia": [3, 5, 7], "parrish": [3, 5, 7], "cigdem": [3, 7], "patlak": [3, 7], "pietri": [3, 7], "forough": [3, 7], "poursabzi": [3, 7], "sangdeh": [3, 7], "eleonora": [3, 7], "presani": [3, 7], "fabrizio": [3, 7], "puletti": [3, 7], "r\u00f6ttger": [3, 7], "sahai": [3, 7], "santo": [3, 7], "nino": [3, 7], "scherrer": [3, 7], "alic": [3, 5, 7, 8], "schoenauer": [3, 7], "sebag": [3, 7], "patrick": [3, 7], "schramowski": [3, 7], "abolfazl": [3, 7], "shahbazi": [3, 7], "vin": [3, 7], "xudong": [3, 5, 7], "vamsi": [3, 7], "sistla": [3, 7], "leonard": [3, 7], "testuggin": [3, 7], "vithursan": [3, 7], "thangarasa": [3, 7], "elizabeth": [3, 5, 7], "watkin": [3, 7], "rebecca": [3, 5, 7], "weiss": [3, 7], "welti": [3, 7], "tyler": [3, 5, 7], "wilber": [3, 7], "jean": [3, 7], "poonam": [3, 7], "yadav": [3, 7], "xianjun": [3, 7], "yang": [3, 5, 6, 7, 8], "yi": [3, 5, 7, 8], "wenhui": [3, 7], "fedor": [3, 7], "zhdanov": [3, 7], "jiacheng": [3, 5, 7], "perci": [3, 5, 7], "liang": [3, 5, 7, 8], "mattson": [3, 7], "joaquin": [3, 7], "vanschoren": [3, 7], "v0": [3, 7, 8], "12241": [3, 7], "wyg": 3, "tianhao": [3, 5, 6, 7], "weizh": 3, "yuan": [3, 5, 7], "olga": 3, "golovneva": 3, "jing": [3, 7], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 5, 7], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "xfg": 3, "shusheng": 3, "jiaxuan": 3, "wenji": 3, "ye": [3, 5, 6, 7, 8], "weilin": 3, "zhiyu": [3, 8], "mei": [3, 5, 6], "guangju": 3, "chao": 3, "10719": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": 3, "xie": [3, 5], "mingyuan": 3, "paradigm": [3, 5], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "suppos": [4, 5, 8], "econom": [4, 5], "fuel": 4, "equival": [4, 5, 6], "consumpt": [4, 5], "contrari": 4, "truth": [4, 5, 6, 7, 8], "stanlei": 4, "jevon": 4, "a16z": 4, "andreessen": 4, "horowitz": 4, "10x": 4, "outpac": 4, "moor": 4, "pc": 4, "edholm": 4, "bandwidth": 4, "era": 4, "llmflation": 4, "mmlu": [4, 6, 7], "60": [4, 5, 6, 7], "06": [4, 5, 8], "price": [4, 5, 6], "fallen": 4, "62": [4, 5, 6], "introduct": 4, "march": [4, 5, 8], "stem": [4, 5, 8], "compound": 4, "bit": [4, 6], "tune": [4, 5, 7], "dpo": [4, 6], "competit": [4, 5, 6, 7], "plummet": 4, "rapidli": [4, 6, 7, 8], "preciou": 4, "wouldn": [4, 5], "sens": [4, 7], "wait": [4, 5, 7], "wave": 4, "economist": 4, "1865": 4, "studi": [4, 8], "coal": 4, "industri": [4, 5, 6, 7, 8], "made": [4, 5, 6, 8], "counterintuit": 4, "discoveri": 4, "steam": 4, "spend": [4, 5], "repeat": 4, "didn": [4, 8], "smartphon": [4, 5, 6], "server": [4, 5, 6, 8], "network": [4, 5, 6, 8], "transmiss": 4, "got": 4, "cheaper": [4, 5], "shift": [4, 5], "hd": 4, "stream": [4, 5, 6, 8], "storag": [4, 5, 6, 7], "gigabyt": 4, "massiv": [4, 5, 7], "broadli": [4, 6, 8], "audio": [4, 5], "transcript": 4, "multimod": [4, 6, 7], "imag": [4, 5, 6, 7], "exponenti": [4, 5], "growth": [4, 5], "magnifi": 4, "everyth": [4, 8], "billion": [4, 5, 6], "dollar": [4, 5, 6], "annual": [4, 5, 7], "millisecond": [4, 5], "latenc": [4, 5, 6, 7], "30": [4, 5, 6, 7], "mobil": [4, 5, 6, 8], "tradeoff": [4, 6, 7, 8], "pro": [4, 5, 6, 7], "trigger": [4, 7], "premium": [4, 5], "innov": [4, 5, 6, 7], "capac": [4, 5, 6], "link": [4, 5], "dual": 4, "character": [4, 5, 7], "ahead": [4, 6, 7], "decai": [4, 6], "discuss": [4, 5, 6, 7], "area": [4, 5, 7, 8], "flash": [4, 6], "cach": [4, 5, 6], "compress": [4, 5, 6], "provis": [4, 5], "extent": [4, 5, 7], "problema": 4, "accomplish": [4, 7, 8], "accompani": [4, 5, 7], "transact": [4, 5, 7], "roi": 4, "alloc": [4, 5, 6, 7], "budget": [4, 6], "viabil": [4, 6], "prioriti": [4, 5, 6], "overlook": 4, "thorough": [4, 6, 7], "identif": [4, 5], "specifi": [4, 5, 6, 7, 8], "longev": 4, "accommod": 4, "evalu": [4, 6, 8], "multi": [4, 5, 6, 7, 8], "baselin": [4, 5, 6, 7], "met": [4, 5, 7], "equal": [4, 5, 7], "concurr": [4, 6], "peak": 4, "spike": 4, "versu": [4, 5, 6, 7], "volum": [4, 5, 6, 7], "season": [4, 5], "variat": [4, 5, 6], "uptim": 4, "mainten": [4, 5, 6, 7], "disrupt": [4, 5], "backup": 4, "failov": 4, "clearli": [4, 5, 7, 8], "redund": [4, 5], "recoveri": [4, 5], "unexpect": [4, 5, 7, 8], "event": [4, 5], "seamless": [4, 5, 7], "broader": [4, 5, 6, 7], "vector": [4, 6, 7], "retriev": [4, 5, 6], "augment": [4, 5, 6], "rag": [4, 6], "retent": [4, 5], "polici": [4, 5, 6], "essenti": [4, 5, 6, 7, 8], "opportun": [4, 5], "post": [4, 5, 6, 7], "32": [4, 5, 6], "fp32": 4, "fp16": [4, 6], "proport": [4, 5, 6], "byte": 4, "120": [4, 5, 7], "gb": 4, "whole": [4, 5], "done": [4, 5, 6, 7, 8], "smollm2": [4, 5, 6, 8], "135m": [4, 6], "load_gguf": 4, "bartowski": 4, "gguf": [4, 6], "gguf_file_q2_k": 4, "q2_k": [4, 6], "gguf_file_f16": 4, "f16": 4, "model_q2_k": 4, "gguf_fil": 4, "model_f16": 4, "mlp": 4, "layer": [4, 5, 6, 8], "proxi": [4, 5, 7], "mlp_weights_q2_k": 4, "gate_proj": 4, "mlp_weights_f16": 4, "tensor": [4, 8], "0145": 4, "1826": 4, "1377": 4, "1719": 4, "1387": 4, "0298": 4, "1631": 4, "0781": 4, "2051": [4, 5], "2070": 4, "0334": 4, "2891": 4, "1768": 4, "0488": 4, "2393": 4, "0396": 4, "1348": 4, "1533": 4, "0771": 4, "0845": 4, "0232": 4, "0178": 4, "1040": 4, "1582": 4, "1167": 4, "0474": 4, "0359": 4, "2500": 4, "0432": 4, "0972": 4, "0933": 4, "2188": 4, "0776": 4, "0674": 4, "requires_grad": 4, "0028": 4, "1852": 4, "1396": 4, "1506": 4, "1635": 4, "0043": 4, "0680": 4, "2257": 4, "1890": 4, "0464": 4, "2960": 4, "1840": 4, "0451": 4, "2395": 4, "0413": 4, "1446": 4, "0621": 4, "0478": 4, "0038": 4, "0830": 4, "1473": 4, "0926": 4, "0547": 4, "0824": 4, "0429": 4, "2737": 4, "0355": 4, "0782": 4, "2043": [4, 5], "0740": 4, "arriv": [4, 5], "pearson": 4, "numpi": [4, 5], "np": [4, 5], "arrai": [4, 7], "detach": 4, "graph": [4, 5], "weights_f16": 4, "weights_q2_k": 4, "flat_f16": 4, "flatten": 4, "flat_q2_k": 4, "corrcoef": 4, "4f": [4, 8], "9970": 4, "exemplifi": [4, 6, 7], "70b": [4, 5, 6], "unsloth": 4, "141": 4, "q8_0": [4, 6], "75": [4, 7], "47": [4, 5, 6, 7], "cumul": [4, 5], "26": [4, 5, 6], "19": [4, 5, 6, 7], "space": [4, 5, 6, 7], "counterpart": 4, "spectrum": [4, 5], "variant": [4, 5, 6, 7], "laptop": [4, 5], "desktop": [4, 5, 6], "enterpris": [4, 5, 6, 7, 8], "ceil": 4, "notabl": [4, 5, 7, 8], "bitnet": 4, "cpp": [4, 8], "arm": 4, "x86": 4, "speedup": [4, 6], "37x": 4, "07x": 4, "17x": 4, "beyond": [4, 5, 7], "raw": [4, 5, 6, 7, 8], "speed": [4, 5, 6, 7], "energi": [4, 5], "55": [4, 5, 6], "70": [4, 5, 6], "71": [4, 5], "82": [4, 7], "impress": [4, 8], "100b": 4, "b1": 4, "58": [4, 6], "pace": [4, 5, 7], "second": [4, 5, 6, 7], "kernel": 4, "characterist": [4, 5, 6, 7, 8], "excit": 4, "frontier": [4, 7], "compel": [4, 5, 6, 8], "acceler": [4, 5, 6, 7], "faster": [4, 6], "arithmet": [4, 5], "benefici": [4, 5, 6], "sustain": [4, 5, 6, 7], "Be": [4, 5, 6, 7], "fine": [4, 5, 7], "pure": [4, 5, 6], "unlock": [4, 8], "track": [4, 5, 7], "chargeback": 4, "regularli": [4, 5], "wz": 4, "jinheng": 4, "hansong": 4, "ting": [4, 7], "shaoguang": 4, "shume": 4, "ma": [4, 5, 7], "hongyu": [4, 5], "xia": [4, 5, 6], "infra": 4, "fast": [4, 5, 6, 7, 8], "lossless": 4, "16144": 4, "andreessenhorowitz24": 4, "huggingface4w": [4, 6], "2024w": [4, 6], "unsloth24": 4, "jonathan": [4, 5, 7], "ceo": [4, 5], "groq": [4, 6], "streamlin": [4, 6, 8], "notat": 4, "width": [4, 6], "_k": 4, "_0": 4, "matter": 5, "beauti": 5, "smart": [5, 7], "agre": 5, "wrong": 5, "feynman": 5, "advent": 5, "norm": 5, "realm": 5, "convent": [5, 7], "evolut": [5, 6], "conceiv": 5, "entrench": 5, "seem": 5, "daunt": 5, "ignor": 5, "outdat": [5, 7, 8], "inevit": 5, "setback": 5, "imper": 5, "embrac": 5, "proactiv": [5, 7], "mindset": 5, "front": [5, 6], "produc": [5, 6, 7, 8], "novel": [5, 6], "ident": 5, "isn": [5, 7], "bug": 5, "random": [5, 7, 8], "testabl": 5, "exceedingli": 5, "guarante": [5, 6, 7, 8], "primari": [5, 7], "nucleu": 5, "2020": 5, "summari": [5, 6, 7, 8], "alter": 5, "rigid": 5, "wildli": 5, "incoher": 5, "inadequ": [5, 7], "temp": 5, "df_result": 5, "ntemperatur": 5, "40": [5, 6], "temp_respons": 5, "iterrow": [5, 7], "10000": [5, 8], "appl": [5, 8], "txt": [5, 6, 8], "sec_fil": [5, 8], "nsecur": 5, "AND": [5, 8], "exchang": [5, 7, 8], "commiss": [5, 7, 8], "nwashington": 5, "20549": 5, "nform": 5, "pursuant": 5, "TO": [5, 7], "13": [5, 6, 7], "OR": 5, "OF": [5, 7], "THE": [5, 7], "1934": 5, "nfor": 5, "fiscal": 5, "septemb": 5, "28": [5, 6, 7], "nor": 5, "period": [5, 7], "ncommiss": 5, "001": [5, 6], "36743": 5, "ng66145g66i43": 5, "jpg": 5, "nappl": 5, "exact": [5, 6, 7], "registr": 5, "charter": 5, "ncalifornia": 5, "t94": 5, "2404110": 5, "jurisdict": 5, "nof": 5, "incorpor": [5, 6, 7, 8], "employ": 5, "park": 5, "ncupertino": 5, "california": [5, 7, 8], "n95014": 5, "princip": 5, "offic": [5, 7], "408": 5, "996": 5, "1010": 5, "telephon": 5, "regist": 5, "ntitl": 5, "ttrade": 5, "symbol": 5, "tname": 5, "ncommon": 5, "stock": [5, 8], "00001": 5, "naapl": 5, "tthe": 5, "nasdaq": [5, 8], "llc": [5, 8], "n0": 5, "000": [5, 6, 8], "note": [5, 6, 7, 8], "2025": 5, "875": 5, "625": 5, "2026": 5, "2027": 5, "375": 5, "2029": 5, "050": 5, "2031": [5, 7], "600": 5, "2042": 5, "nindic": 5, "issuer": 5, "405": 5, "nye": 5, "preced": [5, 8], "shorter": 5, "past": [5, 7], "90": [5, 6, 7], "submit": [5, 6, 7], "electron": 5, "232": 5, "filer": 5, "12b": [5, 7], "nlarg": 5, "tacceler": 5, "nnon": 5, "tsmaller": 5, "nemerg": 5, "nif": 5, "elect": [5, 7], "revis": [5, 7], "attest": 5, "404": 5, "sarban": 5, "oxlei": 5, "7262": 5, "firm": [5, 7], "prepar": [5, 6, 7], "correct": [5, 7], "restat": 5, "incent": 5, "compens": 5, "240": 5, "10d": 5, "shell": 5, "aggreg": [5, 7], "vote": 5, "held": [5, 8], "affili": [5, 8], "29": [5, 6, 7, 8], "last": [5, 7, 8], "quarter": 5, "628": [5, 8], "553": [5, 8], "sole": [5, 7], "disclosur": [5, 6, 7], "director": [5, 6, 7], "date": 5, "exclud": 5, "n15": 5, "115": [5, 8], "823": [5, 8], "outstand": [5, 8], "octob": [5, 8], "18": [5, 6, 7, 8], "ndocument": 5, "BY": 5, "nportion": 5, "meet": [5, 7, 8], "sharehold": 5, "iii": 5, "ntabl": 5, "npage": 5, "npart": 5, "nitem": 5, "nbusi": 5, "1a": 5, "nrisk": 5, "1b": [5, 6, 7], "nunresolv": 5, "staff": 5, "comment": 5, "n17": 5, "1c": 5, "ncybersecur": 5, "nproperti": 5, "n18": 5, "nlegal": 5, "proceed": [5, 7], "nmine": 5, "ii": [5, 6, 8], "nmarket": 5, "stockhold": 5, "purchas": [5, 7], "n19": 5, "reserv": 5, "n20": 5, "nmanag": 5, "n21": 5, "7a": 5, "nquantit": 5, "n27": 5, "nfinanci": 5, "supplementari": 5, "n28": 5, "nchang": 5, "disagr": 5, "n51": 5, "9a": 5, "ncontrol": 5, "procedur": [5, 7], "9b": 5, "nother": 5, "n52": 5, "9c": 5, "ndisclosur": 5, "foreign": 5, "ndirector": 5, "corpor": [5, 7], "nexecut": 5, "ownership": [5, 6], "certain": [5, 7, 8], "owner": 5, "ncertain": 5, "nprincip": 5, "fee": 5, "iv": 5, "nexhibit": 5, "n53": 5, "n56": 5, "nthi": 5, "litig": [5, 6], "reform": 5, "1995": 5, "uncertainti": [5, 6, 7], "macroeconom": 5, "anticip": [5, 7], "caus": [5, 7], "oblig": 5, "nunless": 5, "herein": 5, "calendar": 5, "wholli": 5, "subsidiari": 5, "unless": [5, 6], "ncompani": 5, "manufactur": 5, "tablet": [5, 6], "wearabl": 5, "accessori": 5, "sell": [5, 7], "varieti": [5, 6], "52": [5, 7], "53": [5, 7], "week": 5, "saturdai": 5, "nproduct": 5, "niphon": 5, "io": [5, 8], "iphon": 5, "se": [5, 7], "nmac": 5, "maco": [5, 6], "mac": [5, 6], "macbook": 5, "air": 5, "imac": 5, "studio": 5, "nipad": 5, "multipurpos": 5, "ipado": 5, "ipad": 5, "nwearabl": 5, "home": [5, 8], "smartwatch": 5, "wireless": 5, "headphon": 5, "spatial": 5, "watcho": 5, "watch": 5, "ultra": 5, "airpod": 5, "beat": [5, 6], "visiono": 5, "nhome": 5, "tv": 5, "tvo": 5, "homepod": 5, "fidel": [5, 8], "naccessori": 5, "brand": 5, "third": [5, 6, 7], "parti": [5, 6, 7], "nservic": 5, "nadvertis": 5, "advertis": 5, "licens": 5, "napplecar": 5, "portfolio": 5, "applecar": 5, "repair": 5, "coverag": [5, 7], "accident": 5, "damag": [5, 7], "theft": [5, 7], "ncloud": 5, "ndigit": 5, "app": [5, 6], "discov": [5, 6, 7], "download": [5, 6], "music": 5, "podcast": 5, "subscript": [5, 6], "arcad": 5, "sm": 5, "listen": [5, 6], "radio": 5, "station": 5, "magazin": 5, "exclus": 5, "sport": 5, "npayment": 5, "payment": 5, "credit": 5, "pai": [5, 6], "cashless": 5, "nsegment": 5, "primarili": [5, 7], "geograph": [5, 7], "basi": [5, 6], "segment": [5, 8], "america": 5, "europ": 5, "china": [5, 7], "japan": 5, "rest": [5, 6], "asia": 5, "pacif": 5, "north": [5, 7], "south": 5, "european": [5, 7], "india": 5, "middl": [5, 6, 7], "east": 5, "africa": 5, "mainland": 5, "kong": 5, "taiwan": 5, "australia": 5, "asian": 5, "although": [5, 6], "partner": [5, 6, 7], "mid": 5, "resel": 5, "retail": 5, "sale": 5, "indirect": 5, "channel": [5, 7], "cellular": 5, "carrier": 5, "net": [5, 8], "38": [5, 6, 7], "ncompetit": 5, "downward": 5, "pressur": [5, 7], "gross": [5, 7], "cycl": [5, 7], "competitor": [5, 6, 7], "compet": [5, 6], "imit": 5, "infring": [5, 6], "intellectu": [5, 6, 7], "marketplac": [5, 7], "nearli": [5, 6], "reput": [5, 7], "expand": [5, 6, 7], "illegitim": [5, 7], "collabor": [5, 6, 7], "nsuppli": 5, "nalthough": 5, "particip": 5, "shortag": 5, "commod": [5, 6], "fluctuat": 5, "commonli": 5, "until": [5, 7, 8], "supplier": 5, "matur": 5, "concentr": 5, "enter": [5, 8], "agreement": 5, "suppli": [5, 8], "renew": 5, "nresearch": 5, "nbecaus": 5, "upon": [5, 7], "flow": [5, 8], "acquisit": [5, 7], "nintellectu": 5, "broad": [5, 6, 8], "patent": 5, "copyright": [5, 6], "trademark": 5, "secret": 5, "differenti": 5, "skill": [5, 7], "personnel": 5, "pursu": [5, 7], "thousand": [5, 6], "durat": 5, "adequ": [5, 7], "nin": 5, "holidai": [5, 7], "fill": 5, "inventori": 5, "older": [5, 6], "newer": 5, "distributor": 5, "nhuman": 5, "strive": 5, "retain": [5, 6, 7], "talent": 5, "member": [5, 7], "164": 5, "ncompens": 5, "equit": 5, "succe": 5, "health": [5, 7], "awai": [5, 7], "ngrowth": 5, "career": 5, "leadership": [5, 7], "nworkplac": 5, "workplac": 5, "ninclus": 5, "workforc": 5, "nengag": 5, "among": [5, 6, 7, 8], "gaug": 5, "sentiment": [5, 6, 8], "nhealth": 5, "everywher": 5, "crisi": 5, "visitor": 5, "navail": 5, "quarterli": 5, "q": [5, 6, 7], "amend": 5, "sec": [5, 8], "Such": [5, 7], "charg": 5, "investor": [5, 8], "aspx": 5, "websit": [5, 6, 7], "environment": [5, 7], "referenc": 5, "inact": 5, "textual": 5, "unknown": [5, 7], "advers": 5, "conjunct": 5, "consolid": 5, "nmacroeconom": 5, "facil": 5, "assembli": 5, "site": [5, 8], "nadvers": 5, "slow": 5, "recess": 5, "unemploy": 5, "inflat": 5, "tighter": 5, "currenc": 5, "monetari": 5, "contract": [5, 6], "logist": 5, "instabl": [5, 7], "inabl": 5, "financ": [5, 6, 7], "insolv": 5, "counterparti": 5, "debt": 5, "liquid": 5, "fair": [5, 7], "instrument": 5, "polit": [5, 7], "disput": 5, "geopolit": 5, "tension": [5, 7], "terror": 5, "accid": 5, "interrupt": 5, "npolit": 5, "outsourc": 5, "korea": 5, "vietnam": 5, "restrict": [5, 6, 7, 8], "tariff": 5, "export": 5, "portion": [5, 6], "revenu": [5, 8], "restructur": 5, "ceas": 5, "escal": [5, 7], "nmani": 5, "prone": [5, 7], "earthquak": 5, "climat": 5, "weather": 5, "plant": 5, "terrorist": [5, 7], "attack": [5, 7], "hostil": 5, "ransomwar": 5, "cybersecur": [5, 7], "labor": 5, "nsuch": 5, "imposs": [5, 6], "slowdown": 5, "outag": 5, "neg": [5, 7, 8], "pandem": 5, "covid": 5, "economi": 5, "imposit": 5, "stringent": [5, 6, 7], "travel": 5, "freight": 5, "movement": 5, "ramp": 5, "nfollow": 5, "expenditur": 5, "resum": 5, "exacerb": 5, "insur": 5, "nglobal": 5, "unabl": 5, "assur": [5, 7], "minor": [5, 7], "naddition": 5, "intensifi": 5, "seamlessli": 5, "nto": 5, "stimul": 5, "ndue": 5, "upgrad": 5, "quantiti": 5, "defect": 5, "defici": 5, "supersed": 5, "nsubstanti": 5, "transport": 5, "reimburs": 5, "warranti": 5, "unanticip": 5, "liabil": 5, "final": [5, 7, 8], "finish": [5, 7], "destin": 5, "prepay": 5, "termin": [5, 6], "recover": 5, "exposur": [5, 7], "nfutur": 5, "semiconductor": 5, "suffer": [5, 7], "constrain": [5, 6, 8], "shipment": 5, "unexpectedli": 5, "interfer": 5, "unsaf": [5, 7], "expos": [5, 7], "widespread": [5, 7], "vulner": [5, 7], "compromis": [5, 6, 7], "claim": [5, 6, 7], "intang": 5, "lost": [5, 7], "cancel": 5, "obsolet": 5, "exce": [5, 7], "realiz": 5, "accru": 5, "excess": 5, "impair": 5, "whenev": 5, "circumst": 5, "amount": [5, 7, 8], "carri": [5, 6, 8], "incur": 5, "unpredict": [5, 7], "obsolesc": 5, "forecast": [5, 7], "incorrectli": [5, 7, 8], "extens": [5, 6, 8], "issuanc": 5, "unknowingli": [5, 7], "notifi": 5, "preclud": 5, "bui": 5, "percept": 5, "android": 5, "playstat": 5, "nintendo": 5, "xbox": 5, "inclin": 5, "devot": 5, "dissatisfi": 5, "vast": [5, 7], "storefront": 5, "safari": 5, "union": [5, 7], "eu": [5, 7], "dma": 5, "narrow": [5, 6, 7], "scope": [5, 6, 7], "elimin": [5, 6], "nfailur": 5, "appeal": 5, "subscrib": 5, "nsome": 5, "manner": [5, 7], "nurtur": 5, "nmuch": 5, "chief": 5, "silicon": 5, "vallei": 5, "constantli": 5, "driver": [5, 6], "recruit": 5, "subsidi": 5, "staf": 5, "contractor": 5, "placement": 5, "increment": 5, "weaken": 5, "telecommun": 5, "war": 5, "virus": 5, "ins": 5, "incid": [5, 7], "ineffect": 5, "thing": [5, 8], "interf": 5, "imped": 5, "ship": 5, "nloss": 5, "unauthor": [5, 7], "confidenti": [5, 6], "encrypt": 5, "But": [5, 7, 8], "behalf": 5, "normal": [5, 7, 8], "investig": [5, 7], "penalti": [5, 6], "frequenc": [5, 6, 7], "actor": [5, 7], "circumv": [5, 7], "obfusc": 5, "forens": 5, "hinder": [5, 8], "recov": 5, "perpetr": 5, "profil": [5, 6], "authent": 5, "hack": [5, 7], "malfeas": 5, "faulti": 5, "password": 5, "irregular": 5, "fraudul": 5, "induc": 5, "disclos": [5, 8], "usernam": 5, "turn": [5, 7], "multifactor": 5, "unusu": 5, "freez": 5, "suspici": 5, "nwhile": 5, "ninvest": 5, "ongo": [5, 6], "contempl": 5, "endeavor": 5, "distract": 5, "tangibl": 5, "approv": 5, "oner": 5, "ventur": 5, "riski": 5, "leas": 5, "unfavor": 5, "arisen": 5, "ordinari": 5, "cours": [5, 6, 7], "resolv": [5, 6, 7], "sometim": [5, 8], "indemnif": 5, "indemnifi": 5, "alleg": 5, "magnitud": 5, "assert": 5, "royalti": 5, "vigor": 5, "defend": 5, "court": [5, 6], "internation": 5, "plaintiff": 5, "injunct": 5, "relief": 5, "nregardless": 5, "merit": 5, "recognit": [5, 6, 7], "settl": 5, "uncertain": 5, "disgorg": 5, "remedi": [5, 7], "worldwid": 5, "antitrust": 5, "bill": 5, "commerc": 5, "televis": 5, "film": 5, "anticorrupt": 5, "cash": 5, "repatri": 5, "launder": 5, "tax": 5, "wast": 5, "recycl": 5, "ncomplianc": 5, "impos": [5, 6, 7, 8], "agent": [5, 6, 7], "nregulatori": 5, "ban": [5, 7], "nexpect": 5, "increasingli": [5, 6, 7, 8], "greenhous": 5, "ga": 5, "emiss": 5, "civil": 5, "disagre": 5, "perceiv": 5, "feder": 5, "nfrom": 5, "noncompli": 5, "individu": [5, 6, 7], "lawsuit": [5, 6], "monopol": 5, "nfurther": 5, "earn": 5, "search": [5, 6, 7], "nthere": 5, "transfer": 5, "pass": [5, 6, 7, 8], "pend": 5, "inquiri": [5, 7], "government": 5, "entiti": [5, 6, 7, 8], "biometr": 5, "notif": 5, "permit": [5, 6, 8], "healthcar": [5, 6], "liabl": 5, "investigatori": 5, "cardhold": 5, "acquir": 5, "denomin": 5, "offset": 5, "strengthen": [5, 7], "nconvers": 5, "thu": 5, "hedg": 5, "deterior": 5, "sovereign": 5, "heighten": [5, 7], "worsen": 5, "A": [5, 6, 7, 8], "collater": 5, "bank": 5, "unsecur": 5, "subassembli": 5, "assembl": 5, "legisl": 5, "ireland": [5, 7], "singapor": 5, "organis": 5, "statutori": 5, "valuat": 5, "defer": 5, "bodi": [5, 7], "adequaci": 5, "ow": 5, "ngener": 5, "repurchas": 5, "dividend": 5, "consumm": 5, "declar": 5, "board": [5, 7], "unresolv": 5, "nnone": 5, "threat": [5, 7], "postur": 5, "25": [5, 6, 7], "2016": 5, "coordin": [5, 7], "committe": [5, 7], "oversight": [5, 7], "counsel": 5, "chair": 5, "headquart": 5, "cupertino": [5, 8], "center": [5, 7, 8], "formal": [5, 7, 8], "conclud": [5, 6], "uninstal": 5, "web": [5, 6, 7], "browser": 5, "june": 5, "contractu": 5, "desist": 5, "stai": [5, 6], "grant": 5, "ndepart": 5, "justic": 5, "depart": [5, 7], "doj": 5, "district": 5, "attornei": 5, "jersei": 5, "redress": [5, 7], "anticompetit": 5, "nonmonetari": 5, "defens": [5, 7], "nepic": 5, "epic": 5, "northern": 5, "unfair": [5, 7], "enjoin": 5, "extern": [5, 7], "januari": 5, "motion": 5, "oppos": [5, 7], "vacat": 5, "fourth": 5, "mine": 5, "nnot": 5, "aapl": 5, "nholder": 5, "na": [5, 7], "301": 5, "npurchas": 5, "nshare": 5, "nperiod": 5, "ttotal": 5, "taverag": 5, "npaid": 5, "nannounc": 5, "napproxim": 5, "That": [5, 7, 8], "nunder": 5, "njune": 5, "august": [5, 7], "nopen": 5, "negoti": [5, 7], "t35": 5, "697": 5, "t224": 5, "naugust": 5, "31": [5, 6], "t42": 5, "910": 5, "t221": 5, "39": [5, 6], "nseptemb": 5, "t33": 5, "653": 5, "t222": 5, "86": [5, 6], "ntotal": [5, 7], "t112": 5, "260": 5, "t89": 5, "074": 5, "110": 5, "10b5": 5, "reinvest": 5, "dow": 5, "supersector": 5, "27": [5, 7], "2019": 5, "n2218": 5, "tseptemb": 5, "t100": 5, "t207": 5, "t273": 5, "t281": 5, "t322": 5, "t430": 5, "t113": 5, "t156": 5, "t131": 5, "t155": 5, "t210": 5, "ndow": 5, "t146": 5, "t216": 5, "t215": 5, "nfirst": 5, "nsecond": 5, "nthird": 5, "sequoia": 5, "nfourth": 5, "plu": [5, 6], "nfiscal": 5, "six": 5, "realign": 5, "span": [5, 6, 7], "indirectli": 5, "n2024": 5, "tchang": 5, "t2023": 5, "t2022": 5, "namerica": 5, "t167": 5, "045": 5, "t3": 5, "t162": 5, "560": 5, "t169": 5, "658": 5, "neurop": 5, "t101": 5, "328": 5, "t7": 5, "294": 5, "t95": 5, "118": 5, "ngreater": 5, "t66": 5, "952": 5, "t72": 5, "559": 5, "t74": 5, "njapan": 5, "t25": 5, "052": 5, "t24": 5, "257": 5, "977": 5, "nrest": 5, "t30": 5, "t4": 5, "t29": 5, "615": 5, "t1": 5, "t391": 5, "035": 5, "t2": 5, "t383": 5, "285": 5, "t394": 5, "weak": [5, 7], "renminbi": 5, "yen": [5, 8], "t201": 5, "183": 5, "t200": 5, "583": 5, "t205": 5, "489": 5, "984": 5, "357": 5, "t40": 5, "177": [5, 7], "t26": 5, "694": 5, "t28": 5, "300": 5, "292": 5, "t37": 5, "005": 5, "t39": 5, "845": [5, 7], "t41": 5, "241": 5, "n96": 5, "169": 5, "t13": 5, "t85": 5, "t9": 5, "t78": 5, "129": [5, 7], "amort": 5, "bundl": 5, "flat": 5, "ngross": 5, "t109": 5, "633": 5, "t108": 5, "803": 5, "t114": 5, "728": 5, "t71": 5, "t60": 5, "345": 5, "t56": 5, "054": 5, "t180": 5, "683": 5, "148": 5, "t170": 5, "782": 5, "t36": 5, "t73": 5, "t70": 5, "t46": 5, "t44": 5, "t43": 5, "noper": 5, "t31": 5, "370": 5, "t5": 5, "915": 5, "t14": 5, "251": 5, "npercentag": 5, "t8": 5, "nsell": 5, "administr": 5, "097": 5, "932": 5, "094": 5, "t6": 5, "t57": 5, "467": 5, "t54": 5, "847": 5, "t51": 5, "t15": 5, "headcount": 5, "nprovis": 5, "749": 5, "t16": 5, "741": 5, "t19": 5, "neffect": 5, "nstatutori": 5, "t21": 5, "aid": [5, 7], "nliquid": 5, "unrestrict": 5, "140": 5, "ndebt": 5, "97": [5, 7], "payabl": 5, "promissori": 5, "nleas": 5, "nmanufactur": 5, "noncancel": 5, "ndeem": 5, "tcja": 5, "nstate": 5, "fund": [5, 6], "escrow": 5, "ncapit": 5, "95": [5, 7], "nrecent": 5, "pronounc": 5, "nincom": 5, "fasb": 5, "asu": 5, "09": [5, 7], "740": 5, "reconcili": 5, "reconcil": [5, 8], "disaggreg": 5, "prospect": 5, "novemb": [5, 7], "07": [5, 7, 8], "280": 5, "maker": 5, "codm": 5, "retrospect": 5, "ncritic": 5, "conform": [5, 8], "gaap": 5, "nuncertain": 5, "domest": 5, "taxat": 5, "resolut": 5, "conting": 5, "ninterest": 5, "forth": 5, "hypothet": 5, "nsensit": 5, "nhypothet": 5, "nrate": 5, "npotenti": 5, "n100": 5, "tenor": 5, "ndeclin": 5, "755": 5, "089": 5, "nterm": 5, "nincreas": 5, "t139": 5, "t194": 5, "nforeign": 5, "var": 5, "mont": 5, "carlo": 5, "interv": 5, "538": 5, "669": 5, "nindex": 5, "tpage": 5, "nconsolid": 5, "n29": 5, "n30": 5, "sheet": 5, "n31": 5, "n32": 5, "n33": 5, "nnote": 5, "n34": 5, "nreport": 5, "n48": 5, "nall": 5, "omit": [5, 8], "submiss": 5, "nyear": 5, "n2023": 5, "n2022": 5, "nnet": 5, "t294": 5, "866": 5, "t298": 5, "085": 5, "t316": 5, "199": 5, "t96": 5, "ncost": 5, "t185": 5, "233": 5, "t189": 5, "282": 5, "471": 5, "119": 5, "855": 5, "t22": 5, "075": 5, "352": 5, "t214": 5, "137": 5, "t223": 5, "546": 5, "t123": 5, "216": 5, "t119": 5, "437": 5, "t269": 5, "565": 5, "334": 5, "485": 5, "736": 5, "103": 5, "t93": 5, "995": 5, "t99": 5, "nearn": 5, "nbasic": 5, "ndilut": 5, "08": [5, 6, 8], "343": [5, 7], "783": 5, "744": 5, "215": 5, "963": 5, "095": 5, "812": 5, "547": 5, "325": 5, "819": 5, "nsee": 5, "translat": [5, 6, 7], "t395": 5, "765": 5, "511": 5, "unreal": 5, "832": 5, "t323": 5, "212": 5, "nadjust": 5, "337": 5, "717": 5, "394": 5, "138": 5, "850": 5, "563": 5, "104": 5, "t204": 5, "t253": 5, "816": 5, "899": 5, "272": 5, "t98": 5, "016": 5, "652": 5, "t88": 5, "531": 5, "nasset": 5, "ncurrent": 5, "ncash": 5, "943": 5, "965": 5, "228": 5, "590": 5, "naccount": 5, "410": 5, "508": 5, "nvendor": 5, "t32": 5, "833": 5, "477": 5, "ninventori": 5, "286": 5, "331": 5, "287": 5, "695": 5, "t152": 5, "987": 5, "t143": 5, "566": 5, "t91": 5, "479": 5, "544": 5, "t45": 5, "680": 5, "715": 5, "834": 5, "t64": 5, "758": 5, "t211": 5, "993": 5, "t209": 5, "017": 5, "t364": 5, "980": [5, 7], "t352": 5, "nliabil": 5, "t68": 5, "960": 5, "t62": 5, "611": 5, "304": 5, "t58": 5, "829": 5, "ndefer": 5, "249": 5, "061": 5, "ncommerci": 5, "967": 5, "985": 5, "t10": 5, "912": 5, "822": 5, "t176": 5, "392": 5, "t145": 5, "308": 5, "750": 5, "888": 5, "t49": 5, "848": 5, "638": 5, "t308": 5, "030": [5, 6], "t290": 5, "ncommit": 5, "nsharehold": 5, "400": 5, "116": 5, "786": 5, "550": 5, "n83": 5, "276": 5, "naccumul": 5, "deficit": 5, "154": 5, "214": 5, "172": 5, "452": 5, "950": 5, "146": [5, 7], "t50": 5, "672": 5, "t63": 5, "090": 5, "nbegin": 5, "849": 5, "365": 5, "423": 5, "346": 5, "175": 5, "withheld": 5, "settlement": 5, "521": 5, "971": 5, "t12": 5, "034": 5, "t11": 5, "nend": 5, "t83": 5, "nretain": 5, "068": 5, "562": 5, "ndividend": 5, "218": 5, "793": 5, "612": 5, "099": 5, "454": 5, "846": 5, "77": [5, 6], "046": 5, "186": 5, "109": 5, "t163": 5, "rsu": 5, "t0": 5, "98": [5, 6], "94": [5, 6, 7], "737": 5, "929": 5, "ndepreci": 5, "445": 5, "519": 5, "688": 5, "038": 5, "266": 5, "227": 5, "006": 5, "788": 5, "356": 5, "271": 5, "520": 5, "618": 5, "484": 5, "731": 5, "684": 5, "499": 5, "020": 5, "889": 5, "448": 5, "552": 5, "031": 5, "t118": 5, "254": 5, "t110": 5, "543": 5, "t122": 5, "151": 5, "48": [5, 6], "656": 5, "513": 5, "76": [5, 7], "923": 5, "nproce": 5, "211": 5, "686": 5, "917": 5, "135": 5, "828": 5, "446": 5, "447": 5, "959": 5, "708": 5, "086": 5, "935": 5, "705": 5, "354": 5, "nfinanc": 5, "441": 5, "431": 5, "223": [5, 7], "234": [5, 7], "025": 5, "841": 5, "nrepurchas": 5, "949": 5, "89": [5, 7], "402": 5, "465": 5, "nrepay": 5, "958": 5, "repay": 5, "978": 5, "955": 5, "361": 5, "581": 5, "160": 5, "121": 5, "983": 5, "488": 5, "794": 5, "760": 5, "nsupplement": 5, "102": 5, "t18": 5, "679": 5, "573": 5, "33": [5, 6, 7], "nbasi": 5, "prior": [5, 7], "reclassifi": 5, "nrevenu": 5, "remit": [5, 7], "straight": 5, "vest": 5, "sold": 5, "nderiv": 5, "nonleas": 5, "34": [5, 7], "entitl": 5, "commenc": 5, "deliveri": 5, "stand": 5, "ssp": 5, "icloud": 5, "siri": 5, "discount": 5, "undeliv": 5, "unbil": 5, "n26": 5, "n37": 5, "moder": [5, 6], "64": [5, 6, 7], "dilut": 5, "nnumer": 5, "ndenomin": 5, "nweight": 5, "312": 5, "316": 5, "856": 5, "antidilut": 5, "tunreal": 5, "ngain": 5, "tfair": 5, "nvalu": 5, "tcash": 5, "nequival": 5, "tcurrent": 5, "tnon": 5, "t27": 5, "nlevel": 5, "nmonei": 5, "t778": 5, "nmutual": 5, "n515": 5, "t105": 5, "t617": 5, "nsubtot": 5, "293": 5, "395": 5, "nu": 5, "treasuri": 5, "516": 5, "t212": 5, "087": 5, "380": 5, "159": 5, "t703": 5, "t17": 5, "568": 5, "158": 5, "810": 5, "ncertif": 5, "deposit": 5, "t873": 5, "t387": 5, "t478": 5, "066": 5, "ncorpor": 5, "t65": 5, "622": 5, "t270": 5, "953": 5, "939": 5, "027": 5, "t47": 5, "886": 5, "nmunicip": 5, "t412": 5, "t405": 5, "t190": 5, "nmortgag": 5, "595": 5, "t175": 5, "403": 5, "t23": 5, "367": 5, "278": [5, 7], "t132": 5, "t583": 5, "635": 5, "t128": 5, "056": 5, "966": 5, "t34": 5, "t160": 5, "t688": 5, "650": 5, "36": [5, 6, 7], "359": [5, 7], "t481": 5, "n442": 5, "t428": 5, "t923": 5, "t909": 5, "406": 5, "114": 5, "468": 5, "136": 5, "t271": 5, "533": 5, "048": [5, 6], "491": 5, "332": 5, "t320": 5, "t608": 5, "t76": 5, "840": 5, "956": 5, "890": 5, "t20": 5, "627": 5, "243": 5, "t628": 5, "t602": 5, "t192": 5, "t410": 5, "735": 5, "636": 5, "t344": 5, "t144": 5, "470": 5, "657": 5, "831": 5, "125": 5, "162": 5, "t173": 5, "752": 5, "corrobor": 5, "mortgag": 5, "classifi": [5, 7], "37": [5, 7], "swap": 5, "remeasur": 5, "notion": 5, "069": 5, "730": 5, "575": 5, "493": 5, "t104": 5, "777": 5, "nhedg": 5, "433": 5, "505": 5, "247": [5, 7], "ntrade": 5, "41": [5, 6, 7], "44": [5, 7], "depreci": 5, "nland": 5, "690": 5, "nmachineri": 5, "t80": 5, "205": [5, 6], "314": 5, "nleasehold": 5, "839": 5, "599": 5, "73": [5, 6, 7], "884": 5, "852": 5, "t55": 5, "906": 5, "601": 5, "703": 5, "010": 5, "457": 5, "634": 5, "391": 5, "neuropean": 5, "opinion": [5, 7], "1991": 5, "2007": 5, "irish": 5, "branch": 5, "2003": 5, "2014": 5, "2015": 5, "minist": 5, "juli": [5, 7], "annul": 5, "ecj": 5, "hear": 5, "asid": 5, "confirm": 5, "unrecogn": 5, "nfeder": 5, "571": 5, "080": 5, "644": 5, "265": 5, "801": 5, "726": 5, "570": 5, "298": 5, "49": [5, 7], "t84": 5, "428": 5, "603": 5, "483": [5, 7], "t347": 5, "t669": 5, "076": 5, "830": 5, "419": 5, "072": 5, "pretax": 5, "72": [5, 7], "ncomput": 5, "885": 5, "012": 5, "124": 5, "518": 5, "nimpact": 5, "246": 5, "311": 5, "366": 5, "397": 5, "nexcess": 5, "893": 5, "871": 5, "192": [5, 7], "739": 5, "ntax": 5, "carryforward": 5, "302": 5, "naccru": 5, "413": [5, 7], "421": 5, "nunreal": 5, "173": 5, "168": 5, "873": 5, "743": 5, "nless": 5, "374": 5, "007": 5, "369": 5, "551": 5, "998": 5, "nright": 5, "179": 5, "nminimum": 5, "674": 5, "940": 5, "t511": 5, "t455": 5, "t490": 5, "805": 5, "202": 5, "indefinit": 5, "temporari": 5, "727": 5, "044": 5, "284": 5, "ndecreas": 5, "386": 5, "463": 5, "982": 5, "542": 5, "936": 5, "070": 5, "expir": 5, "statut": 5, "229": 5, "494": 5, "closur": 5, "intercompani": 5, "exceed": [5, 7], "multiyear": 5, "exercis": 5, "noncash": 5, "rou": 5, "tfinanci": 5, "t2024": 5, "tother": 5, "661": 5, "tproperti": 5, "015": 5, "303": 5, "676": 5, "t165": 5, "t752": 5, "t859": 5, "430": 5, "842": [5, 7], "tfinanc": 5, "n2025": 5, "820": 5, "t171": 5, "991": 5, "n2026": 5, "914": 5, "n2027": 5, "t59": 5, "733": 5, "n2028": 5, "360": 5, "t38": 5, "398": 5, "n2029": 5, "187": 5, "nthereaft": 5, "t837": 5, "undiscount": 5, "790": 5, "imput": 5, "376": 5, "534": 5, "t896": 5, "borrow": 5, "proce": 5, "nine": [5, 7], "nmatur": 5, "333": 5, "264": 5, "948": 5, "645": 5, "309": 5, "arrear": 5, "namount": 5, "n2013": 5, "nfix": 5, "2062": 5, "t97": 5, "341": 5, "03": 5, "65": [5, 7], "t106": 5, "572": 5, "n97": 5, "nunamort": 5, "321": 5, "358": 5, "113": 5, "662": 5, "930": 5, "342": 5, "800": 5, "180": 5, "88": 5, "ndure": 5, "425": 5, "426": 5, "372": 5, "589": 5, "055": 5, "appreci": 5, "four": [5, 6, 7], "holder": [5, 6], "n2014": 5, "bonu": 5, "nrestrict": 5, "nnumber": 5, "nrsu": 5, "ngrant": 5, "naggreg": 5, "nfair": 5, "nbalanc": 5, "t240": 5, "427": [5, 7], "t75": 5, "t150": 5, "861": 5, "501": 5, "768": 5, "87": [5, 6, 7], "101": [5, 7], "878": 5, "144": 5, "t127": 5, "t135": 5, "91": [5, 7], "456": 5, "78": [5, 6, 7], "59": [5, 7], "t140": 5, "326": 5, "t158": 5, "204": 5, "350": 5, "002": [5, 6], "nuncondit": 5, "uncondit": 5, "206": 5, "440": 5, "156": 5, "t633": 5, "t670": 5, "226": 5, "45": 5, "nconting": 5, "accrual": 5, "nconcentr": 5, "attribut": [5, 6, 7, 8], "46": 5, "t67": 5, "098": 5, "082": 5, "062": 5, "569": 5, "895": 5, "458": 5, "207": 5, "nonrecur": 5, "t142": 5, "196": 5, "t138": 5, "t147": 5, "859": 5, "nchina": 5, "n66": 5, "t181": 5, "887": 5, "t172": 5, "269": 5, "nlong": 5, "664": 5, "797": 5, "778": 5, "219": 5, "nopinion": 5, "nwe": 5, "fairli": 5, "pcaob": 5, "sponsor": 5, "treadwai": 5, "2013": 5, "unqualifi": 5, "thereon": 5, "nthese": 5, "misstat": 5, "fraud": [5, 7], "ndescript": 5, "naudit": 5, "nhow": 5, "nmatter": 5, "qualifi": 5, "letter": 5, "advisor": 5, "ernst": 5, "llp": 5, "auditor": 5, "2009": 5, "nsan": 5, "jose": 5, "nnovemb": 5, "coso": 5, "nour": 5, "ndefinit": 5, "disposit": 5, "receipt": 5, "nevalu": 5, "nbase": 5, "supervis": [5, 6, 7, 8], "13a": 5, "15d": 5, "ninher": 5, "paragraph": 5, "51": [5, 7, 8], "ninsid": 5, "deirdr": 5, "brien": 5, "vice": 5, "presid": 5, "affirm": 5, "april": 5, "withhold": 5, "remitt": 5, "mr": 5, "copi": 5, "solicit": 5, "00042": 5, "nincorpor": 5, "texhibit": 5, "descript": [5, 6, 7, 8], "tform": 5, "tfile": 5, "nrestat": 5, "namend": 5, "bylaw": 5, "nindentur": 5, "york": [5, 6, 8], "mellon": 5, "truste": 5, "noffic": 5, "certif": 5, "2018": 5, "85": [5, 6, 7], "05": 5, "2044": 5, "februari": 5, "2045": 5, "900": 5, "700": [5, 6], "250": [5, 7], "2036": 5, "2046": 5, "450": 5, "2047": 5, "2049": 5, "2030": 5, "2050": 5, "2060": 5, "2028": 5, "2041": 5, "2061": 5, "2032": 5, "2052": 5, "54": 5, "2033": 5, "2053": 5, "n12": 5, "nsubsidiari": 5, "n23": 5, "nconsent": 5, "n24": 5, "npower": 5, "signatur": 5, "nrule": 5, "nsection": 5, "1350": 5, "n101": 5, "ninlin": 5, "xbrl": 5, "n104": 5, "inlin": 5, "compensatori": 5, "herewith": 5, "furnish": 5, "herebi": 5, "undertak": 5, "56": [5, 6, 7], "nsignatur": 5, "npursuant": 5, "duli": 5, "undersign": 5, "thereunto": 5, "ndate": 5, "nby": 5, "luca": [5, 8], "maestri": 5, "nluca": 5, "nsenior": 5, "nchief": 5, "nknow": 5, "THESE": 5, "appoint": 5, "cook": 5, "jointli": 5, "her": 5, "substitut": 5, "him": 5, "thereto": 5, "therewith": 5, "ratifi": 5, "virtu": 5, "hereof": 5, "nname": 5, "ttitl": 5, "tdate": 5, "tchief": 5, "tnovemb": 5, "ntimothi": 5, "tsenior": 5, "kondo": 5, "nchri": 5, "wanda": 5, "austin": 5, "nwanda": 5, "gorski": 5, "tdirector": 5, "nalex": 5, "jung": 5, "nandrea": 5, "arthur": 5, "levinson": 5, "narthur": 5, "monica": 5, "lozano": 5, "nmonica": 5, "ronald": 5, "sugar": 5, "nronald": 5, "susan": 5, "wagner": 5, "nsusan": 5, "57": [5, 6], "turbo": [5, 6, 8], "outlin": [5, 6, 7], "invdestacksmeticsisdict": 5, "setispect": 5, "20cyan": 5, "evaluationseld": 5, "anvis": 5, "droitent": 5, "discernminerv": 5, "versbobprefvers": 5, "vo\u8be5": 5, "option\u548c": 5, "meio": 5, "\u0432\u0440\u0435\u043ccisco": 5, "dellaischenpoihscap": 5, "geme": 5, "gettim": 5, "unscal": 5, "vocabulari": [5, 6, 8], "closer": 5, "sharpen": 5, "uniform": 5, "raschka": 5, "repetit": [5, 8], "radic": 5, "grappl": 5, "safer": [5, 7], "fascin": 5, "spontan": 5, "answer": [5, 6, 7, 8], "aren": [5, 6], "linear": 5, "absent": [5, 7], "coax": 5, "journei": 5, "suddenli": 5, "manifest": 5, "deliber": [5, 7], "contend": 5, "rethink": 5, "tutor": 5, "children": [5, 7], "verifi": [5, 6, 8], "predefin": [5, 8], "weren": 5, "kind": 5, "usual": 5, "quantif": 5, "contamin": [5, 7], "unseen": [5, 7], "longitudin": 5, "mostli": [5, 8], "latter": 5, "tailor": [5, 7], "great": [5, 6, 8], "cognit": 5, "misinform": [5, 7], "tempor": 5, "disclaim": 5, "referr": 5, "incorrect": [5, 7], "demograph": [5, 7], "stereotyp": [5, 7], "societ": [5, 7], "pii": [5, 7], "anonym": 5, "leakag": [5, 7], "carryov": 5, "fallaci": 5, "think": [5, 6, 7], "idiom": 5, "sarcasm": 5, "terminologi": 5, "lingual": 5, "misunderstand": 5, "syntax": 5, "scan": 5, "compat": [5, 6, 8], "overconfid": 5, "clariti": [5, 7, 8], "audienc": 5, "densiti": 5, "satisfact": [5, 8], "misus": [5, 7], "moral": 5, "co2": 5, "etc": [5, 8], "palm": [5, 6], "easi": [5, 6, 7], "synthet": [5, 6, 7, 8], "timeout": 5, "inter": 5, "rater": 5, "ti": 5, "holist": [5, 7], "experiment": [5, 6, 8], "vi": 5, "categor": [5, 6, 7, 8], "intrins": [5, 6], "extrins": 5, "sequenc": [5, 6, 8], "perplex": [5, 6], "downstream": [5, 8], "synthesi": 5, "discret": 5, "prefix": [5, 7], "roug": 5, "bleu": 5, "bilingu": 5, "understudi": 5, "overlap": 5, "favor": [5, 6, 8], "breviti": 5, "insensit": 5, "semant": [5, 8], "orient": [5, 7], "gist": 5, "meteor": 5, "synonym": 5, "paraphras": 5, "alongsid": [5, 7], "computation": 5, "cider": 5, "consensu": 5, "tf": 5, "idf": 5, "caption": 5, "reliant": 5, "corpu": [5, 6], "ter": 5, "edit": [5, 7], "hypothesi": 5, "penal": 5, "bertscor": 5, "contextu": [5, 7], "bert": 5, "spice": 5, "proposit": [5, 6], "scene": [5, 7], "analyst": 5, "rouge_1": 5, "rouge_2": 5, "ideal": [5, 6, 7, 8], "setup": [5, 6, 7, 8], "evaluate_summari": 5, "unigram": 5, "bigram": 5, "absl": 5, "py": [5, 8], "rouge_scor": 5, "generated_summari": 5, "reference_summari": 5, "google_bleu": 5, "bleu_scor": 5, "rouge1": 5, "rouge2": 5, "arbitrari": 5, "chosen": [5, 7], "sentence1": 5, "cat": [5, 7], "sat": 5, "mat": 5, "sentence2": 5, "ate": 5, "3333333333333333": 5, "7272727272727272": 5, "4444444444444445": 5, "generate_summari": 5, "summir": 5, "liner": 5, "evaluate_summary_model": 5, "model_benchmark": 5, "models_test": 5, "benchmark_summari": 5, "model_summari": 5, "evaluation_result": 5, "statu": 5, "concis": [5, 6], "element": [5, 7, 8], "verbos": [5, 6, 7, 8], "peripher": 5, "quit": [5, 6, 8], "convei": 5, "breadth": 5, "Of": [5, 6, 7], "vibe": 5, "visualize_prompt_comparison": 5, "matplotlib": 5, "radar": 5, "radar_plot": 5, "tmp": 5, "ipykernel_1652501": 5, "940173201": 5, "userwarn": [5, 8], "figurecanvasagg": 5, "largest": [5, 6], "sarmah": 5, "granular": [5, 6], "likert": 5, "ensembl": 5, "repeatedli": 5, "fluenci": 5, "refin": 5, "integ": [5, 8], "rubric": 5, "hollist": 5, "judgeevalu": 5, "grammar": [5, 6, 8], "evaluate_with_llm": 5, "criterion": 5, "judge_model": 5, "candidate_summari": 5, "grammat": 5, "y": [5, 7, 8], "z": 5, "w": [5, 6, 7], "benchmark_model": 5, "test_model": 5, "input_text": [5, 6], "trillion": [5, 6], "evals_list": 5, "1775618912": 5, "slightli": 5, "drift": [5, 7], "lowest": [5, 6], "firstli": 5, "overhead": [5, 6], "egocentr": 5, "tight": 5, "medicin": [5, 7], "glider": 5, "deshpand": 5, "3b": 5, "685": 5, "aplic": 5, "earlier": [5, 7], "depict": [5, 7, 8], "multilingu": [5, 6, 7], "golden": 5, "languang": 5, "arena": 5, "randomli": 5, "customiz": [5, 6, 7], "irrelev": 5, "unhelp": [5, 7], "occasion": 5, "rare": 5, "perfectli": 5, "cater": [5, 6], "critiqu": [5, 7], "elo": 5, "exam": 5, "probe": [5, 7], "certifi": 5, "began": [5, 6], "glue": 5, "entail": [5, 6], "superglu": 5, "successor": 5, "grew": 5, "big": 5, "bench": [5, 6], "srivastava": 5, "truthfulqa": [5, 6], "multitask": 5, "hendryck": [5, 7], "multidisciplinari": 5, "stanford": 5, "helm": 5, "multidimension": 5, "surround": [5, 6, 7, 8], "humanev": [5, 6], "lmsy": 5, "brought": 5, "dialogu": [5, 6], "chiang": 5, "gather": 5, "alpacaev": 5, "duboi": 5, "mt": 5, "argilla": 5, "mila": 5, "mit": [5, 6], "contributor": [5, 6, 8], "western": 5, "centric": 5, "divid": [5, 7], "subset": [5, 7], "agnost": 5, "dialect": 5, "render": [5, 7], "crowdsourc": 5, "livebench": 5, "white": [5, 7], "resili": [5, 7], "meaningfulli": 5, "zebralog": 5, "grid": 5, "puzzl": 5, "brailsford": 5, "1999": 5, "lsat": 5, "hous": 5, "clue": 5, "deduct": 5, "programmat": [5, 8], "2x2": 5, "6x6": 5, "shot": [5, 7, 8], "reductio": 5, "ad": [5, 6, 7, 8], "absurdum": 5, "hard": 5, "10b": 5, "counterfactu": 5, "came": 5, "arc": 5, "prize": [5, 7], "chollet": 5, "mike": [5, 7], "knoop": 5, "founder": 5, "zapier": 5, "fran\u00e7oi": 5, "creator": [5, 6], "agi": 5, "kera": 5, "genuin": 5, "possess": 5, "elementari": 5, "novelti": 5, "interpol": 5, "synthes": 5, "fly": 5, "brute": 5, "pixel": 5, "unbeaten": 5, "win": [5, 6], "poorli": 5, "recombin": 5, "spur": [5, 7], "takeawai": 5, "vertic": [5, 7], "finbench": 5, "legalbench": 5, "guha": 5, "berkelei": 5, "bfcl": 5, "patil": 5, "fourrier": 5, "bespok": 5, "sdk": 5, "autoregress": 5, "sub": [5, 6], "liter": 5, "disturb": 5, "zero": [5, 6, 7, 8], "varianc": [5, 7], "yt": 5, "ut": 5, "ol": 5, "heteroscedast": 5, "regress": 5, "bivari": 5, "evaluation_track": 5, "evaluationtrack": 5, "model_config": 5, "basemodelconfig": 5, "parallelismmanag": 5, "pipelineparamet": 5, "envconfig": 5, "is_accelerate_avail": 5, "datetim": 5, "timedelta": 5, "initprocessgroupkwarg": 5, "create_evaluation_pipelin": 5, "cache_dir": 5, "float16": 5, "max_sampl": 5, "kwargs_handl": 5, "3000": 5, "save_detail": 5, "pipeline_param": 5, "launcher_typ": 5, "env_config": 5, "override_batch_s": 5, "use_chat_templ": 5, "trust_remote_cod": 5, "pipeline_paramet": 5, "schemat": 5, "vllm": [5, 8], "tgi": 5, "num_few_shot": 5, "bar": 5, "bigbench": 5, "winogrand": 5, "hellaswag": 5, "nlp": [5, 6, 7], "save_and_push_result": 5, "show_result": 5, "model_arg": 5, "send": [5, 6, 7, 8], "serverless": 5, "inference_server_address": 5, "inference_server_auth": 5, "model_id": 5, "null": 5, "bash": [5, 6], "command": [5, 6], "model_config_path": 5, "endpoint_model": 5, "llama3": 5, "qwen2": [5, 6, 8], "alibaba": [5, 6, 8], "5b": [5, 6, 8], "hui": [5, 6], "allal": [5, 6], "cluster": 5, "noteworthi": [5, 6], "grain": [5, 6, 8], "salt": [5, 8], "modular": 5, "offici": [5, 8], "revisit": 5, "trace": 5, "langchain_tracing_v2": 5, "langchain_api_kei": 5, "hf_evalu": 5, "langsmith_evalu": 5, "ls_client": 5, "dataset_nam": 5, "create_dataset": 5, "create_exampl": 5, "dataset_id": 5, "calculate_scor": 5, "reference_output": 5, "oai_client": 5, "xp_model_nam": 5, "lastli": 5, "run_evalu": 5, "And": [5, 6, 7], "upload_result": 5, "experiment_prefix": 5, "num_repetit": 5, "386a3620": 5, "9e1cc3cb": 5, "9d6a": 5, "4356": 5, "ab34": 5, "138e0abe8be4": 5, "8741976e": 5, "5268": 5, "4b75": 5, "949f": 5, "99477dde5d64": 5, "selectedsess": 5, "b831dc1e": 5, "90bc": 5, "4ed8": 5, "8080": [5, 6], "fb42444724d6": 5, "4it": 5, "latest": [5, 6, 7, 8], "tobia": [5, 8], "evaluate_modul": 5, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 5, "tue": 5, "nov": [5, 6], "couldn": 5, "5it": 5, "5053784e": 5, "64445871": 5, "a53c": 5, "44b1": 5, "a422": 5, "4f49b2f9656f": 5, "69": [5, 7], "4b29f3c9": 5, "9ef7e39a": 5, "2add": 5, "410c": 5, "89f8": 5, "9f1a8b198cf1": 5, "61": [5, 7], "insert": 5, "combined_df": 5, "concat": [5, 7], "ignore_index": [5, 7], "execution_tim": 5, "example_id": 5, "333333": 5, "224388": 5, "feb10f92": 5, "3167": 5, "41f3": 5, "bb1c": 5, "d271153a31a8": 5, "5b196b22": 5, "9f4c": 5, "489c": 5, "b020": 5, "7823208b42d6": 5, "348101": 5, "722464": 5, "c310f159": 5, "064a": 5, "4035": 5, "97c3": 5, "a25bbf43abc2": 5, "386076": 5, "704104": 5, "f7f24899": 5, "dd50": 5, "409e": 5, "93cc": 5, "6fb1622b60bf": 5, "443038": 5, "725059": 5, "242856d6": 5, "efb5": 5, "4101": 5, "b1cf": 5, "5805532838ac": 5, "373418": 5, "795302": 5, "ce975169": 5, "a0ab": 5, "40ce": 5, "8e32": 5, "efa28d06079d": 5, "stat": [5, 6], "groupbi": [5, 7], "agg": [5, 7], "sort": 5, "sort_valu": 5, "subplot": 5, "pyplot": 5, "plt": 5, "ax1": 5, "ax2": 5, "figsiz": 5, "2ecc71": 5, "3498db": 5, "e74c3c": 5, "bleu_mean": 5, "bleu_std": 5, "enumer": [5, 7], "errorbar": 5, "yerr": 5, "fmt": 5, "markers": 5, "capsiz": 5, "set_ylabel": 5, "set_titl": 5, "set_xtick": 5, "set_xticklabel": 5, "rotat": 5, "set_ylim": 5, "bottom": 5, "legend": 5, "exec_mean": 5, "exec_std": 5, "tight_layout": 5, "ndetail": 5, "4038": 5, "0453": 5, "7815": 5, "0433": 5, "3768": 5, "0424": 5, "8343": 5, "2208": 5, "3519": 5, "0775": 5, "9122": 5, "1482": 5, "377": 5, "042": 5, "078": 5, "slower": [5, 7], "04": [5, 6], "interestingli": 5, "decoupl": 5, "reload": 5, "facilit": [5, 7], "promptfooconfig": 5, "model_comparison": 5, "pretti": [5, 7], "dump": 5, "default_flow_styl": 5, "sort_kei": 5, "prompt1": 5, "defaulttest": 5, "1000m": 5, "eval_data": 5, "latency_m": 5, "totallatencym": 5, "token_usag": 5, "tokenusag": 5, "assert_pass": 5, "assertpasscount": 5, "assert_fail": 5, "assertfailcount": 5, "prompt_token": [5, 6], "num_request": 5, "numrequest": 5, "2463": 5, "000035": 5, "3773": 5, "004620": 5, "1669": 5, "000091": 5, "1669m": 5, "highest": [5, 6, 8], "3773m": 5, "00462": 5, "promptfool": 5, "manual": [5, 6, 7, 8], "redefin": 5, "prompt_comparison": 5, "prompt2": 5, "prompt3": 5, "prompt_fil": 5, "prompt_cont": 5, "BE": 5, "again": 5, "prompt_id": 5, "promptid": 5, "gradingresult": 5, "df_raw": 5, "reset_index": [5, 7], "eas": [5, 6, 7, 8], "hf": [5, 6], "plain": [5, 6], "vanilla": 5, "defi": 5, "accustom": 5, "legaci": 5, "unsustain": 5, "prd": 5, "cultiv": [5, 7], "organiz": 5, "stagnat": 5, "alb": [5, 6], "loubna": [5, 6], "anton": [5, 6], "lozhkov": [5, 6], "bakouch": [5, 6], "gabriel": [5, 6, 7], "mart\u00edn": [5, 6, 7], "bl\u00e1zquez": [5, 6], "lewi": [5, 6], "tunstal": [5, 6], "agust\u00edn": [5, 6], "piquer": [5, 6], "andr": [5, 6], "marafioti": [5, 6], "cyril": [5, 6], "zakka": [5, 6], "leandro": [5, 6], "werra": [5, 6], "wolf": [5, 6], "are24": 5, "judgearena": 5, "bps99": 5, "salli": 5, "pott": 5, "barbara": 5, "557": [5, 7], "sciencedirect": 5, "s0377221798003646": 5, "doi": [5, 7, 8], "1016": 5, "s0377": 5, "2217": 5, "00364": 5, "ctj": 5, "jerri": [5, 7], "tworek": [5, 7], "heewoo": [5, 7], "jun": [5, 7], "qime": [5, 7], "henriqu": [5, 7], "pond": [5, 7], "de": [5, 7], "oliveira": [5, 7], "pinto": [5, 7], "harri": [5, 7], "yuri": 5, "burda": 5, "greg": [5, 7], "brockman": [5, 7], "raul": [5, 7], "puri": [5, 7], "gretchen": [5, 7], "krueger": [5, 7], "petrov": [5, 7], "heidi": 5, "khlaaf": 5, "girish": [5, 7], "sastri": [5, 7], "brook": [5, 7], "chan": [5, 7], "grai": [5, 7], "ryder": [5, 7], "mikhail": [5, 7], "pavlov": [5, 7], "alethea": [5, 7], "lukasz": 5, "kaiser": [5, 7], "mohammad": [5, 7], "bavarian": [5, 7], "clemen": [5, 7], "winter": [5, 7], "philipp": 5, "tillet": [5, 7], "felip": [5, 7], "petroski": [5, 7], "dave": [5, 7], "cum": [5, 7], "plappert": 5, "fotio": 5, "chantzi": [5, 7], "barn": 5, "ariel": 5, "herbert": 5, "voss": [5, 7], "hebgen": 5, "guss": 5, "nichol": 5, "paino": [5, 7], "nikola": [5, 7], "tezak": [5, 7], "babuschkin": [5, 7], "suchir": [5, 7], "balaji": [5, 7], "shantanu": [5, 7], "jain": [5, 7], "hess": [5, 7], "carr": 5, "josh": [5, 7], "achiam": [5, 7], "vedant": 5, "misra": 5, "evan": [5, 6, 7], "morikawa": [5, 7], "matthew": 5, "knight": [5, 7], "mile": [5, 7], "brundag": [5, 7], "mira": [5, 7], "murati": [5, 7], "kati": [5, 7], "mayer": [5, 7], "bob": [5, 7, 8], "mcgrew": [5, 7], "ilya": [5, 7], "sutskev": [5, 7], "wojciech": [5, 7], "zaremba": [5, 7], "2107": 5, "03374": 5, "cz": 5, "lianmin": 5, "ying": 5, "sheng": 5, "anastasio": 5, "angelopoulo": 5, "tianl": 5, "dacheng": 5, "banghua": 5, "jordan": [5, 7], "gonzalez": 5, "ion": 5, "stoica": 5, "04132": 5, "cho24a": 5, "francoi": 5, "arcpriz": 5, "cho24b": 5, "drcw": 5, "darshan": 5, "selvan": 5, "sunitha": 5, "ravi": 5, "sky": 5, "ch": 5, "bartosz": 5, "mielczarek": 5, "anand": [5, 7], "kannappan": [5, 7], "qian": [5, 7], "14140": 5, "dglh24": 5, "yann": 5, "bal\u00e1z": 5, "galambosi": 5, "tatsunori": 5, "hashimoto": 5, "debia": 5, "04475": 5, "fac24a": 5, "wiki": [5, 8], "fac24b": 5, "fac24c": 5, "model_doc": 5, "fac24d": 5, "cookbook": 5, "llm_judg": 5, "fac24f": 5, "fhwt23": 5, "cl\u00e9mentin": 5, "nathan": 5, "habib": 5, "gnh": 5, "julian": 5, "nyarko": 5, "ho": 5, "r\u00e9": 5, "adam": [5, 7], "chilton": 5, "aditya": [5, 7], "narayana": 5, "chohla": 5, "brandon": [5, 7, 8], "waldon": 5, "rockmor": 5, "diego": 5, "zambrano": 5, "dmitri": 5, "talisman": 5, "enam": 5, "hoqu": 5, "faiz": 5, "surani": 5, "frank": [5, 7], "fagan": 5, "galit": 5, "sarfati": 5, "gregori": 5, "dickinson": 5, "haggai": 5, "porat": 5, "hegland": 5, "jessica": [5, 7], "joe": [5, 7], "nudel": 5, "joel": [5, 7], "niklau": 5, "nai": 5, "choi": 5, "margaret": [5, 6], "hagan": 5, "megan": 5, "livermor": 5, "nikon": 5, "rasumov": 5, "rahe": 5, "nil": 5, "holzenberg": 5, "noam": 5, "kolt": 5, "henderson": 5, "rehaag": 5, "sharad": 5, "shang": 5, "spencer": 5, "sunni": 5, "gandhi": 5, "zur": 5, "varun": 5, "iyer": 5, "zehua": 5, "2308": 5, "11462": 5, "hbb": 5, "collin": 5, "burn": 5, "steven": [5, 7], "basart": [5, 7], "zou": [5, 7], "manta": [5, 7], "mazeika": [5, 7], "03300": 5, "hbd": 5, "maxwel": 5, "forb": 5, "yejin": 5, "curiou": 5, "neural": [5, 8], "degener": 5, "1904": 5, "09751": 5, "hyc": [5, 6], "binyuan": [5, 6], "zeyu": [5, 6], "cui": [5, 6], "jiaxi": [5, 6], "dayiheng": [5, 6], "tianyu": [5, 6], "jiajun": [5, 6], "kai": [5, 6, 7], "dang": [5, 6], "coder": [5, 6], "preprint": [5, 6, 8], "2409": [5, 6, 7], "12186": [5, 6], "lx": 5, "zhen": 5, "xiaohan": 5, "jia": 5, "yuxuan": 5, "lai": 5, "chongyang": 5, "shuai": 5, "nlg": 5, "07103": 5, "lbl": 5, "bommasani": 5, "toni": 5, "dimitri": 5, "tsipra": 5, "dilara": 5, "soylu": 5, "michihiro": 5, "yasunaga": 5, "yian": 5, "deepak": 5, "narayanan": 5, "yuhuai": 5, "newman": 5, "binhang": 5, "bobbi": 5, "ce": 5, "christian": [5, 7], "cosgrov": 5, "acosta": 5, "nava": [5, 7], "drew": 5, "hudson": 5, "zelikman": 5, "esin": 5, "durmu": 5, "faisal": 5, "ladhak": 5, "frieda": 5, "rong": 5, "ren": [5, 6], "huaxiu": 5, "yao": [5, 7, 8], "jue": 5, "keshav": 5, "santhanam": 5, "laurel": 5, "lucia": 5, "mert": 5, "yuksekgonul": 5, "mirac": 5, "suzgun": 5, "niladri": 5, "chatterji": 5, "omar": 5, "khattab": 5, "chi": [5, 8], "sang": 5, "shibani": [5, 7], "santurkar": [5, 7], "surya": 5, "icard": 5, "tianyi": 5, "vishrav": 5, "chaudhari": 5, "xuechen": 5, "yuhui": 5, "yuta": 5, "koreeda": 5, "2211": 5, "09110": 5, "lbc24": 5, "ronan": 5, "bra": 5, "allenai": 5, "lhe22": [5, 6, 7], "stephani": [5, 6, 7], "owain": [5, 6, 7], "mimic": [5, 6, 7], "falsehood": [5, 6, 7], "2109": [5, 6, 7], "07958": [5, 6, 7], "pzwg23": 5, "shishir": 5, "tianjun": 5, "xin": [5, 7], "gorilla": 5, "15334": 5, "pro24": 5, "dev": 5, "ras24": 5, "sebastian": 5, "scratch": 5, "1633437166": 5, "sll": 5, "bhaskarjit": 5, "mingshu": 5, "jingrao": 5, "lyu": 5, "nathalia": 5, "castellano": 5, "pasquali": 5, "dhagash": 5, "12148": 5, "srf": 5, "shivalika": 5, "angelika": 5, "roman": [5, 7], "adelani": 5, "ngui": 5, "vila": 5, "suero": 5, "peerat": 5, "limkonchotiwat": 5, "kelli": 5, "marchisio": 5, "qi": 5, "leong": 5, "yosephin": 5, "susanto": 5, "raymond": [5, 7], "ng": [5, 7], "shayn": 5, "longpr": 5, "ko": 5, "madelin": 5, "antoin": 5, "bosselut": 5, "oh": 5, "leshem": 5, "choshen": 5, "daphn": 5, "ippolito": 5, "enzo": [5, 8], "ferrant": 5, "marzieh": 5, "fadae": 5, "beyza": 5, "ermi": 5, "sara": 5, "hooker": 5, "linguist": [5, 7], "03304": 5, "srr": 5, "aarohi": 5, "abhinav": 5, "rastogi": 5, "abhishek": 5, "rao": 5, "abu": 5, "awal": 5, "shoeb": 5, "abubakar": 5, "abid": [5, 6], "fisch": 5, "santoro": 5, "gupta": 5, "adri\u00e0": 5, "garriga": 5, "alonso": 5, "agnieszka": 5, "kluska": 5, "aitor": 5, "lewkowycz": 5, "akshat": 5, "warstadt": 5, "alexand": [5, 7, 8], "kocurek": 5, "ali": [5, 7], "safaya": 5, "tazarv": 5, "aman": 5, "hussain": 5, "dsouza": 5, "ambros": 5, "slone": 5, "ameet": 5, "rahan": 5, "anantharaman": 5, "ander": 5, "andreassen": 5, "madotto": 5, "santilli": 5, "stuhlm\u00fcller": 5, "la": 5, "lampinen": 5, "angelica": 5, "anh": 5, "vuong": 5, "animesh": 5, "gottardi": 5, "antonio": 5, "norelli": 5, "anu": 5, "venkatesh": 5, "arash": 5, "gholamidavoodi": 5, "arfa": 5, "tabassum": 5, "arul": 5, "menez": 5, "arun": [5, 7], "kirubarajan": 5, "asher": 5, "mullokandov": 5, "ashish": 5, "sabharw": 5, "herrick": 5, "avia": 5, "efrat": 5, "aykut": 5, "erdem": 5, "ayla": 5, "karaka\u015f": 5, "bao": [5, 6, 7], "loe": 5, "barret": [5, 7], "zoph": [5, 7], "bart\u0142omiej": 5, "bojanowski": 5, "batuhan": 5, "\u00f6zyurt": 5, "behnam": 5, "hedayatnia": 5, "neyshabur": 5, "inden": 5, "benno": 5, "stein": 5, "berk": 5, "ekmekci": 5, "blake": 5, "howald": 5, "bryan": 5, "orinion": 5, "diao": 5, "dour": 5, "stinson": 5, "cedrick": 5, "argueta": 5, "c\u00e9sar": 5, "ferri": 5, "ram\u00edrez": 5, "chandan": 5, "charl": 5, "rathkopf": 5, "chenlin": 5, "meng": 5, "chitta": 5, "baral": 5, "chiyu": 5, "callison": 5, "burch": 5, "voigt": 5, "cindi": 5, "ramirez": 5, "clara": 5, "rivera": 5, "clemencia": 5, "siro": 5, "colin": [5, 6], "raffel": [5, 6], "courtnei": 5, "ashcraft": 5, "cristina": 5, "garbacea": 5, "damien": [5, 7], "sileo": 5, "garrett": 5, "kilman": 5, "freeman": 5, "khashabi": 5, "levi": [5, 7], "mosegu\u00ed": 5, "gonz\u00e1lez": 5, "perszyk": 5, "danqi": 5, "dar": 5, "gilboa": 5, "dohan": [5, 7], "drakard": 5, "jurgen": 5, "debajyoti": 5, "datta": 5, "deni": 5, "emelin": 5, "kleyko": 5, "deniz": 5, "yuret": 5, "derek": [5, 7], "tam": [5, 8], "dieuwk": 5, "hupk": 5, "diganta": 5, "dilyar": 5, "buzan": 5, "coelho": 5, "mollo": 5, "diyi": 5, "dylan": 5, "schrader": 5, "ekaterina": 5, "shutova": 5, "ekin": 5, "dogu": 5, "cubuk": 5, "elad": 5, "segal": 5, "eleanor": 5, "hagerman": 5, "donowai": 5, "elli": 5, "pavlick": 5, "rodola": 5, "emma": 5, "lam": 5, "chu": [5, 7], "erkut": 5, "erni": 5, "dyer": 5, "jerzak": 5, "eunic": 5, "engefu": 5, "manyasi": 5, "evgenii": 5, "zheltonozhskii": 5, "fanyu": 5, "fatemeh": 5, "siar": 5, "fernando": 5, "mart\u00ednez": 5, "plume": 5, "francesca": 5, "happ\u00e9": 5, "gaurav": 5, "genta": 5, "indra": 5, "winata": 5, "gerard": 5, "melo": 5, "germ\u00e1n": 5, "kruszewski": 5, "giambattista": [5, 7], "parascandolo": [5, 7], "giorgio": 5, "mariani": 5, "gloria": 5, "gonzalo": 5, "jaimovitch": 5, "l\u00f3pez": 5, "gregor": 5, "betz": 5, "gui": [5, 6], "gur": 5, "hana": 5, "galijasev": 5, "rashkin": 5, "hannaneh": 5, "hajishirzi": 5, "harsh": 5, "hayden": 5, "bogar": 5, "henri": [5, 7], "shevlin": 5, "hinrich": 5, "sch\u00fctze": 5, "hiromu": 5, "yakura": 5, "hongm": 5, "hugh": 5, "mee": 5, "wong": [5, 7], "isaac": 5, "nobl": 5, "jaap": 5, "jumelet": 5, "geissing": 5, "jaehoon": 5, "jaim": 5, "fern\u00e1ndez": 5, "fisac": 5, "simon": 5, "koppel": 5, "koco\u0144": 5, "jana": 5, "thompson": [5, 6, 7], "janel": 5, "wingfield": 5, "jarema": 5, "radom": 5, "jascha": 5, "sohl": [5, 7], "dickstein": 5, "phang": 5, "yosinski": 5, "jekaterina": 5, "novikova": 5, "jell": 5, "bosscher": 5, "jennif": 5, "marsh": 5, "jeroen": 5, "taal": 5, "engel": 5, "jesujoba": 5, "alabi": 5, "jiam": 5, "jillian": 5, "joan": 5, "waweru": 5, "burden": 5, "bali": 5, "batcheld": 5, "berant": 5, "j\u00f6rg": 5, "frohberg": 5, "jo": 5, "rozen": 5, "orallo": 5, "boudeman": 5, "guerr": 5, "tenenbaum": 5, "joyc": 5, "chua": 5, "kanclerz": 5, "karen": 5, "livescu": 5, "karl": 5, "krauth": 5, "karthik": 5, "gopalakrishnan": 5, "katerina": 5, "ignatyeva": 5, "katja": 5, "markert": 5, "kaustubh": 5, "dhole": 5, "gimpel": 5, "omondi": 5, "kori": 5, "mathewson": 5, "kristen": 5, "chiafullo": 5, "ksenia": 5, "shkaruta": 5, "shridhar": 5, "kyle": [5, 7], "mcdonel": 5, "richardson": 5, "laria": 5, "reynold": 5, "leo": [5, 7], "dugan": 5, "lianhui": 5, "lidia": 5, "contrera": 5, "ochando": 5, "morenc": 5, "moschella": 5, "luci": 5, "ludwig": 5, "schmidt": [5, 7], "luheng": 5, "olivero": 5, "col\u00f3n": 5, "metz": [5, 7], "l\u00fctfi": 5, "kerem": 5, "\u015fenel": 5, "maarten": [5, 7], "bosma": 5, "sap": [5, 7], "maartj": 5, "hoev": 5, "maheen": 5, "farooqi": 5, "manaal": 5, "faruqui": 5, "marco": 5, "baturan": 5, "marelli": 5, "maru": 5, "maria": 5, "quintana": 5, "tolkiehn": 5, "mario": [5, 7], "giulianelli": 5, "martha": 5, "potthast": 5, "leavitt": 5, "hagen": 5, "m\u00e1ty\u00e1": 5, "schubert": 5, "medina": [5, 7], "orduna": 5, "baitemirova": 5, "melodi": 5, "arnaud": 5, "melvin": 5, "mcelrath": 5, "yee": 5, "cohen": 5, "ivanitskii": 5, "starritt": 5, "strube": 5, "micha\u0142": 5, "sw\u0119drowski": 5, "michel": [5, 7], "bevilacqua": 5, "mihir": 5, "kale": 5, "cain": 5, "mime": 5, "mitch": 5, "walker": 5, "mo": 5, "tiwari": 5, "mohit": 5, "bansal": 5, "moin": 5, "aminnaseri": 5, "mor": 5, "geva": 5, "mozhdeh": 5, "gheini": 5, "mukund": 5, "varma": 5, "nanyun": 5, "peng": [5, 7], "nayeon": 5, "neta": 5, "krakov": 5, "doiron": 5, "nicol": 5, "martinez": 5, "nikita": 5, "nangia": 5, "nikla": 5, "decker": 5, "muennighoff": 5, "nitish": [5, 7], "shirish": [5, 7], "keskar": [5, 7], "niveditha": 5, "constant": 5, "fiedel": 5, "nuan": 5, "wen": 5, "oliv": [5, 7], "agha": 5, "elbaghdadi": 5, "omer": 5, "moreno": 5, "casar": 5, "parth": 5, "doshi": 5, "pascal": 5, "fung": 5, "pu": 5, "vicol": 5, "pegah": 5, "alipoormolabashi": 5, "peiyuan": 5, "eckerslei": 5, "phu": 5, "mon": 5, "htut": 5, "pinyu": 5, "hwang": 5, "piotr": 5, "mi\u0142kowski": 5, "piyush": 5, "pouya": 5, "pezeshkpour": 5, "priti": 5, "oli": 5, "qiaozhu": 5, "qing": 5, "qinlang": 5, "rabin": 5, "banjad": 5, "rachel": [5, 7], "etta": 5, "rudolph": 5, "raefer": 5, "rahel": 5, "haback": 5, "ramon": 5, "risco": 5, "rapha\u00ebl": 5, "milli\u00e8r": 5, "rhythm": 5, "garg": [5, 6], "rif": 5, "saurou": 5, "riku": 5, "arakawa": 5, "robb": 5, "raymaek": 5, "rohan": 5, "sikand": 5, "novak": 5, "sitelew": 5, "lebra": 5, "rosann": 5, "rowan": [5, 7], "ruslan": 5, "salakhutdinov": 5, "stoval": 5, "teehan": 5, "sahib": 5, "saif": 5, "sajant": 5, "dillav": 5, "shleifer": 5, "wiseman": 5, "gruetter": 5, "schoenholz": 5, "sanghyun": 5, "sanjeev": 5, "kwatra": 5, "sarik": 5, "ghazarian": 5, "sayan": 5, "casei": [5, 7], "bischoff": 5, "gehrmann": 5, "schuster": 5, "sepideh": 5, "sadeghi": 5, "shadi": 5, "hamdan": 5, "sharon": 5, "shashank": 5, "sherri": 5, "shi": 5, "shikhar": 5, "shima": 5, "asaadi": 5, "shubh": 5, "pachchigar": 5, "shubham": 5, "toshniw": 5, "shyam": [5, 7], "upadhyai": 5, "shyamolima": 5, "debnath": 5, "siamak": 5, "shakeri": 5, "thormey": 5, "melzi": 5, "siva": 5, "reddi": 5, "sneha": 5, "priscilla": 5, "makini": 5, "soo": 5, "hwan": 5, "toren": 5, "sriharsha": 5, "hatwar": 5, "stanisla": 5, "dehaen": 5, "stefan": 5, "divic": 5, "stella": 5, "biderman": 5, "stephen": 5, "prasad": 5, "piantadosi": 5, "stuart": [5, 7], "shieber": 5, "summer": [5, 7], "misherghi": 5, "svetlana": 5, "kiritchenko": 5, "swaroop": 5, "tal": 5, "linzen": 5, "tariq": 5, "tatsu": 5, "te": 5, "th\u00e9o": 5, "desbord": 5, "theodor": 5, "rothschild": 5, "phan": [5, 7], "tiberiu": 5, "nkinyili": 5, "timo": 5, "schick": 5, "timofei": 5, "kornev": 5, "titu": 5, "tunduni": 5, "gerstenberg": 5, "trenton": 5, "trishala": 5, "neeraj": 5, "tushar": 5, "khot": 5, "shultz": 5, "uri": 5, "shaham": 5, "vera": 5, "demberg": 5, "victoria": [5, 7], "nyamai": 5, "vika": 5, "raunak": 5, "vinai": 5, "ramasesh": 5, "udai": 5, "prabhu": 5, "vishakh": 5, "padmakumar": 5, "vivek": 5, "srikumar": 5, "fedu": [5, 7], "wout": 5, "vossen": 5, "xiaoyu": 5, "tong": [5, 7], "xinran": 5, "xinyi": 5, "yadollah": 5, "yaghoobzadeh": 5, "yair": 5, "lakretz": 5, "yangqiu": 5, "yasaman": 5, "bahri": 5, "yichi": 5, "yide": 5, "yifu": 5, "yonatan": 5, "belinkov": 5, "yufang": 5, "seid": 5, "zhuoy": 5, "zijian": 5, "ziji": 5, "zirui": 5, "ziyi": 5, "extrapol": 5, "2206": 5, "04615": 5, "wpn": 5, "yada": 5, "pruksachatkun": 5, "amanpreet": 5, "hill": 5, "stickier": 5, "wsm": 5, "1804": 5, "07461": 5, "wtb": 5, "tai": 5, "borgeaud": 5, "dani": 5, "yogatama": 5, "denni": [5, 7], "donald": 5, "metzler": 5, "ed": 5, "oriol": 5, "vinyal": 5, "dean": 5, "07682": 5, "wdr": 5, "doolei": 5, "manlei": 5, "arka": [5, 7], "pal": 5, "feuer": 5, "siddhartha": 5, "ravid": 5, "shwartz": [5, 7], "ziv": 5, "khalid": [5, 6], "saifullah": 5, "siddartha": 5, "naidu": 5, "chinmai": 5, "hegd": 5, "lecun": 5, "goldstein": 5, "willi": 5, "neiswang": 5, "micah": 5, "goldblum": 5, "19314": 5, "yyh": 5, "baosong": [5, 6], "chengpeng": 5, "chengyuan": [5, 6], "fei": [5, 6], "guant": 5, "haoran": [5, 6], "huan": [5, 6], "jialong": 5, "jialin": 5, "jianhong": [5, 6], "tu": [5, 6], "jianwei": [5, 6], "jianxin": [5, 6], "jin": [5, 7], "jingren": [5, 6], "jinz": 5, "jinzheng": 5, "junyang": [5, 6], "keme": [5, 6], "keqin": [5, 6], "kexin": [5, 6], "mingfeng": [5, 6], "xue": [5, 6, 7], "ni": 5, "pei": [5, 6], "ru": 5, "men": [5, 6], "ruiz": 5, "runji": [5, 6], "shiji": 5, "sinan": 5, "tianhang": 5, "wenbin": 5, "ge": 5, "xiaodong": 5, "deng": 5, "xiaohuan": 5, "xingzhang": [5, 6], "xinyu": [5, 7], "xipin": 5, "xuancheng": [5, 6], "yichang": [5, 6], "wan": [5, 6], "yunfei": 5, "yuqiong": [5, 6], "zhenru": [5, 6], "zhihao": 5, "10671": 5, "zcl24": 5, "zhihan": 5, "cao": 5, "lizi": 5, "openreview": 5, "forum": 5, "aegrf1uy0p": 5, "zc": 5, "siyuan": 5, "zhuang": [5, 7], "zhanghao": 5, "yonghao": 5, "zi": 5, "zhuohan": 5, "xing": [5, 7], "2306": 5, "05685": 5, "huggingface24": 5, "metaai24": 5, "di": 6, "hunter": 6, "photo": 6, "email": 6, "hipaa": 6, "properti": [6, 7], "gdpr": 6, "iot": 6, "unreli": 6, "impract": 6, "slm": 6, "viabl": 6, "sensor": 6, "interconnect": 6, "frontend": 6, "garner": 6, "traction": 6, "yourself": 6, "aw": [6, 7], "bedrock": 6, "sambanova": 6, "sla": 6, "veloc": 6, "roadmap": 6, "commodit": 6, "winner": 6, "loser": 6, "condens": 6, "clean": 6, "2024t": 6, "versatil": 6, "72b": 6, "med": 6, "bloomberggpt": 6, "underw": 6, "adept": 6, "toxigen": 6, "alnajjar": 6, "13b": [6, 7], "01": 6, "outperform": 6, "32b": 6, "feasibl": 6, "2m": 6, "unstructur": [6, 8], "modal": 6, "diagnosi": 6, "patient": 6, "necessit": 6, "flagship": 6, "405b": 6, "gemini": 6, "pack": 6, "cautious": 6, "isol": [6, 7], "cpot": 6, "cpit": 6, "tco": 6, "tpot": 6, "ttft": 6, "gpqa": 6, "ratio": 6, "median": 6, "afford": 6, "lite": 6, "micro": 6, "encod": [6, 7, 8], "cent": 6, "1m": 6, "cheapest": 6, "phi": 6, "half": [6, 7], "permiss": [6, 7], "apach": 6, "microsoft": 6, "simpler": [6, 7, 8], "fewer": [6, 7], "700m": 6, "100m": 6, "gemma": [6, 8], "deepseek": 6, "v2": [6, 7], "grown": 6, "withdraw": 6, "incomplet": [6, 7], "preprocess": [6, 8], "unclear": 6, "15t": 6, "8t": 6, "fineweb": 6, "penedo": 6, "96": [6, 7], "crawl": 6, "snapshot": 6, "codebas": 6, "ablat": 6, "vital": [6, 7], "favorit": 6, "spawn": 6, "ultrachat": 6, "2024u": 6, "created_job": 6, "fine_tun": 6, "training_fil": 6, "file_id": 6, "ultrachat_chunk_train": 6, "validation_fil": 6, "ultrachat_chunk_ev": 6, "training_step": 6, "0001": 6, "auto_start": 6, "job_id": 6, "toolkit": [6, 7], "sft": 6, "nemo": [6, 7], "codestr": 6, "2024v": 6, "enough": 6, "despit": [6, 8], "rewrit": 6, "smolvlm": 6, "mlx": [6, 8], "mlc": 6, "peft": 6, "programm": 6, "graphic": [6, 7], "vram": 6, "mathbf": 6, "x_1": [6, 8], "x_2": [6, 8], "x_n": [6, 8], "x_": [6, 8], "\u03b8": 6, "matrix": [6, 7], "cerebra": 6, "mozilla": 6, "docker": 6, "gerganov": 6, "georgi": 6, "hundr": 6, "overwhelm": [6, 8], "manifesto": 6, "enjoy": 6, "bog": 6, "exploratori": 6, "hacker": 6, "Will": [6, 7], "prototyp": 6, "prematur": 6, "besid": 6, "lighter": 6, "sacrific": 6, "unifi": [6, 8], "ggml": [6, 8], "ibm": [6, 7], "metadata": 6, "disk": 6, "backward": 6, "2024x": 6, "repo": 6, "easier": [6, 7, 8], "compil": 6, "linux": 6, "argument": [6, 7, 8], "sudo": 6, "apt": 6, "cmake": 6, "bind": 6, "betlen": 6, "cnv": 6, "llamacpp": 6, "succinct": 6, "ctrl": 6, "interject": 6, "philosoph": 6, "debat": 6, "fulfil": 6, "happi": 6, "responsibli": 6, "bye": 6, "goodby": 6, "port": 6, "127": 6, "curl": [6, 8], "localhost": 6, "v1": [6, 7], "bearer": 6, "finish_reason": 6, "deepli": 6, "1734627879": 6, "completion_token": 6, "total_token": 6, "chatcmpl": 6, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 6, "prompt_n": 6, "prompt_m": 6, "132": 6, "prompt_per_token_m": 6, "prompt_per_second": 6, "77619878666999": 6, "predicted_n": 6, "predicted_m": 6, "1700": 6, "654": [6, 8], "predicted_per_token_m": 6, "36882142857143": 6, "predicted_per_second": 6, "92850867960208": 6, "gbnf": [6, 8], "8pm": 6, "appointmenttim": 6, "appointmentdetail": 6, "handi": 6, "model_path": 6, "llama_cpp": 6, "create_chat_complet": 6, "occupi": 6, "activist": 6, "justin": [6, 7], "tunnei": 6, "ocho": 6, "appach": 6, "cosmopolitan": 6, "libc": 6, "portabl": 6, "durabl": 6, "usabl": [6, 7, 8], "tinyllama": 6, "wget": 6, "jartin": 6, "q5_k_m": 6, "renam": 6, "ex": 6, "chmod": 6, "nobrows": 6, "registri": 6, "nativ": [6, 8], "container": 6, "trai": 6, "familiar": 6, "bare": 6, "ssfl": 6, "sh": [6, 8], "Or": 6, "11434": 6, "chatrespons": 6, "easiest": 6, "rich": [6, 7], "playground": 6, "simultan": [6, 7], "verif": [6, 8], "importantli": [6, 8], "intuit": 6, "beginn": 6, "tensorrt": 6, "trt": 6, "latex": 6, "voic": 6, "pwa": 6, "medium": [6, 7, 8], "gpt4all": 6, "rbac": 6, "q4_k": 6, "q6_k": 6, "mib": 6, "wikitext": 6, "salesforc": 6, "wikipedia": [6, 8], "min_prompt_length": 6, "input_texts_raw": 6, "2010": 6, "valkyria": 6, "chronicl": 6, "forgiv": 6, "newcom": 6, "raita": 6, "honjou": 6, "compos": [6, 7], "hitoshi": 6, "sakimoto": 6, "takeshi": 6, "ozawa": 6, "writer": 6, "theme": [6, 7], "sung": 6, "escap": 6, "escaped_text": 6, "block_scal": 6, "block": [6, 7], "parenthes": 6, "block_min": 6, "formula": 6, "superblock": 6, "5625": 6, "ieee": 6, "754": 6, "ppl": 6, "exp": 6, "sum_": 6, "log_2": 6, "x_i": [6, 8], "avg": 6, "_i": 6, "corr": 6, "ln": [6, 8], "kullback": 6, "leibler": 6, "entropi": 6, "logit": 6, "d_": 6, "softmax": [6, 8], "sum": 6, "kld": 6, "q2_kresult": 6, "q6": 6, "004": 6, "q2": 6, "112": 6, "q4": 6, "smallest": 6, "390": 6, "67": [6, 7], "81": [6, 7], "93": [6, 7], "462": 6, "614": 6, "170": 6, "q4_k_m": 6, "thread": 6, "16x": 6, "85x": 6, "79x": 6, "ubuntu": 6, "lt": 6, "x86_64": 6, "gnu": 6, "thank": [6, 8], "intel": 6, "i7": 6, "8550u": 6, "15gib": 6, "samsung": 6, "ssd": 6, "970": 6, "evo": 6, "500gb": 6, "1170": 6, "meant": 6, "ai4c": 6, "ai4a": 6, "paperswithcod": [6, 7], "ana24a": 6, "leaderboard": [6, 7], "artificialanalysi": 6, "ana24b": 6, "ana24c": 6, "bc24": 6, "andrei": [6, 7], "abetlen": 6, "fac4": 6, "optimum": 6, "concept_guid": 6, "fac4t": 6, "fac4u": 6, "200k": 6, "ultrachat_200k": 6, "fac4v": 6, "blogpost": 6, "gc24": 6, "ggerganov": [6, 8], "blob": [6, 8], "readm": [6, 8], "gc4a": 6, "gc4b": 6, "pka": 6, "guilherm": 6, "hynek": 6, "kydl\u00ed\u010dek": 6, "decant": 6, "finest": 6, "17557": 6, "qwe4b": 6, "qy": 6, "beichen": 6, "tingyu": 6, "zihan": 6, "qiu": 6, "15115": 6, "rev24": 6, "harvard": 6, "nyt": 6, "harvardlawreview": 6, "timess": 6, "zwa": 6, "wael": 6, "geoffrei": [6, 7], "angu": 6, "arnav": 6, "jefferi": 6, "kinnison": 6, "sherstinski": 6, "piero": 6, "molino": 6, "travi": 6, "addair": 6, "devvret": 6, "310": 6, "2405": 6, "00732": 6, "huggingface4xa": 6, "huggingface4xb": 6, "ibmthink24": 6, "lmstudio24": 6, "lmstudio": 6, "metaai4c": 6, "mozillaocho24": 6, "salesforce24": 6, "immens": 7, "commonplac": 7, "hartvigsen": 7, "societi": 7, "statement": 7, "alarm": 7, "openli": 7, "dolli": 7, "llama2": [7, 8], "emb": 7, "generalist": 7, "injustic": 7, "inequ": 7, "undermin": 7, "perpetu": 7, "displac": 7, "eros": 7, "fake": 7, "deepfak": 7, "distrust": 7, "cyberattack": 7, "spread": 7, "disinform": 7, "inadvert": 7, "interven": 7, "irrevers": 7, "uncheck": 7, "extinct": 7, "race": 7, "incentiv": 7, "shortcut": 7, "behind": 7, "stress": 7, "urgent": 7, "reorient": 7, "birth": 7, "siam": 7, "edgington": 7, "jailbreak": 7, "promptcraft": 7, "stealth": 7, "sutton": 7, "subtl": 7, "subtleti": 7, "exception": 7, "phrase": 7, "evad": 7, "hqve": 7, "frer": 7, "hplidai": 7, "pl": 7, "hyperion": 7, "coast": 7, "redwood": 7, "tallest": 7, "tree": [7, 8], "routin": 7, "prejudic": 7, "gallego": 7, "leak": 7, "poison": 7, "intention": 7, "inject": 7, "mislead": 7, "exabeam": 7, "finra": 7, "3110": 7, "mandat": 7, "supervisori": 7, "unicef": 7, "empow": 7, "contest": 7, "congress": 7, "enact": 7, "pictur": [7, 8], "territori": 7, "oversea": 7, "chines": 7, "legitim": 7, "consent": 7, "complaint": 7, "cooper": 7, "extraterritori": 7, "offshor": 7, "draft": 7, "voluntari": 7, "neutral": 7, "player": 7, "prepared": 7, "compris": 7, "cbrn": 7, "persuas": 7, "autonomi": 7, "gradat": 7, "scorecard": 7, "elig": 7, "advisori": 7, "sag": 7, "shut": 7, "prerequisit": 7, "harden": 7, "asl": 7, "biosafeti": 7, "elev": 7, "warn": [7, 8], "bioweapon": 7, "compartment": 7, "difficulti": 7, "4x": 7, "jump": 7, "paus": 7, "deepmind": 7, "biosecur": 7, "buffer": 7, "formul": [7, 8], "calibr": 7, "promin": 7, "taxonomi": 7, "llamaguard": 7, "20241022": 7, "3x": 7, "5x": 7, "alaga": 7, "substandard": 7, "oxford": 7, "wachter": 7, "blur": 7, "ill": 7, "stifl": 7, "suscept": 7, "aadc": 7, "outset": 7, "curricula": 7, "adversari": 7, "uncov": [7, 8], "appar": 7, "thoroughli": 7, "lm": [7, 8], "problemat": 7, "undergo": 7, "280b": 7, "cai": [7, 8], "utilis": 7, "minimis": 7, "enshrin": 7, "evas": 7, "resort": 7, "avenu": 7, "cambria": 7, "inherit": 7, "influenti": 7, "debias": 7, "occurr": 7, "phish": 7, "clarifi": 7, "toler": 7, "checklist": 7, "abus": 7, "ux": 7, "architect": 7, "diagram": 7, "retrofit": 7, "promptli": 7, "dashboard": 7, "misalign": 7, "star": 7, "postpon": 7, "combat": 7, "counter": 7, "traffic": 7, "frustrat": 7, "workaround": 7, "silo": 7, "hierarch": 7, "hierarchi": 7, "66": 7, "depth": 7, "mcq": 7, "regex": [7, 8], "joint": 7, "facet": 7, "purpl": 7, "circl": 7, "opensafetylab": 7, "salad_bench_dataset": 7, "base_set": 7, "gptfuzzer": 7, "auto": [7, 8], "qid": 7, "o1": 7, "supremaci": 7, "o53": 7, "o14": 7, "o5": 7, "o65": 7, "plagiar": 7, "o16": 7, "o6": 7, "o47": 7, "campaign": 7, "o12": 7, "o52": 7, "surveil": 7, "spous": 7, "know": [7, 8], "o13": 7, "breakdown": 7, "ncount": 7, "21318": 7, "8756": 7, "6486": 7, "o2": 7, "1717": 7, "o4": 7, "1477": 7, "o3": 7, "socioeconom": 7, "851": 7, "int64": 7, "gen": 7, "15433": 7, "hh": 7, "4184": 7, "659": 7, "advbench": 7, "230": 7, "189": 7, "toxicchat": 7, "anyth": 7, "817": 7, "misconcept": 7, "ingrain": 7, "mc1": 7, "singular": 7, "choices4": 7, "mc2": 7, "set4": 7, "scorer": 7, "correctli": [7, 8], "truthful_qa": 7, "truthfulqa_dataset": 7, "multiple_choic": 7, "best_answ": 7, "correct_answ": 7, "incorrect_answ": 7, "watermelon": 7, "digest": 7, "noth": 7, "stomach": 7, "sick": 7, "wonderopoli": 7, "wonder": 7, "belli": 7, "swallow": 7, "dream": 7, "die": 7, "indigest": 7, "unconsci": 7, "excret": 7, "asr": 7, "r2d2": 7, "wider": [7, 8], "mass": 7, "destruct": 7, "asynchron": 7, "webpurifi": 7, "protectai": 7, "comprehend": 7, "amazon": 7, "nvidia": [7, 8], "keyword": 7, "toolset": 7, "nemmo": 7, "synchron": 7, "nemoguardrail": 7, "llmrail": 7, "railsconfig": 7, "from_path": 7, "rail": 7, "hello": 7, "ministr": 7, "mistralai": 7, "mistral_api_kei": 7, "moderate_chat": 7, "omni": 7, "pprint": 7, "to_json": 7, "threaten": 7, "illicit": 7, "granit": 7, "guardian": 7, "consortium": 7, "v3": 7, "11b": 7, "begin_of_text": 7, "start_header_id": 7, "end_header_id": 7, "unsafe_categori": 7, "user_message_1": 7, "model_answer_1": 7, "comma": 7, "eot_id": 7, "eom_id": 7, "denot": 7, "s1": 7, "s2": 7, "s3": 7, "s4": 7, "s5": 7, "defam": 7, "s6": 7, "s7": 7, "s8": 7, "s9": 7, "s10": 7, "s11": 7, "s12": 7, "s13": 7, "atla": 7, "2b": 7, "hap": 7, "38m": 7, "125m": 7, "padhi": 7, "shieldgemma": 7, "judge_prompt": 7, "american": 7, "vandal": 7, "underag": 7, "drink": 7, "vulgar": 7, "obscen": 7, "racism": 7, "derogatori": 7, "firearm": 7, "safety_scor": 7, "IN": 7, "borderlin": 7, "verdict": 7, "boolean": [7, 8], "brief": 7, "rational": 7, "delimit": 7, "paramount": [7, 8], "evenli": 7, "good_sampl": 7, "bad_sampl": 7, "2024z": 7, "surg": 7, "scam": 7, "get_profanity_sampl": 7, "show_stat": 7, "bool": 7, "current_dir": 7, "getcwd": 7, "data_path": 7, "profanity_en": 7, "random_st": 7, "ncategori": 7, "category_count": 7, "category_1": 7, "1f": 7, "profanity_sampl": 7, "nsampl": 7, "anatomi": 7, "slur": 7, "182": 7, "bodili": 7, "fluid": 7, "excrement": 7, "insult": 7, "mental": 7, "disabl": 7, "jap": 7, "babi": [7, 8], "batter": 7, "crazi": 7, "sob": 7, "fukka": 7, "sh1t3": 7, "get_salad_sampl": 7, "salad_sampl": 7, "tortur": 7, "porn": 7, "sin": 7, "sight": 7, "god": 7, "embezzl": 7, "xanax": 7, "alcohol": 7, "get_good_sampl": 7, "min_scor": 7, "reichstag": 7, "profanity_data": 7, "salad_data": 7, "good_data": 7, "all_data": 7, "prompt_sampl": 7, "is_unsaf": 7, "counti": 7, "holli": 7, "ridg": 7, "nc": 7, "town": 7, "onslow": 7, "carolina": 7, "diver": 7, "underwat": 7, "maze": 7, "coral": 7, "treasur": 7, "vivid": 7, "sensori": 7, "emot": 7, "labyrinthin": 7, "passag": 7, "reef": 7, "suspens": 7, "obstacl": 7, "creatur": 7, "nomin": 7, "nobel": 7, "literatur": 7, "love": 7, "ny": [7, 8], "logo": 7, "thief": 7, "rob": 7, "famou": 7, "nstatist": 7, "source_stat": 7, "type_stat": 7, "tiktoken": 7, "plug": 7, "safetyvalid": 7, "validationresult": 7, "dataclass": 7, "abstractmethod": 7, "llmguardvalid": 7, "scanner": 7, "bantop": 7, "llm_guard": 7, "input_scann": 7, "scan_prompt": 7, "matchtyp": 7, "default_banned_top": 7, "banned_top": 7, "super": 7, "banned_topics_scann": 7, "use_onnx": 7, "toxicity_scann": 7, "match_typ": 7, "fail_fast": 7, "unsafe_scann": 7, "gun": 7, "cool": 7, "hunt": 7, "deer": 7, "dad": 7, "mistralvalid": 7, "hate_and_discrimin": 7, "violence_and_threat": 7, "dangerous_and_criminal_cont": 7, "selfharm": 7, "openaivalid": 7, "attr": 7, "dir": 7, "getattr": 7, "illicit_viol": 7, "llmjudgevalid": 7, "prompt_path": 7, "llmjudg": 7, "filenotfounderror": 7, "elaps": 7, "score_valid": 7, "id_": 7, "validator_inst": 7, "start_tim": 7, "validation_result": 7, "elapsed_tim": 7, "prompt_sample_id": 7, "validator_nam": 7, "scoring_prompt": 7, "scoring_result": 7, "01536": 7, "34098": 7, "497136": 7, "546416": 7, "calculate_validator_metr": 7, "bad_sourc": 7, "good_sourc": 7, "tpr": 7, "fpr": 7, "f1_score": 7, "tn": 7, "fp": 7, "fn": 7, "tp": 7, "255": 7, "74": 7, "238": 7, "130": 7, "256": 7, "79": 7, "149": 7, "213": 7, "190": 7, "235": 7, "63": 7, "222": 7, "480": 7, "157": 7, "487": 7, "495": 7, "482": 7, "667": 7, "248": 7, "466": 7, "143": 7, "355": 7, "slowest": 7, "room": 7, "false_posit": 7, "left_on": 7, "right_on": 7, "persian": 7, "teenag": 7, "\u0686\u0627\u0644\u0634": 7, "\u0645\u0648\u0645\u0648": 7, "\u06cc\u06a9": 7, "\u062d\u0642\u0647": 7, "\u0648": 7, "\u0627\u0641\u0633\u0627\u0646\u0647": 7, "\u0645\u062d\u0644\u06cc": 7, "\u0627\u0633\u062a": 7, "\u06a9\u0647": 7, "\u0634\u0627\u06cc\u0639\u0647": 7, "\u0622\u0646": 7, "\u062f\u0631": 7, "\u0634\u0628\u06a9\u0647": 7, "\u0647\u0627\u06cc": 7, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 7, "\u067e\u062e\u0634": 7, "\u0634\u062f": 7, "\u06af\u0632\u0627\u0631\u0634": 7, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 7, "\u0628\u0627": 7, "\u0646\u0627\u0645": 7, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 7, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 7, "\u0631\u0627": 7, "\u0628\u0647": 7, "\u0627\u0639\u0645\u0627\u0644": 7, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 7, "\u0645\u0627\u0646\u0646\u062f": 7, "\u062e\u0648\u062f\u0632\u0646\u06cc": 7, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 7, "\u062a\u0631\u063a\u06cc\u0628": 7, "\u0645\u06cc": 7, "\u06a9\u0646\u062f": 7, "\u0648\u062c\u0648\u062f": 7, "\u0635\u062d\u0628\u062a": 7, "\u0632\u06cc\u0627\u062f\u06cc": 7, "\u0645\u0648\u0631\u062f": 7, "\u0627\u06cc\u0646": 7, "\u062a\u0639\u062f\u0627\u062f": 7, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 7, "\u0648\u0627\u0642\u0639\u06cc": 7, "\u0628\u0633\u06cc\u0627\u0631": 7, "\u06a9\u0645": 7, "\u0628\u0648\u062f": 7, "\u067e\u0644\u06cc\u0633": 7, "\u0635\u062f\u0645\u0647": 7, "\u062f\u06cc\u062f\u0646": 7, "\u062f\u0644\u06cc\u0644": 7, "\u062a\u0623\u062b\u06cc\u0631": 7, "\u0645\u0633\u062a\u0642\u06cc\u0645": 7, "\u067e\u062f\u06cc\u062f\u0647": 7, "\u062a\u0623\u06cc\u06cc\u062f": 7, "\u0646\u06a9\u0631\u062f\u0647": 7, "\u062a\u0631\u0633": 7, "\u0646\u06af\u0631\u0627\u0646\u06cc": 7, "\u0627\u06cc\u062c\u0627\u062f": 7, "\u0634\u062f\u0647": 7, "\u0628\u06cc\u0634\u062a\u0631": 7, "\u0627\u0632": 7, "\u062e\u0648\u062f": 7, "\u0631\u0633\u0627\u0646\u0647": 7, "\u0647\u0627": 7, "\u0637\u0648\u0631\u06cc": 7, "\u062e\u06cc\u0631\u06cc\u0647": 7, "\u0647\u0634\u062f\u0627\u0631": 7, "\u062f\u0627\u062f\u0646\u062f": 7, "\u0622\u0633\u06cc\u0628": 7, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 7, "\u0645\u062d\u062a\u0648\u0627\u06cc": 7, "\u062e\u0634\u0648\u0646\u062a": 7, "\u0622\u0645\u06cc\u0632": 7, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 7, "\u06af\u0641\u062a\u0647": 7, "\u0634\u0648\u062f": 7, "\u0627\u0648\u0644\u06cc\u0646": 7, "\u0628\u0627\u0631": 7, "\u0633\u0627\u0644": 7, "\u06f2\u06f0\u06f1\u06f8": 7, "\u067e\u0633": 7, "\u0622\u0646\u06a9\u0647": 7, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 7, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 7, "\u062e\u0628\u0631": 7, "\u062f\u062e\u062a\u0631": 7, "\u06f1\u06f2": 7, "\u0633\u0627\u0644\u0647": 7, "\u062f\u0627\u062f": 7, "\u0645\u0648\u0636\u0648\u0639": 7, "\u062c\u0647\u0627\u0646\u06cc": 7, "\u062a\u0628\u062f\u06cc\u0644": 7, "\u0645\u062c\u0633\u0645\u0647": 7, "\u0647\u0646\u0631\u0645\u0646\u062f": 7, "\u0698\u0627\u067e\u0646\u06cc": 7, "\u0647\u0631": 7, "\u0686\u0646\u062f": 7, "\u0634\u0627\u06cc\u062f": 7, "\u0646\u06af\u0627\u0647": 7, "\u0628\u0639\u0636\u06cc": 7, "\u0632\u06cc\u0628\u0627": 7, "\u0646\u0628\u0627\u0634\u062f": 7, "\u0627\u0645\u0627": 7, "\u06a9\u0627\u0645\u0644\u0627": 7, "\u0628\u06cc": 7, "\u062e\u0637\u0631": 7, "\u0627\u06cc\u0631\u0627\u0646": 7, "\u0645\u062f\u062a": 7, "\u0628\u06cc\u0646": 7, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 7, "\u0645\u0637\u0631\u062d": 7, "\u0633\u0627\u0644\u06cc": 7, "\u0633\u0631\u0627\u0633\u0631": 7, "\u062c\u0647\u0627\u0646": 7, "\u0645\u0634\u0627\u0628\u0647\u06cc": 7, "\u0628\u0631\u0627\u06cc": 7, "\u0648\u0627\u0644\u062f\u06cc\u0646": 7, "\u06a9\u0631\u062f\u0647": 7, "\u0627\u0641\u0631\u0627\u062f": 7, "\u0686\u0647": 7, "\u06a9\u0627\u0631\u06cc": 7, "\u062f\u0639\u0648\u062a": 7, "tourist": 7, "distress": 7, "polish": 7, "galician": 7, "dzisiaj": 7, "szwecji": 7, "innych": 7, "bogatych": 7, "krajach": 7, "ludzi": 7, "u\u017cywaj\u0105": 7, "mn\u00f3stwo": 7, "najr\u00f3\u017cniejszych": 7, "urz\u0105dze\u0144": 7, "hox": 7, "suecia": 7, "outro": 7, "pa\u00eds": 7, "rico": 7, "xent": 7, "usa": [7, 8], "moita": 7, "m\u00e1quina": 7, "diferent": 7, "\u0142\u00f3dka": 7, "zaczyna": 7, "ton\u0105\u0107": 7, "tury\u015bci": 7, "wracaj\u0105": 7, "statek": 7, "dom\u00f3w": 7, "gdzie": 7, "opowiadaj\u0105": 7, "tym": 7, "jak": 7, "zostali": 7, "zaatakowani": 7, "surprisingli": 7, "unsettl": 7, "paradox": 7, "harbor": 7, "wisdom": 7, "aspir": 7, "technologist": 7, "disciplinari": 7, "ethicist": 7, "policymak": 7, "ai24": 7, "asa24": 7, "jide": 7, "jona": 7, "schuett": 7, "marku": 7, "anderljung": 7, "08751": 7, "bhy": 7, "hinton": 7, "pieter": 7, "abbeel": 7, "trevor": 7, "darrel": 7, "yuval": 7, "harari": 7, "ya": 7, "lan": 7, "shai": 7, "shalev": 7, "gillian": 7, "hadfield": 7, "clune": 7, "tegan": 7, "maharaj": 7, "hutter": 7, "at\u0131l\u0131m": 7, "g\u00fcne\u015f": 7, "baydin": 7, "sheila": 7, "mcilraith": 7, "qiqi": 7, "ashwin": 7, "acharya": 7, "anca": 7, "dragan": 7, "philip": 7, "torr": 7, "russel": 7, "kahneman": 7, "s\u00f6ren": 7, "mindermann": 7, "amid": 7, "384": 7, "6698": 7, "1126": 7, "adn0117": 7, "bbc": 7, "emili": 7, "braca": 7, "israel": 7, "carter": 7, "hafsa": 7, "kanchwala": 7, "khojasteh": 7, "charli": 7, "landow": 7, "luo": 7, "magarelli": 7, "mirin": 7, "averi": 7, "moyer": 7, "kayla": 7, "simpson": 7, "amelia": 7, "skawinski": 7, "heverin": 7, "23308": 7, "bmc": 7, "dillon": 7, "brendan": 7, "murphi": 7, "khachaturov": 7, "gleav": 7, "kellin": 7, "pelrin": 7, "2408": [7, 8], "02946": 7, "cmm": 7, "erik": 7, "lorenzo": 7, "malandri": 7, "fabio": 7, "mercorio": 7, "navid": 7, "nobani": 7, "seveso": 7, "15248": 7, "edg24": 7, "exa24": 7, "cyber": 7, "grb": 7, "rossi": 7, "barrow": 7, "mehrab": 7, "tanjim": 7, "sungchul": 7, "franck": 7, "dernoncourt": 7, "ruiyi": 7, "nesreen": 7, "2309": 7, "00770": 7, "h44z": 7, "hgp": 7, "saadia": 7, "hamid": 7, "palangi": 7, "dipankar": 7, "ec": 7, "kamar": 7, "oxi": 7, "smaranda": 7, "muresan": 7, "preslav": 7, "nakov": 7, "alin": 7, "villavicencio": 7, "editor": 7, "60th": 7, "3309": 7, "3326": 7, "dublin": 7, "aclanthologi": 7, "acl": 7, "18653": 7, "hym": 7, "weijiang": 7, "weitao": 7, "weihong": 7, "zhangyin": 7, "haotian": 7, "qianglong": 7, "weihua": 7, "xiaocheng": 7, "bing": 7, "dx": 7, "1145": [7, 8], "3703155": 7, "ldw": 7, "lijun": 7, "ruohui": 7, "xuhao": 7, "wangmeng": 7, "zuo": 7, "dahua": 7, "qiao": 7, "shao": 7, "05044": 7, "mpy": 7, "xuwang": 7, "zifan": 7, "norman": 7, "mu": 7, "elham": 7, "sakhae": 7, "nathaniel": 7, "forsyth": 7, "04249": 7, "mlc24": 7, "illumin": 7, "ailumin": 7, "oaa": 7, "adler": 7, "ahmad": 7, "ilg": 7, "akkaya": 7, "florencia": 7, "leoni": 7, "aleman": 7, "janko": 7, "altenschmidt": 7, "altman": 7, "shyamal": 7, "anadkat": 7, "avila": 7, "valeri": 7, "balcom": 7, "baltescu": 7, "haim": 7, "belgum": 7, "irwan": 7, "bello": 7, "jake": 7, "berdin": 7, "bernadett": 7, "shapiro": 7, "berner": 7, "lenni": 7, "bogdonoff": 7, "boiko": 7, "madelain": 7, "boyd": 7, "luisa": 7, "brakman": 7, "button": 7, "rosi": 7, "campbel": 7, "cann": 7, "brittani": 7, "carei": 7, "carlson": 7, "rori": 7, "carmichael": 7, "che": 7, "foti": 7, "sulli": 7, "rubi": 7, "chess": 7, "chester": 7, "cho": 7, "hyung": 7, "won": 7, "chung": 7, "jeremiah": 7, "currier": 7, "yunx": 7, "cori": 7, "decareaux": 7, "degri": 7, "deutsch": 7, "devil": 7, "dhar": 7, "steve": 7, "dowl": 7, "dun": 7, "adrien": 7, "ecoffet": 7, "atti": 7, "eleti": 7, "tyna": 7, "elound": 7, "farhi": 7, "niko": 7, "sim\u00f3n": 7, "posada": 7, "fishman": 7, "juston": 7, "isabella": 7, "fulford": 7, "georg": 7, "gibson": 7, "vik": 7, "tarun": 7, "gogineni": 7, "goh": 7, "rapha": 7, "gontijo": 7, "lope": 7, "gordon": 7, "morgan": 7, "grafstein": 7, "yufei": 7, "guo": 7, "hallaci": 7, "heaton": 7, "johann": 7, "heideck": 7, "hickei": 7, "wade": 7, "hoeschel": 7, "houghton": 7, "kenni": 7, "hsu": 7, "shengli": 7, "joost": 7, "huizinga": 7, "shawn": 7, "joann": 7, "jang": 7, "roger": 7, "haozhun": 7, "shino": 7, "jomoto": 7, "billi": 7, "jonn": 7, "tomer": 7, "kaftan": 7, "\u0142ukasz": 7, "kamali": 7, "ingmar": 7, "kanitscheid": 7, "tabarak": 7, "khan": 7, "logan": 7, "kilpatrick": 7, "jong": 7, "wook": 7, "christina": 7, "yongjik": 7, "hendrik": 7, "kirchner": 7, "kiro": 7, "matt": 7, "kokotajlo": 7, "kondraciuk": 7, "kondrich": 7, "konstantinidi": 7, "kosic": 7, "vishal": 7, "kuo": 7, "lamp": 7, "ikai": 7, "teddi": 7, "jade": 7, "leung": 7, "chak": 7, "ming": 7, "lim": 7, "molli": 7, "mateusz": 7, "litwin": 7, "theresa": 7, "lopez": 7, "patricia": 7, "lue": 7, "makanju": 7, "malfacini": 7, "markov": 7, "yaniv": 7, "markovski": 7, "bianca": 7, "mayn": 7, "mckinnei": 7, "christin": 7, "mcleavei": 7, "mcmillan": 7, "mcneil": 7, "aalok": 7, "menick": 7, "mishchenko": 7, "vinni": 7, "monaco": 7, "murk": 7, "m\u00e9ly": 7, "ashvin": 7, "nair": 7, "reiichiro": 7, "nakano": 7, "rajeev": 7, "nayak": 7, "arvind": 7, "neelakantan": 7, "hyeonwoo": 7, "noh": 7, "keef": 7, "jakub": 7, "pachocki": 7, "palermo": 7, "ashlei": 7, "pantuliano": 7, "parish": 7, "emi": 7, "parparita": 7, "passo": 7, "perelman": 7, "belbut": 7, "pere": 7, "pokorni": 7, "pokrass": 7, "vitchyr": 7, "pong": 7, "tolli": 7, "powel": 7, "bori": 7, "proehl": 7, "rae": 7, "ramesh": 7, "franci": 7, "kendra": 7, "rimbach": 7, "carl": 7, "rotst": 7, "roussez": 7, "saltarelli": 7, "ted": 7, "sander": 7, "schnurr": 7, "selsam": 7, "kyla": 7, "sheppard": 7, "toki": 7, "sherbakov": 7, "shieh": 7, "shoker": 7, "pranav": 7, "szymon": 7, "sidor": 7, "sigler": 7, "sitkin": 7, "sokolowski": 7, "natali": 7, "staudach": 7, "madelein": 7, "tootoonchian": 7, "tseng": 7, "preston": 7, "tuggl": 7, "turlei": 7, "juan": 7, "cer\u00f3n": 7, "urib": 7, "vallon": 7, "vijayvergiya": 7, "jai": 7, "alvin": 7, "ward": 7, "cj": 7, "weinmann": 7, "akila": 7, "welihinda": 7, "jiayi": 7, "weng": 7, "lilian": 7, "wiethoff": 7, "willner": 7, "wolrich": 7, "lauren": 7, "workman": 7, "sherwin": 7, "yoo": 7, "zeller": 7, "shengjia": 7, "juntang": 7, "zhuk": 7, "2303": 7, "08774": 7, "pnc": 7, "inkit": 7, "manish": 7, "nagireddi": 7, "giandomenico": 7, "cornacchia": 7, "subhajit": 7, "chaudhuri": 7, "tejaswini": 7, "pedapati": 7, "pierr": 7, "dognin": 7, "keerthiram": 7, "murugesan": 7, "miehl": 7, "santill\u00e1n": 7, "kieran": 7, "giulio": 7, "zizzo": 7, "muhammad": 7, "zaid": 7, "hame": 7, "purcel": 7, "desmond": 7, "pan": 7, "ing": 7, "vejsbjerg": 7, "dali": 7, "hind": 7, "werner": 7, "geyer": 7, "ambrish": 7, "rawat": 7, "kush": 7, "varshnei": 7, "prasanna": 7, "sattigeri": 7, "07724": 7, "saffron": 7, "ring": 7, "aslanid": 7, "glaes": 7, "nat": 7, "mcalees": 7, "irv": 7, "2202": 7, "03286": 7, "szw": 7, "qinghua": 7, "higham": 7, "gorban": 7, "bastouni": 7, "ivan": 7, "tyukin": 7, "12670": 7, "vsk": 7, "simplesafetytest": 7, "2311": 7, "08370": 7, "wmr24": 7, "sandra": 7, "brent": 7, "mittelstadt": 7, "duti": 7, "royal": 7, "240197": 7, "royalsocietypublish": 7, "1098": 7, "rso": 7, "ylx24": 7, "jiahao": 7, "xingwei": 7, "zyi": 7, "shune": 7, "lyumanshan": 7, "jingyu": 7, "shui": 7, "haobin": 7, "pengfei": 7, "hewu": 7, "ghost": 7, "14931": 7, "zho24": 7, "amazonwservices24": 7, "anthropic24": 7, "cdn": 7, "1adf000c8f675958c2ee23805d91aaade1cd4613": 7, "centerfasafety24a": 7, "centerforaisafeti": 7, "centerfasafety24b": 7, "deepmind24": 7, "googleapi": 7, "fsf": 7, "europeanmagency24": 7, "ema": 7, "europa": 7, "activities_en": 7, "financialirauthority24": 7, "ibm24": 7, "watsonx": 7, "saa": 7, "libraryocongress23": 7, "loc": 7, "gov": 7, "mistralai24": 7, "mlsteam24": 7, "mlsafeti": 7, "nationaliosatechnology24": 7, "nist": 7, "itl": 7, "nvidia24": 7, "openai24a": 7, "openai24b": 7, "opensafetylab24a": 7, "opensafetylab24b": 7, "protectai24": 7, "surgeai24": 7, "ukgovernment24": 7, "unicef24": 7, "innocenti": 7, "julia": 8, "easili": 8, "trial": 8, "wrangl": 8, "hoc": 8, "dataset": 8, "unwant": 8, "overflow": 8, "twitter": 8, "youtub": 8, "ldot": 8, "prod_": 8, "syntact": 8, "central": 8, "delic": 8, "heart": 8, "xml": 8, "invalid": 8, "ttt": 8, "itt": 8, "nousresearch": 8, "herm": 8, "json_format": 8, "person1": 8, "q1": 8, "person2": 8, "response_cont": 8, "is_json": 8, "myjson": 8, "nest": 8, "conceptu": 8, "overview": 8, "unend": 8, "whitespac": 8, "throw": 8, "somewher": 8, "json_object": 8, "circul": 8, "vertex": 8, "worri": 8, "enum": 8, "secextract": 8, "mentioned_ent": 8, "mentioned_plac": 8, "extract_from_sec_fil": 8, "sec_filing_text": 8, "parser": 8, "hint": 8, "prompt_extract": 8, "sec_extract": 8, "washington": 8, "beg": 8, "1652": 8, "171": 8, "unnorm": 8, "0325": 8, "strongest": 8, "bfloat16": 8, "device_map": 8, "src": 8, "python3": 8, "nvml": 8, "return_tensor": 8, "pt": 8, "inference_mod": 8, "last_token_logit": 8, "next_token_prob": 8, "nn": 8, "dim": 8, "top_k_prob": 8, "top_k_indic": 8, "topk": 8, "top_k_token": 8, "decod": 8, "idx": 8, "skip_special_token": 8, "prob": 8, "0305": 8, "0197": 8, "0106": 8, "0093": 8, "logitsprocessor": 8, "logits_processor": 8, "logitsprocessorlist": 8, "customlogitsprocessor": 8, "intermediari": 8, "input_id": 8, "__call__": 8, "longtensor": 8, "batch_siz": 8, "sequence_length": 8, "floattensor": 8, "vocab_s": 8, "mask": 8, "pick": 8, "greedi": 8, "yesnologitsprocessor": 8, "initial_length": 8, "fill_": 8, "inf": 8, "debug": 8, "yes_token": 8, "add_special_token": 8, "no_token": 8, "yes_no_logit": 8, "yes_no_prob": 8, "yes_prob": 8, "no_prob": 8, "yes_mask": 8, "1e4": 8, "NO": 8, "generation_output_control": 8, "uncontrol": 8, "generation_output": 8, "renorm": 8, "4263": 8, "5737": 8, "10407": 8, "4607": 8, "6250": 8, "9219": 8, "helper": 8, "model_output": 8, "gen_output": 8, "batch_decod": 8, "clean_up_tokenization_spac": 8, "classic": 8, "italian": 8, "willard": 8, "louf": 8, "reformul": 8, "finit": 8, "fsm": 8, "s_": 8, "s_t": 8, "s_1": 8, "tild": 8, "odot": 8, "rightarrow": 8, "wise": 8, "thien": 8, "automaton": 8, "dfa": 8, "outgo": 8, "yy": 8, "ever": 8, "aa": 8, "lwai": 8, "prop": 8, "yynnaa": 8, "malform": 8, "base_prompt": 8, "sec_extraction_outlin": 8, "zsp": 8, "zicorp": 8, "with_structured_output": 8, "runnabl": 8, "typeddict": 8, "qu": 8, "langchain_openai": 8, "chatopenai": 8, "langchain_cor": 8, "chatprompttempl": 8, "extract_from_sec_filing_langchain": 8, "structured_llm": 8, "prompt_templ": 8, "from_messag": 8, "llm_chain": 8, "invok": 8, "sec_extraction_langchain": 8, "bnf": 8, "backu": 8, "naur": 8, "fssl": 8, "extract_entities_from_sec_fil": 8, "ollama_structured_output_prompt_suffix": 8, "ollama_structured_output_temperatur": 8, "uncensor": 8, "model_json_schema": 8, "response_json": 8, "sharpli": 8, "wrapper": 8, "exllama2": 8, "zoo": 8, "furthermor": 8, "nonetheless": 8, "extran": 8, "dispar": 8, "preval": 8, "speak": 8, "aider": 8, "outweigh": 8, "rebutt": 8, "reproduct": 8, "paint": 8, "dottxt": 8, "flaw": 8, "uneven": 8, "conflat": 8, "drawback": 8, "pfiffer": 8, "wrestl": 8, "aid24": 8, "dot24": 8, "demo": 8, "gge24": 8, "lan4b": 8, "lww": 8, "xun": 8, "hanyu": 8, "yezhaohui": 8, "shichao": 8, "simin": 8, "shunyu": 8, "feiyu": 8, "xiong": 8, "12599": 8, "llf": 8, "xieyang": 8, "frederick": 8, "fiannaca": 8, "terri": 8, "koo": 8, "dixon": 8, "ea": 8, "machineri": 8, "3613905": 8, "3650756": 8, "xuan": 8, "hai": 8, "nguyen": 8, "ngoc": 8, "tiviati": 8, "hieu": 8, "dao": 8, "shafiq": 8, "joti": 8, "kenji": 8, "kawaguchi": 8, "nanci": 8, "min": 8, "kan": 8, "08656": 8, "out24": 8, "twt": 8, "zhi": 8, "cheng": 8, "kuang": 8, "tsai": 8, "chieh": 8, "hung": 8, "yun": 8, "nung": 8, "02442": 8, "tt24": 8, "vivien": 8, "vivien000": 8, "wl23": 8, "r\u00e9mi": 8, "09702": 8, "guidanceai24": 8, "nvidia4a": 8, "wikipediacontributors24": 8, "wiktionari": 8, "naur_form": 8}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7, 8], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3, 4], "practic": [0, 2, 6, 8], "approach": [0, 4, 7], "an": 0, "open": [0, 2, 6], "sourc": [0, 2, 6], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": [0, 6], "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 7], "kei": [0, 5], "configur": 0, "troubleshoot": 0, "common": [0, 7], "issu": 0, "author": 0, "": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 6, 7], "guid": 2, "pitfal": [2, 7], "softwar": [2, 5], "chapter": 2, "1": [2, 7], "The": [2, 4, 5, 6], "eval": [2, 5, 7], "gap": [2, 5], "2": [2, 6, 7], "manag": 2, "input": 2, "data": [2, 3], "3": [2, 7], "structur": [2, 8], "output": [2, 8], "4": [2, 7], "safeti": [2, 7], "5": [2, 7], "prefer": [2, 3], "base": [2, 3, 5, 7], "align": [2, 3], "6": [2, 7], "local": [2, 6], "7": 2, "fall": [2, 4], "cost": [2, 4, 6], "paradox": [2, 4], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 5, 6, 7, 8], "resourc": 2, "introduct": [3, 5, 6, 7, 8], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 5, 6], "human": 3, "supervis": 3, "fine": [3, 6, 8], "tune": [3, 6, 8], "sft": 3, "augment": 3, "post": [3, 8], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 6, 7], "studi": [3, 6, 7], "polici": [3, 7], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 5, 6, 7], "synthet": 3, "gener": [3, 5, 7], "user": [3, 7], "prompt": [3, 6, 8], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": [3, 4], "prepar": 3, "vibe": 3, "check": [3, 4], "evalu": [3, 5, 7], "discuss": [3, 8], "conclus": [3, 4, 5, 6, 7, 8], "citat": [3, 5, 7, 8], "refer": [3, 4, 5, 6, 7, 8], "why": 4, "matter": 4, "more": 4, "than": 4, "ever": 4, "right": 4, "size": 4, "strateg": 4, "metric": [4, 5], "requir": [4, 5], "busi": 4, "perform": [4, 6], "oper": 4, "technic": [4, 7], "quantiz": [4, 6], "list": 4, "non": 5, "determinist": 5, "machin": 5, "emerg": 5, "properti": 5, "problem": [5, 8], "statement": [5, 8], "tradit": 5, "v": [5, 6], "design": [5, 7], "applic": 5, "test": 5, "matrix": 5, "conceptu": 5, "overview": 5, "consider": 5, "task": [5, 6], "benchmark": [5, 6, 7], "leaderboard": 5, "lightev": 5, "mmlu": 5, "econometr": 5, "sampl": [5, 7], "famili": [5, 6], "us": 5, "langsmith": 5, "promptfoo": 5, "comparison": [5, 6, 8], "choos": 6, "suitabl": 6, "result": 6, "llama": 6, "licens": 6, "commun": 6, "support": 6, "custom": [6, 7], "mistral": [6, 7], "decemb": 6, "22": 6, "2024": 6, "deploy": 6, "serv": 6, "cpp": 6, "llamafil": 6, "ollama": [6, 8], "lama": 6, "ui": 6, "lm": 6, "studio": 6, "jan": 6, "webui": 6, "openwebui": 6, "effect": 6, "level": 6, "hardwar": 6, "takeawai": [6, 7], "risk": 7, "ai": 7, "amplifi": 7, "exist": 7, "harm": 7, "novel": 7, "associ": 7, "autonom": 7, "exacerb": 7, "factor": 7, "specif": 7, "guidanc": 7, "govern": 7, "organ": 7, "privat": 7, "sector": 7, "openai": 7, "anthrop": 7, "googl": 7, "rubric": 7, "mlcommon": 7, "centr": 7, "porquoi": 7, "red": 7, "team": 7, "constitut": 7, "explain": 7, "xai": 7, "plan": 7, "phase": 7, "definit": 7, "research": [7, 8], "identif": 7, "framework": [7, 8], "architectur": 7, "implement": 7, "select": 7, "go": 7, "market": 7, "compon": 7, "salad": 7, "bench": 7, "truthfulqa": 7, "harmbench": 7, "safebench": 7, "techniqu": [7, 8], "repres": 7, "layer": 7, "map": 7, "rule": 7, "filter": 7, "moder": 7, "bad": 7, "good": 7, "guard": 7, "judg": 7, "valid": 7, "engin": 8, "json": 8, "mode": 8, "logit": 8, "process": 8, "outlin": 8, "langchain": 8, "best": 8, "compar": 8, "solut": 8, "ongo": 8, "debat": 8, "acknowledg": 8}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"], [8, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author(s)": [[0, "about-the-author-s"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Managing Input Data": [[2, "chapter-2-managing-input-data"]], "Chapter 3: Structured Output": [[2, "chapter-3-structured-output"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Falling Cost Paradox": [[2, "chapter-7-the-falling-cost-paradox"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"], [8, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "Citation": [[3, "citation"], [5, "citation"], [7, "citation"], [8, "citation"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"], [8, "references"]], "The Falling Cost Paradox": [[4, "the-falling-cost-paradox"]], "Why Optimization Matters More Than Ever": [[4, "why-optimization-matters-more-than-ever"]], "Right-Sizing LLMs: A Strategic Approach": [[4, "right-sizing-llms-a-strategic-approach"]], "Metrics": [[4, "metrics"], [5, "metrics"]], "Requirements": [[4, "requirements"]], "Business Requirements": [[4, "business-requirements"]], "Performance Requirements": [[4, "performance-requirements"]], "Operational Requirements": [[4, "operational-requirements"]], "Technical Requirements": [[4, "technical-requirements"]], "Quantization": [[4, "quantization"], [6, "quantization"]], "Check-list": [[4, "check-list"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"], [8, "conclusion"]], "The Evals Gap": [[5, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[5, "non-deterministic-generative-machines"]], "Emerging Properties": [[5, "emerging-properties"]], "Problem Statement": [[5, "problem-statement"], [8, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[5, "evals-table"]], "Evals Design": [[5, "evals-design"]], "LLM Application Testing Requirements Matrix": [[5, "validation-requirements"]], "Conceptual Overview": [[5, "conceptual-overview"]], "Design Considerations": [[5, "design-considerations"]], "Key Metrics for Evaluating Generative Tasks": [[5, "key-metrics"]], "Evaluators": [[5, "evaluators"]], "Model-Based Evaluation": [[5, "model-based-evaluation"]], "Evaluating Evaluators": [[5, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[5, "benchmarks-and-leaderboards"]], "Tools": [[5, "tools"], [8, "tools"]], "LightEval": [[5, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[5, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[5, "model-families"]], "LangSmith": [[5, "langsmith"]], "PromptFoo": [[5, "promptfoo"]], "Comparison": [[5, "comparison"], [6, "comparison"], [6, "id36"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[5, "tool-comparison"]], "Local LLMs in Practice": [[6, "local-llms-in-practice"]], "Choosing your Model": [[6, "choosing-your-model"]], "Task Suitability": [[6, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[6, "llama2-benchmark"]], "Performance & Cost": [[6, "performance-cost"]], "Licensing": [[6, "licensing"]], "Open Source LLMs.": [[6, "open-source-llms"]], "Community Support": [[6, "community-support"]], "Customization": [[6, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[6, "mistral-costs"]], "Tools for Local LLM Deployment": [[6, "tools-for-local-llm-deployment"]], "Serving Models": [[6, "serving-models"]], "LLama.cpp": [[6, "llama-cpp"]], "Llamafile": [[6, "llamafile"]], "Ollama": [[6, "ollama"], [8, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[6, "feature-comparison-local"]], "UI": [[6, "ui"]], "LM Studio": [[6, "lm-studio"]], "Jan": [[6, "jan"]], "Open WebUI": [[6, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[6, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[6, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[6, "prompts-dataset"]], "Quantization Levels": [[6, "quantization-levels"]], "Benchmarking": [[6, "benchmarking"], [7, "benchmarking"]], "Results": [[6, "results"]], "Quantization Benchmarks": [[6, "quantization-benchmarks"]], "Benchmarking Hardware": [[6, "benchmarking-hardware"]], "Takeaways": [[6, "takeaways"], [7, "takeaways"]], "Safety": [[7, "safety"]], "Safety Risks": [[7, "safety-risks"]], "General AI Safety Risks": [[7, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[7, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[7, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[7, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[7, "llms-specific-safety-risks"]], "Guidance": [[7, "guidance"]], "Governments & Organizations": [[7, "governments-organizations"]], "Private Sector": [[7, "private-sector"]], "OpenAI": [[7, "openai"]], "Anthropic": [[7, "anthropic"]], "Google": [[7, "google"]], "Rubrics": [[7, "rubrics"]], "MLCommons AI Safety Benchmark": [[7, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[7, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[7, "porquoi"]], "Approaches": [[7, "approaches"]], "Red Teaming": [[7, "red-teaming"]], "Constitutional AI": [[7, "constitutional-ai"]], "Explainable AI (XAI)": [[7, "explainable-ai-xai"]], "Designing a Safety Plan": [[7, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[7, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[7, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[7, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[7, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[7, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[7, "phase-6-go-to-market"]], "Common Pitfalls": [[7, "common-pitfalls"]], "Technical Implementation Components": [[7, "technical-implementation-components"]], "Benchmarks & Datasets": [[7, "benchmarks-datasets"]], "SALAD-Bench": [[7, "salad-bench"]], "TruthfulQA": [[7, "truthfulqa"]], "HarmBench": [[7, "harmbench"]], "SafeBench": [[7, "safebench"]], "Tools & Techniques": [[7, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[7, "safety-layer-table"]], "Rules-Based Safety Filtering": [[7, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[7, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[7, "llm-based-safety-filtering"]], "Custom Moderation": [[7, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[7, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[7, "evals-dataset"]], "Bad Samples": [[7, "bad-samples"]], "Good Samples": [[7, "good-samples"]], "Safety Filters": [[7, "safety-filters"]], "LLM-Guard": [[7, "llm-guard"]], "Mistral Moderation API": [[7, "mistral-moderation-api"]], "OpenAI Moderation API": [[7, "openai-moderation-api"]], "Custom Judge Validator": [[7, "custom-judge-validator"]], "Structured Output": [[8, "structured-output"]], "Techniques": [[8, "techniques"]], "Prompt Engineering": [[8, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[8, "json-mode-fine-tuned"]], "Logit Post-Processing": [[8, "logit-post-processing"]], "Outlines": [[8, "outlines"]], "LangChain": [[8, "langchain"]], "Discussion": [[8, "discussion"]], "Best Practices": [[8, "best-practices"]], "Comparing Solutions": [[8, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[8, "structured-output-frameworks"]], "Research and Ongoing Debate": [[8, "research-and-ongoing-debate"]], "Acknowledgements": [[8, "acknowledgements"]]}, "indexentries": {}})
\ No newline at end of file
diff --git a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
index 98db741..028bf6f 100644
--- a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
+++ b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "b5247b8d",
+   "id": "f2846c09",
    "metadata": {},
    "source": [
     "(intro)=\n",
diff --git a/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb b/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb
index d39e04c..9f62297 100644
--- a/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb
+++ b/tamingllms/_build/jupyter_execute/notebooks/alignment.ipynb
@@ -2537,7 +2537,7 @@
    "source": [
     "## Discussion and Conclusions\n",
     "\n",
-    "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.\n",
+    "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.\n",
     "\n",
     "**Synthetic Data Generation**\n",
     "\n",
diff --git a/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb b/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb
new file mode 100644
index 0000000..b03dd48
--- /dev/null
+++ b/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb
@@ -0,0 +1,450 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(cost)=\n",
+    "# The Falling Cost Paradox\n",
+    "```{epigraph}\n",
+    "It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption. <br>\n",
+    "The very contrary is the truth. \n",
+    "\n",
+    "-- William Stanley Jevons\n",
+    "```\n",
+    "```{contents}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Why Optimization Matters More Than Ever\n",
+    "\n",
+    "According to recent analysis from a16z {cite}`a16z2024llmflation`, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore's Law in the PC revolution or Edholm's Law during the bandwidth explosion of the dot-com era.\n",
+    "\n",
+    "```{figure} ../_static/cost/llmflation.png\n",
+    "---\n",
+    "name: llmflation\n",
+    "alt: LLMflation\n",
+    "scale: 70%\n",
+    "align: center\n",
+    "---\n",
+    "LLMflation {cite}`a16z2024llmflation`: The cost of LLM inference is decreasing by approximately 10x every year.\n",
+    "```\n",
+    "\n",
+    "A model achieving an MMLU score of 42 that cost \\$60 per million tokens in late 2021 can now be run for just \\$0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4's introduction in March 2023. \n",
+    "\n",
+    "This dramatic decline stems from multiple compounding factors including:\n",
+    "\n",
+    "- Improved GPU efficiency through architectural advances and Moore's Law\n",
+    "- Model quantization progress, moving from 16-bit to 4-bit or lower precision\n",
+    "- Software optimizations reducing compute and memory bandwidth requirements\n",
+    "- Emergence of smaller yet similarly capable models\n",
+    "- Better instruction tuning techniques like RLHF and DPO\n",
+    "- Competition from open-source models and low-cost providers\n",
+    "\n",
+    "This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn't it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: **Jevons Paradox**. \n",
+    "\n",
+    "The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.\n",
+    "\n",
+    "This pattern has repeated throughout technological history:\n",
+    "\n",
+    "- Computing Power: As cost per computation plummeted, we didn't spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers\n",
+    "- Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming\n",
+    "- Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models\n",
+    "\n",
+    "One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we're seeing the emergence of new applications:\n",
+    "- Embedding AI capabilities into every application and workflow\n",
+    "- Real-time analysis of audio transcripts and conversations\n",
+    "- Running AI models directly on edge devices and smartphones\n",
+    "- Multimodal applications combining text, images, audio and video \n",
+    "\n",
+    "In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here's why:\n",
+    "\n",
+    "**A) Scale Magnifies Everything**. When operating at billions of tokens per day, even small inefficiencies have major effects:\n",
+    "- A single digit improvement in efficiency can save millions of dollars annually at scale\n",
+    "- Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [^groklatency]\n",
+    "[^groklatency]: Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.\n",
+    "\n",
+    "**B) Tiered Pricing Persists**. While average costs are declining, the market maintains a tiered structure:\n",
+    "- Different models offer varying price-performance tradeoffs\n",
+    "- ChatGPT Pro at \\$200 per month breaks the price drop trend perhaps triggering a new wave of premium models\n",
+    "- Cost optimization is still required to select the right model for each specific use case\n",
+    "\n",
+    "**C) Competition Drives Innovation**. Companies that master LLM efficiency gain significant advantages:\n",
+    "- Ability to offer more competitive pricing\n",
+    "- Capacity to handle larger scale operations\n",
+    "- Resources to invest in product improvement\n",
+    "\n",
+    "**D) Performance and Cost Are Linked**. Cost optimization often yields performance benefits:\n",
+    "- Resource efficiency enables handling larger user loads\n",
+    "- More efficiency and reduced latency leads to improved user experience\n",
+    "\n",
+    "In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.\n",
+    "\n",
+    "Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Right-Sizing LLMs: A Strategic Approach\n",
+    "\n",
+    "Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.\n",
+    "\n",
+    "In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.\n",
+    "\n",
+    "\n",
+    "### Metrics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Requirements\n",
+    "\n",
+    "#### Business Requirements\n",
+    "\n",
+    "First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.\n",
+    "\n",
+    "Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.\n",
+    "\n",
+    "Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.\n",
+    "\n",
+    "Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.\n",
+    "\n",
+    "Chapter {ref}`local` provides a detailed discussion on relevant considerations when {ref}`local-model-selection`.\n",
+    "\n",
+    "#### Performance Requirements\n",
+    "\n",
+    "Accuracy and quality form the foundation of any LLM deployment's performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter {ref}`evals` provides a detailed discussion on how to evaluate the performance of LLM-based applications.\n",
+    "\n",
+    "Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations. \n",
+    "\n",
+    "\n",
+    "#### Operational Requirements\n",
+    "\n",
+    "Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.\n",
+    "\n",
+    "Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.\n",
+    "\n",
+    "#### Technical Requirements\n",
+    "\n",
+    "System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.\n",
+    "\n",
+    "Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.\n",
+    "\n",
+    "\n",
+    "This structured approach to requirements analysis enables organizations to:\n",
+    "1. Select appropriate models aligned with specific needs\n",
+    "2. Identify targeted optimization opportunities\n",
+    "3. Scale efficiently while controlling costs\n",
+    "4. Develop realistic resource allocation strategies\n",
+    "\n",
+    "The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quantization\n",
+    "\n",
+    "Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model's parameters. The most common form of quantization is to represent model's weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.\n",
+    "\n",
+    "While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's take a look at model weights of a language model (`SmolLM2-135M-Instruct`) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function `load_gguf` from the `taming_utils` package to load model weights of the quantized models directly from Hugging Face."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from taming_utils import load_gguf\n",
+    "\n",
+    "MODEL_NAME = \"bartowski/SmolLM2-135M-Instruct-GGUF\"\n",
+    "GGUF_FILE_Q2_K = \"SmolLM2-135M-Instruct-Q2_K.gguf\"\n",
+    "GGUF_FILE_F16 = \"SmolLM2-135M-Instruct-F16.gguf\"\n",
+    "\n",
+    "model_q2_k = load_gguf(model_name=MODEL_NAME, \n",
+    "              gguf_file=GGUF_FILE_Q2_K)\n",
+    "\n",
+    "model_f16 = load_gguf(model_name=MODEL_NAME, \n",
+    "              gguf_file=GGUF_FILE_F16)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We extract the MLP weights from the first layer of the model as a proxy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight\n",
+    "mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Original weights at 16-bit precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[-0.0145,  0.1826,  0.1377,  ...,  0.1719, -0.1387, -0.0298],\n",
+       "        [-0.1631,  0.0781, -0.2051,  ..., -0.2070, -0.0334,  0.2891],\n",
+       "        [-0.1768, -0.0488, -0.2393,  ..., -0.0396, -0.1348, -0.1533],\n",
+       "        ...,\n",
+       "        [ 0.0771,  0.0845, -0.0232,  ...,  0.0178, -0.1040, -0.0771],\n",
+       "        [ 0.1582,  0.1167, -0.0474,  ...,  0.0845,  0.0359, -0.2500],\n",
+       "        [ 0.0432,  0.0972,  0.0933,  ...,  0.2188,  0.0776,  0.0674]],\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlp_weights_f16"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quantized weights at 2-bit precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[-0.0028,  0.1852,  0.1396,  ...,  0.1506, -0.1635, -0.0043],\n",
+       "        [-0.1768,  0.0680, -0.2257,  ..., -0.1890, -0.0464,  0.2960],\n",
+       "        [-0.1840, -0.0451, -0.2395,  ..., -0.0413, -0.1446, -0.1446],\n",
+       "        ...,\n",
+       "        [ 0.0621,  0.0621, -0.0478,  ...,  0.0038, -0.0830, -0.0830],\n",
+       "        [ 0.1473,  0.0926, -0.0547,  ...,  0.0824,  0.0429, -0.2737],\n",
+       "        [ 0.0355,  0.0782,  0.0782,  ...,  0.2043,  0.0740,  0.0740]],\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlp_weights_q2_k"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pearson correlation: 0.9970\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# Convert tensors to numpy arrays (detach from computation graph if needed)\n",
+    "weights_f16 = mlp_weights_f16.detach().cpu().numpy()\n",
+    "weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()\n",
+    "\n",
+    "flat_f16 = weights_f16.flatten()\n",
+    "flat_q2_k = weights_q2_k.flatten()\n",
+    "\n",
+    "# Calculate correlation\n",
+    "correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]\n",
+    "print(f\"Pearson correlation: {correlation:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n",
+    "\n",
+    "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n",
+    "\n",
+    "```{figure} ../_static/cost/quantized.png\n",
+    "---\n",
+    "name: quantized\n",
+    "alt: Quantized Model Size\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Quantized Model Size: `unsloth/Llama-3.3-70B-Instruct-GGUF`\n",
+    "```\n",
+    "\n",
+    "We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [^quantization-levels].\n",
+    "\n",
+    "[^quantization-levels]: You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in {cite}`huggingface2024quantization`.\n",
+    "\n",
+    "This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.\n",
+    " \n",
+    "While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet {cite}`wang20241bitaiinfra11` which pushes the boundaries of extreme quantization.\n",
+    "\n",
+    "BitNet's implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see {numref}`bitnet`). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet's optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).\n",
+    "\n",
+    "```{figure} ../_static/cost/bitnet.png\n",
+    "---\n",
+    "name: bitnet\n",
+    "alt: BitNet\n",
+    "scale: 30%\n",
+    "align: center\n",
+    "---\n",
+    "BitNet: {cite}`wang20241bitaiinfra11`\n",
+    "```\n",
+    "\n",
+    "The framework's initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.\n",
+    "\n",
+    "As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.\n",
+    "\n",
+    "Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter {ref}`local` for more details.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Check-list\n",
+    "\n",
+    "**Planning and Requirements**\n",
+    "- [ ] Start with a clear understanding of your application's needs and the factors that contribute to LLM costs\n",
+    "- [ ] Choose the right model for your task, balancing performance and cost\n",
+    "- [ ] Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them\n",
+    "\n",
+    "**Model Optimization**\n",
+    "- [ ] Explore model compression and quantization to reduce model size and computational demands\n",
+    "- [ ] Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency\n",
+    "- [ ] Consider using RAG to enhance performance and reduce reliance on purely generative processes\n",
+    "\n",
+    "**Prompt Engineering**\n",
+    "- [ ] Optimize prompts and utilize prompt engineering techniques to minimize token usage\n",
+    "- [ ] Experiment with different prompting strategies to unlock the full potential of open-source LLMs\n",
+    "\n",
+    "**Infrastructure and Operations**\n",
+    "- [ ] Implement caching and batching strategies to optimize resource utilization\n",
+    "- [ ] Monitor LLM usage patterns and costs to identify areas for optimization\n",
+    "- [ ] Set up observability and logging to track model performance and costs\n",
+    "- [ ] Establish automated testing and evaluation pipelines\n",
+    "\n",
+    "**Cost Management**\n",
+    "- [ ] Track and analyze inference costs across different model variants\n",
+    "- [ ] Implement cost allocation and chargeback mechanisms\n",
+    "- [ ] Set up cost alerts and budgeting controls\n",
+    "- [ ] Regularly review and optimize resource utilization\n",
+    "\n",
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
+    "\n",
+    "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
+    "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n",
+    "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n",
+    "\n",
+    "```\n",
+    "@misc{tharsistpsouza2024tamingllms,\n",
+    "  author = {Tharsis T. P. Souza},\n",
+    "  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n",
+    "  year = {2024},\n",
+    "  chapter = {The Falling Cost Paradox},\n",
+    "  journal = {GitHub repository},\n",
+    "  url = {https://github.com/souzatharsis/tamingLLMs)\n",
+    "}\n",
+    "```\n",
+    "## References\n",
+    "```{bibliography}\n",
+    ":filter: docname in docnames\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb
index 7cdc5f1..59f7f13 100644
--- a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb
+++ b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb
@@ -4,6 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "(local)=\n",
     "# Local LLMs in Practice\n",
     "```{epigraph}\n",
     "Freedom is something that dies unless it's used.\n",
@@ -40,7 +41,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Models Considerations\n",
+    "(local-model-selection)=\n",
+    "## Choosing your Model\n",
     "\n",
     "The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness. \n",
     "\n",
@@ -1352,7 +1354,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Citation\n",
     "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
     "\n",
     "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
diff --git a/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb b/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb
index 090daa7..1d16cf6 100644
--- a/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb
+++ b/tamingllms/_build/jupyter_execute/notebooks/structured_output.ipynb
@@ -467,9 +467,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML\n",
+      "  warnings.warn(\"Can't initialize NVML\")\n"
+     ]
+    }
+   ],
    "source": [
     "MODEL_NAME = \"HuggingFaceTB/SmolLM2-1.7B-Instruct\"\n",
     "PROMPT = \"Is Enzo a good name for a baby?\"\n",
@@ -1384,7 +1393,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/tamingllms/_config.yml b/tamingllms/_config.yml
index a34b825..3c3a15b 100644
--- a/tamingllms/_config.yml
+++ b/tamingllms/_config.yml
@@ -5,26 +5,12 @@ title: "Taming Large Language Models: A Practical Guide to LLM Pitfalls with Pyt
 author: Tharsis T. P. Souza
 copyright: "Tharsis T. P. Souza, 2024"  # Copyright year to be placed in the footer
 project: "Taming LLMs" 
-#logo: /home/tobias/src/tamingLLMs/tamingllms/tamingllms/_static/logo.png
 
 # Force re-execution of notebooks on each build.
 # See https://jupyterbook.org/content/execute.html
 execute:
   execute_notebooks: 'off'
 
-#html:
-#  comments:
-#    hypothesis: true
-#  extra_navbar: |
-#      <div>
-#          <div class="nav-item"> <a href="https://github.com/souzatharsis/tamingllms" class="nav-link external"> GitHub <outboundlink/> </a> </div>
-#      </div>
-#  baseurl: https://souzatharsis.github.io/tamingllms/
-#  extra_footer: |
-#    <p>
-#    Apache 2.0 License
-#    </p>
-
 # Define the name of the latex output file for PDF builds
 latex:
   latex_documents:
diff --git a/tamingllms/_static/cost/bitnet.png b/tamingllms/_static/cost/bitnet.png
new file mode 100644
index 0000000..5d0d74e
Binary files /dev/null and b/tamingllms/_static/cost/bitnet.png differ
diff --git a/tamingllms/_static/cost/llmflation.png b/tamingllms/_static/cost/llmflation.png
new file mode 100644
index 0000000..5061149
Binary files /dev/null and b/tamingllms/_static/cost/llmflation.png differ
diff --git a/tamingllms/_static/cost/quantized.png b/tamingllms/_static/cost/quantized.png
new file mode 100644
index 0000000..2dc6d44
Binary files /dev/null and b/tamingllms/_static/cost/quantized.png differ
diff --git a/tamingllms/_static/cost/quantized.tsx b/tamingllms/_static/cost/quantized.tsx
new file mode 100644
index 0000000..aef322c
--- /dev/null
+++ b/tamingllms/_static/cost/quantized.tsx
@@ -0,0 +1,58 @@
+import React from 'react';
+import { LineChart, Line, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer } from 'recharts';
+
+const MemoryUsageChart = () => {
+  const data = [
+    { name: 'F16', value: 141.1 },
+    { name: 'Q8_0', value: 75.0 },
+    { name: 'Q6_K', value: 59.9 },
+    { name: 'Q5_K_M', value: 49.9 },
+    { name: 'Q4_K_M', value: 42.5 },
+    { name: 'Q3_K_M', value: 34.3 },
+    { name: 'Q2_K', value: 26.4 }
+  ];
+
+  return (
+    <div className="w-full h-96 p-4">
+      <ResponsiveContainer width="100%" height="100%">
+        <LineChart data={data} margin={{ top: 20, right: 30, left: 20, bottom: 5 }}>
+          <CartesianGrid strokeDasharray="3 3" />
+          <XAxis 
+            dataKey="name"
+            tick={{ fontSize: 12, fontWeight: 'bold' }}
+          />
+          <YAxis 
+            label={{ 
+              value: 'Model Size (GB)', 
+              angle: -90, 
+              position: 'insideLeft',
+              style: { 
+                textAnchor: 'middle',
+                fontWeight: 'bold'
+              }
+            }}
+            tick={{ fontSize: 12, fontWeight: 'bold' }}
+          />
+          <Tooltip 
+            formatter={(value) => [`${value} GB`, 'Model Size']}
+            contentStyle={{ 
+              backgroundColor: '#fff', 
+              border: '1px solid #ccc',
+              fontWeight: 'bold'
+            }}
+          />
+          <Line 
+            type="monotone"
+            dataKey="value" 
+            stroke="#3eaf7c"
+            strokeWidth={2}
+            dot={{ fill: '#3eaf7c', r: 4 }}
+            activeDot={{ r: 6 }}
+          />
+        </LineChart>
+      </ResponsiveContainer>
+    </div>
+  );
+};
+
+export default MemoryUsageChart;
\ No newline at end of file
diff --git a/tamingllms/_toc.yml b/tamingllms/_toc.yml
index de492f3..778cdac 100644
--- a/tamingllms/_toc.yml
+++ b/tamingllms/_toc.yml
@@ -13,6 +13,8 @@ chapters:
 - file: notebooks/safety.ipynb
 - file: notebooks/alignment.ipynb
 - file: notebooks/local.ipynb
+- file: notebooks/cost.ipynb
+# - file: genindex
 #- file: notebooks/output_size_limit.ipynb
 #- file: markdown
 #- file: notebooks
diff --git a/tamingllms/markdown/toc.md b/tamingllms/markdown/toc.md
index 83c6895..1578091 100644
--- a/tamingllms/markdown/toc.md
+++ b/tamingllms/markdown/toc.md
@@ -32,7 +32,7 @@ Abstract: *The current discourse around Large Language Models (LLMs) tends to fo
 
 ## [Chapter 6: Local LLMs in Practice](https://www.souzatharsis.com/tamingLLMs/notebooks/local.html)
 
-## Chapter 7: The Cost Factor
+## Chapter 7: The Falling Cost Paradox
 
 ## Chapter 8: Frontiers
 
diff --git a/tamingllms/notebooks/alignment.ipynb b/tamingllms/notebooks/alignment.ipynb
index 552ad7f..9eeeffa 100644
--- a/tamingllms/notebooks/alignment.ipynb
+++ b/tamingllms/notebooks/alignment.ipynb
@@ -2537,7 +2537,7 @@
    "source": [
     "## Discussion and Conclusions\n",
     "\n",
-    "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training alignment techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach does serve as a proof of concept, however, several considerations should be taken into account when using this methodology in practice.\n",
+    "LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.\n",
     "\n",
     "**Synthetic Data Generation**\n",
     "\n",
diff --git a/tamingllms/notebooks/cost.ipynb b/tamingllms/notebooks/cost.ipynb
new file mode 100644
index 0000000..0bb1d48
--- /dev/null
+++ b/tamingllms/notebooks/cost.ipynb
@@ -0,0 +1,450 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(cost)=\n",
+    "# The Falling Cost Paradox\n",
+    "```{epigraph}\n",
+    "It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption. <br>\n",
+    "The very contrary is the truth. \n",
+    "\n",
+    "-- William Stanley Jevons\n",
+    "```\n",
+    "```{contents}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Why Optimization Matters More Than Ever\n",
+    "\n",
+    "According to recent analysis from a16z {cite}`a16z2024llmflation`, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore's Law in the PC revolution or Edholm's Law during the bandwidth explosion of the dot-com era.\n",
+    "\n",
+    "```{figure} ../_static/cost/llmflation.png\n",
+    "---\n",
+    "name: llmflation\n",
+    "alt: LLMflation\n",
+    "scale: 30%\n",
+    "align: center\n",
+    "---\n",
+    "LLMflation {cite}`a16z2024llmflation`: The cost of LLM inference is decreasing by approximately 10x every year.\n",
+    "```\n",
+    "\n",
+    "A model achieving an MMLU score of 42 that cost \\$60 per million tokens in late 2021 can now be run for just \\$0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4's introduction in March 2023. \n",
+    "\n",
+    "This dramatic decline stems from multiple compounding factors including:\n",
+    "\n",
+    "- Improved GPU efficiency through architectural advances and Moore's Law\n",
+    "- Model quantization progress, moving from 16-bit to 4-bit or lower precision\n",
+    "- Software optimizations reducing compute and memory bandwidth requirements\n",
+    "- Emergence of smaller yet similarly capable models\n",
+    "- Better instruction tuning techniques like RLHF and DPO\n",
+    "- Competition from open-source models and low-cost providers\n",
+    "\n",
+    "This trend raises a critical question: If LLM costs are plummeting so rapidly, why should businesses and developers invest precious time and resources in optimizing their LLM usage? Wouldn't it make more sense to simply wait for the next wave of cost improvements rather than optimize today? In two words: **Jevons Paradox**. \n",
+    "\n",
+    "The Jevons Paradox was first observed by English economist William Stanley Jevons in 1865. Studying coal consumption during the Industrial Revolution, Jevons made a counterintuitive discovery: as steam engines became more efficient and coal use became more economical, total coal consumption increased rather than decreased driving the (Industrial Revolution) and the total spending up.\n",
+    "\n",
+    "This pattern has repeated throughout technological history:\n",
+    "\n",
+    "- Computing Power: As cost per computation plummeted, we didn't spend less on computing, instead we found new creative uses for computers, from smartphones to cloud servers\n",
+    "- Network Bandwidth: As data transmission got cheaper, we shifted from text messaging to HD video streaming and real-time gaming\n",
+    "- Data Storage: As cost per gigabyte fell, we moved from storing documents to hosting entire media libraries and training massive AI models\n",
+    "\n",
+    "One could argue that LLMs and Generative AI more broadly are following a similar trajectory. As costs decline, we're seeing the emergence of new applications:\n",
+    "- Embedding AI capabilities into every application and workflow\n",
+    "- Real-time analysis of audio transcripts and conversations\n",
+    "- Running AI models directly on edge devices and smartphones\n",
+    "- Multimodal applications combining text, images, audio and video \n",
+    "\n",
+    "In this environment of rapidly falling costs but potential for exponential growth in usage, optimizing LLM costs becomes more, not less, important. Here's why:\n",
+    "\n",
+    "**A) Scale Magnifies Everything**. When operating at billions of tokens per day, even small inefficiencies have major effects:\n",
+    "- A single digit improvement in efficiency can save millions of dollars annually at scale\n",
+    "- Every 100 milliseconds of latency is about 8% difference in engagement rates (30% on mobile) [^groklatency]\n",
+    "[^groklatency]: Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.\n",
+    "\n",
+    "**B) Tiered Pricing Persists**. While average costs are declining, the market maintains a tiered structure:\n",
+    "- Different models offer varying price-performance tradeoffs\n",
+    "- ChatGPT Pro at \\$200 per month breaks the price drop trend perhaps triggering a new wave of premium models\n",
+    "- Cost optimization is still required to select the right model for each specific use case\n",
+    "\n",
+    "**C) Competition Drives Innovation**. Companies that master LLM efficiency gain significant advantages:\n",
+    "- Ability to offer more competitive pricing\n",
+    "- Capacity to handle larger scale operations\n",
+    "- Resources to invest in product improvement\n",
+    "\n",
+    "**D) Performance and Cost Are Linked**. Cost optimization often yields performance benefits:\n",
+    "- Resource efficiency enables handling larger user loads\n",
+    "- More efficiency and reduced latency leads to improved user experience\n",
+    "\n",
+    "In this environment, companies that master efficient LLM usage while exploiting new capabilities opened up by falling costs will be best positioned to innovate and scale. This dual focus - efficiency and innovation - will likely characterize successful AI companies in the years ahead.\n",
+    "\n",
+    "Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Right-Sizing LLMs: A Strategic Approach\n",
+    "\n",
+    "Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.\n",
+    "\n",
+    "In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.\n",
+    "\n",
+    "\n",
+    "### Metrics"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Requirements\n",
+    "\n",
+    "#### Business Requirements\n",
+    "\n",
+    "First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.\n",
+    "\n",
+    "Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.\n",
+    "\n",
+    "Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.\n",
+    "\n",
+    "Future-proofing considerations help ensure the longevity and adaptability of LLM implementations. This requires careful planning for scale to accommodate future growth, along with the evaluation of multi-model strategies to reduce dependency on single solutions. Organizations should carefully assess vendor lock-in risks and explore open-source alternatives to maintain flexibility and control over their AI infrastructure.\n",
+    "\n",
+    "Chapter {ref}`local` provides a detailed discussion on relevant considerations when {ref}`local-model-selection`.\n",
+    "\n",
+    "#### Performance Requirements\n",
+    "\n",
+    "Accuracy and quality form the foundation of any LLM deployment's performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter {ref}`evals` provides a detailed discussion on how to evaluate the performance of LLM-based applications.\n",
+    "\n",
+    "Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations. \n",
+    "\n",
+    "\n",
+    "#### Operational Requirements\n",
+    "\n",
+    "Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.\n",
+    "\n",
+    "Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.\n",
+    "\n",
+    "#### Technical Requirements\n",
+    "\n",
+    "System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.\n",
+    "\n",
+    "Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.\n",
+    "\n",
+    "\n",
+    "This structured approach to requirements analysis enables organizations to:\n",
+    "1. Select appropriate models aligned with specific needs\n",
+    "2. Identify targeted optimization opportunities\n",
+    "3. Scale efficiently while controlling costs\n",
+    "4. Develop realistic resource allocation strategies\n",
+    "\n",
+    "The following sections explore specific optimization techniques, but their implementation should always be guided by these foundational requirements.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quantization\n",
+    "\n",
+    "Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model's parameters. The most common form of quantization is to represent model's weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.\n",
+    "\n",
+    "While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's take a look at model weights of a language model (`SmolLM2-135M-Instruct`) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function `load_gguf` from the `taming_utils` package to load model weights of the quantized models directly from Hugging Face."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from taming_utils import load_gguf\n",
+    "\n",
+    "MODEL_NAME = \"bartowski/SmolLM2-135M-Instruct-GGUF\"\n",
+    "GGUF_FILE_Q2_K = \"SmolLM2-135M-Instruct-Q2_K.gguf\"\n",
+    "GGUF_FILE_F16 = \"SmolLM2-135M-Instruct-F16.gguf\"\n",
+    "\n",
+    "model_q2_k = load_gguf(model_name=MODEL_NAME, \n",
+    "              gguf_file=GGUF_FILE_Q2_K)\n",
+    "\n",
+    "model_f16 = load_gguf(model_name=MODEL_NAME, \n",
+    "              gguf_file=GGUF_FILE_F16)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We extract the MLP weights from the first layer of the model as a proxy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mlp_weights_q2_k = model_q2_k.model.layers[0].mlp.gate_proj.weight\n",
+    "mlp_weights_f16 = model_f16.model.layers[0].mlp.gate_proj.weight"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Original weights at 16-bit precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[-0.0145,  0.1826,  0.1377,  ...,  0.1719, -0.1387, -0.0298],\n",
+       "        [-0.1631,  0.0781, -0.2051,  ..., -0.2070, -0.0334,  0.2891],\n",
+       "        [-0.1768, -0.0488, -0.2393,  ..., -0.0396, -0.1348, -0.1533],\n",
+       "        ...,\n",
+       "        [ 0.0771,  0.0845, -0.0232,  ...,  0.0178, -0.1040, -0.0771],\n",
+       "        [ 0.1582,  0.1167, -0.0474,  ...,  0.0845,  0.0359, -0.2500],\n",
+       "        [ 0.0432,  0.0972,  0.0933,  ...,  0.2188,  0.0776,  0.0674]],\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlp_weights_f16"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quantized weights at 2-bit precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[-0.0028,  0.1852,  0.1396,  ...,  0.1506, -0.1635, -0.0043],\n",
+       "        [-0.1768,  0.0680, -0.2257,  ..., -0.1890, -0.0464,  0.2960],\n",
+       "        [-0.1840, -0.0451, -0.2395,  ..., -0.0413, -0.1446, -0.1446],\n",
+       "        ...,\n",
+       "        [ 0.0621,  0.0621, -0.0478,  ...,  0.0038, -0.0830, -0.0830],\n",
+       "        [ 0.1473,  0.0926, -0.0547,  ...,  0.0824,  0.0429, -0.2737],\n",
+       "        [ 0.0355,  0.0782,  0.0782,  ...,  0.2043,  0.0740,  0.0740]],\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mlp_weights_q2_k"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How do they compare? We arrive at a Pearson correlation of 99.7% between the two sets of weights."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pearson correlation: 0.9970\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# Convert tensors to numpy arrays (detach from computation graph if needed)\n",
+    "weights_f16 = mlp_weights_f16.detach().cpu().numpy()\n",
+    "weights_q2_k = mlp_weights_q2_k.detach().cpu().numpy()\n",
+    "\n",
+    "flat_f16 = weights_f16.flatten()\n",
+    "flat_q2_k = weights_q2_k.flatten()\n",
+    "\n",
+    "# Calculate correlation\n",
+    "correlation = np.corrcoef(flat_f16, flat_q2_k)[0,1]\n",
+    "print(f\"Pearson correlation: {correlation:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n",
+    "\n",
+    "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n",
+    "\n",
+    "```{figure} ../_static/cost/quantized.png\n",
+    "---\n",
+    "name: quantized\n",
+    "alt: Quantized Model Size\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Quantized Model Size: `unsloth/Llama-3.3-70B-Instruct-GGUF`\n",
+    "```\n",
+    "\n",
+    "We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [^quantization-levels].\n",
+    "\n",
+    "[^quantization-levels]: You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in {cite}`huggingface2024quantization`.\n",
+    "\n",
+    "This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.\n",
+    " \n",
+    "While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet {cite}`wang20241bitaiinfra11` which pushes the boundaries of extreme quantization.\n",
+    "\n",
+    "BitNet's implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see {numref}`bitnet`). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet's optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).\n",
+    "\n",
+    "```{figure} ../_static/cost/bitnet.png\n",
+    "---\n",
+    "name: bitnet\n",
+    "alt: BitNet\n",
+    "scale: 30%\n",
+    "align: center\n",
+    "---\n",
+    "BitNet: {cite}`wang20241bitaiinfra11`\n",
+    "```\n",
+    "\n",
+    "The framework's initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.\n",
+    "\n",
+    "As a relatively recent innovation, 1-bit LLMs represent an exciting frontier in model compression. However, their full potential and limitations require additional research and real-world validation. The technology demonstrates how creative approaches to quantization can continue pushing the boundaries of efficient AI deployment."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Beyond its memory footprint reduction, quantization delivers several compelling advantages: it accelerates computation through faster arithmetic operations and larger batch sizes, reduces costs by enabling deployment on less expensive hardware and making LLMs more accessible to smaller organizations, and improves energy efficiency by lowering memory bandwidth usage and power consumption - particularly beneficial for mobile and edge devices, ultimately contributing to more sustainable AI deployment.\n",
+    "\n",
+    "Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter {ref}`local` for more details.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Check-list\n",
+    "\n",
+    "**Planning and Requirements**\n",
+    "- [ ] Start with a clear understanding of your application's needs and the factors that contribute to LLM costs\n",
+    "- [ ] Choose the right model for your task, balancing performance and cost\n",
+    "- [ ] Be aware of the potential challenges and limitations of open-source LLMs and take appropriate measures to mitigate them\n",
+    "\n",
+    "**Model Optimization**\n",
+    "- [ ] Explore model compression and quantization to reduce model size and computational demands\n",
+    "- [ ] Fine-tune pre-trained models on domain-specific data to improve accuracy and efficiency\n",
+    "- [ ] Consider using RAG to enhance performance and reduce reliance on purely generative processes\n",
+    "\n",
+    "**Prompt Engineering**\n",
+    "- [ ] Optimize prompts and utilize prompt engineering techniques to minimize token usage\n",
+    "- [ ] Experiment with different prompting strategies to unlock the full potential of open-source LLMs\n",
+    "\n",
+    "**Infrastructure and Operations**\n",
+    "- [ ] Implement caching and batching strategies to optimize resource utilization\n",
+    "- [ ] Monitor LLM usage patterns and costs to identify areas for optimization\n",
+    "- [ ] Set up observability and logging to track model performance and costs\n",
+    "- [ ] Establish automated testing and evaluation pipelines\n",
+    "\n",
+    "**Cost Management**\n",
+    "- [ ] Track and analyze inference costs across different model variants\n",
+    "- [ ] Implement cost allocation and chargeback mechanisms\n",
+    "- [ ] Set up cost alerts and budgeting controls\n",
+    "- [ ] Regularly review and optimize resource utilization\n",
+    "\n",
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
+    "\n",
+    "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
+    "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n",
+    "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n",
+    "\n",
+    "```\n",
+    "@misc{tharsistpsouza2024tamingllms,\n",
+    "  author = {Tharsis T. P. Souza},\n",
+    "  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n",
+    "  year = {2024},\n",
+    "  chapter = {The Falling Cost Paradox},\n",
+    "  journal = {GitHub repository},\n",
+    "  url = {https://github.com/souzatharsis/tamingLLMs)\n",
+    "}\n",
+    "```\n",
+    "## References\n",
+    "```{bibliography}\n",
+    ":filter: docname in docnames\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tamingllms/notebooks/local.ipynb b/tamingllms/notebooks/local.ipynb
index b451331..fde2739 100644
--- a/tamingllms/notebooks/local.ipynb
+++ b/tamingllms/notebooks/local.ipynb
@@ -4,6 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "(local)=\n",
     "# Local LLMs in Practice\n",
     "```{epigraph}\n",
     "Freedom is something that dies unless it's used.\n",
@@ -40,7 +41,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Models Considerations\n",
+    "(local-model-selection)=\n",
+    "## Choosing your Model\n",
     "\n",
     "The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness. \n",
     "\n",
@@ -1352,7 +1354,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Citation\n",
     "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
     "\n",
     "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
diff --git a/tamingllms/notebooks/structured_output.ipynb b/tamingllms/notebooks/structured_output.ipynb
index 4bc64db..64359b4 100644
--- a/tamingllms/notebooks/structured_output.ipynb
+++ b/tamingllms/notebooks/structured_output.ipynb
@@ -467,9 +467,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tobias/src/tamingLLMs/tamingllms/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:654: UserWarning: Can't initialize NVML\n",
+      "  warnings.warn(\"Can't initialize NVML\")\n"
+     ]
+    }
+   ],
    "source": [
     "MODEL_NAME = \"HuggingFaceTB/SmolLM2-1.7B-Instruct\"\n",
     "PROMPT = \"Is Enzo a good name for a baby?\"\n",
@@ -1384,7 +1393,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/tamingllms/notebooks/structured_output_original.ipynb b/tamingllms/notebooks/structured_output_original.ipynb
deleted file mode 100644
index 79ee4c3..0000000
--- a/tamingllms/notebooks/structured_output_original.ipynb
+++ /dev/null
@@ -1,1167 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "(structure)=\n",
-    "# Wrestling with Structured Output\n",
-    "```{epigraph}\n",
-    "In limits, there is freedom. Creativity thrives within structure.\n",
-    "\n",
-    "-- Julia B. Cameron\n",
-    "```\n",
-    "```{contents}\n",
-    "```\n",
-    "\n",
-    "## Introduction\n",
-    "\n",
-    "Large language models (LLMs) excel at generating human-like text, but they often struggle to produce output in a structured format consistently. This poses a significant challenge when we need LLMs to generate data that can be easily processed by other systems, such as databases, APIs, or other software applications.   Sometimes, even with a well-crafted prompt, an LLM might produce an unstructured response when a structured one is expected. This can be particularly challenging when integrating LLMs into systems that require specific data formats."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As a motivating example, consider the following simple task: Given a segment of a SEC financial filing, generate a two-person discussion about the key financial data from the text in JSON format, simulating what would be a real-world discussion about the underlying companies' disclosed financial information. We would like to generate a structured output that can be easily parsed and integrated with other systems."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Throughout this notebook, we will consider as input a segment of a sample SEC filing of Apple Inc."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "MAX_LENGTH = 10000 # We limit the input length to avoid token issues\n",
-    "with open('../data/apple.txt', 'r') as file:\n",
-    "    sec_filing = file.read()\n",
-    "sec_filing = sec_filing[:MAX_LENGTH] "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from dotenv import load_dotenv\n",
-    "import os\n",
-    "\n",
-    "# Load environment variables from .env file\n",
-    "load_dotenv(override=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from openai import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "client = OpenAI()\n",
-    "# Define the prompt expecting a structured JSON response\n",
-    "prompt = f\"\"\"\n",
-    "Generate a two-person discussion about the key financial data from the following text in JSON format.\n",
-    "TEXT: {sec_filing}\n",
-    "\"\"\"\n",
-    "\n",
-    "response = client.chat.completions.create(\n",
-    "    model=\"gpt-3.5-turbo\",\n",
-    "    messages=[{\"role\": \"user\", \"content\": prompt}]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Person 1: Wow, Apple Inc. seems to have a lot of different products and services they offer. It's interesting to see the breakdown of their revenue streams in their Form 10-K.\n",
-      "\n",
-      "Person 2: Absolutely, they have a diverse portfolio with iPhones, Macs, iPads, wearables, and even services. It's impressive to see how they have capitalized on different technology trends.\n",
-      "\n",
-      "Person 1: I noticed that they have a large market value of over $2.6 trillion as of March 29, 2024. That's a huge amount, and it shows the confidence investors have in the company.\n",
-      "\n",
-      "Person 2: Definitely, that's a significant figure. It's also good to see that they are complying with all the required SEC regulations and filing their reports in a timely manner.\n",
-      "\n",
-      "Person 1: Yes, it's crucial for investors to have access to accurate and up-to-date financial information. It helps in making informed decisions about their investments in the company.\n",
-      "\n",
-      "Person 2: Absolutely, transparency and compliance with regulations are key in the financial industry. It's good to see that Apple Inc. is taking those aspects seriously.\n"
-     ]
-    }
-   ],
-   "source": [
-    "response_content = response.choices[0].message.content\n",
-    "print(response_content)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "\n",
-    "def is_json(myjson):\n",
-    "  try:\n",
-    "    json.loads(myjson)\n",
-    "  except ValueError as e:\n",
-    "    return False\n",
-    "  return True"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "False\n"
-     ]
-    }
-   ],
-   "source": [
-    "is_json(response_content)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this example, despite the prompt clearly asking for a JSON object, the LLM generates an unstructured natural language sentence instead. This simple example highlights the inconsistency and unpredictability of LLMs when it comes to producing structured output."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Problem Statement\n",
-    "\n",
-    "Obtaining structured output from LLMs presents several significant challenges:\n",
-    "\n",
-    "* **Inconsistency**: LLMs often produce unpredictable results, sometimes generating well-structured output and other times deviating from the expected format.\n",
-    "\n",
-    "* **Lack of Type Safety**: LLMs do not inherently understand data types, which can lead to errors when their output is integrated with systems requiring specific data formats.\n",
-    "\n",
-    "* **Prompt Engineering Complexity**: Crafting prompts that effectively guide LLMs to produce the correct structured output is complex and requires extensive experimentation.\n",
-    "\n",
-    "## User Needs\n",
-    "\n",
-    "What user needs drive the demand for LLM output constraints when building LLM-based applications? In a recent work by Google Research {cite}`10.1145/3613905.3650756`, the authors explore the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. These needs can be broadly categorized as follows:\n",
-    "\n",
-    "**1. Improving Developer Efficiency and Workflow**\n",
-    "\n",
-    "*   **Reducing Trial and Error in Prompt Engineering**: Developers find the process of crafting prompts to elicit desired output formats to be time-consuming, often involving extensive testing and iteration. LLM output constraints could make this process more efficient and predictable.\n",
-    "*   **Minimizing Post-processing of LLM Outputs**: Developers frequently have to write complex code to wrangle and process LLM outputs that don't conform to expected formats. LLM structured output would simplify this, reducing the need for ad-hoc post-processing code.\n",
-    "*   **Streamlining Integration with Downstream Processes**: LLMs are often used within larger pipelines where their output serves as input for subsequent modules. Output constraints are crucial to ensure compatibility and prevent errors.\n",
-    "*   **Enhancing the Quality of Synthetic Datasets**: LLMs are increasingly used to generate synthetic data for AI training. Constraints can ensure data integrity and prevent the inclusion of unwanted elements that could negatively impact training outcomes.\n",
-    "\n",
-    "**2. Meeting UI and Product Requirements**\n",
-    "\n",
-    "*   **Adhering to UI Size Limitations**: LLM-generated content often needs to fit into specific UI elements with size restrictions, especially on mobile devices. Output length constraints prevent content overflow and ensure proper display within the UI.\n",
-    "*   **Ensuring Output Consistency**: Consistent output length and format are crucial for user experience and UI clarity. Constraints help maintain this consistency, avoiding overwhelming variability in generated text.\n",
-    "*   **Complying with Platform Character Limits**: Certain platforms, such as Twitter or YouTube Shorts, impose character limits on content. Length constraints allow LLMs to comply with these restrictions, ensuring content can be published successfully.\n",
-    "\n",
-    "**3. Enhancing User Trust and Experience**\n",
-    "\n",
-    "*   **Mitigating Hallucinations**: Users expect LLM-powered tools to be reliable and truthful. Constraining LLM outputs to a set of possible outcomes can help mitigate hallucinations, ensuring the output is valid.\n",
-    "*   **Driving User Adoption**: Users are more likely to engage with LLM-powered tools that provide reliable and consistent experiences. By ensuring output accuracy, consistency, and safety through constraints, developers can enhance user satisfaction and drive adoption.\n",
-    "\n",
-    "It is important to emphasize that the ability to constrain LLM output is not just a technical consideration but a fundamental user need, impacting developer efficiency, user experience, and the overall success of LLM-powered applications. \n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Solutions\n",
-    "\n",
-    "Several strategies and tools can be employed to address the challenges of structured output from LLMs.\n",
-    "\n",
-    "### Strategies\n",
-    "\n",
-    "* **Schema Guidance**: Providing the LLM with a clear schema or blueprint of the desired output structure helps to constrain its generation and improve consistency. This can be achieved by using tools like Pydantic to define the expected data structure and then using that definition to guide the LLM's output. \n",
-    "\n",
-    "* **Output Parsing**: When LLMs don't natively support structured output, parsing their text output using techniques like regular expressions or dedicated parsing libraries can extract the desired information. For example, you can use regular expressions to extract specific patterns from the LLM's output, or you can use libraries like Pydantic to parse the output into structured data objects.\n",
-    "\n",
-    "* **Type Enforcement**: Using tools that enforce data types, such as Pydantic in Python, can ensure that the LLM output adheres to the required data formats. This can help to prevent errors when integrating the LLM's output with other systems."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Techniques and Tools"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### One-Shot Prompts\n",
-    "\n",
-    "In one-shot prompting, you provide a single example of the desired output format within the prompt."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = f\"\"\"\n",
-    "Generate a two-person discussion about the key financial data from the following text in JSON format.\n",
-    "\n",
-    "<JSON_FORMAT>\n",
-    "{\n",
-    "   \"Person1\": {\n",
-    "     \"name\": \"Alice\",\n",
-    "     \"statement\": \"The revenue for Q1 has increased by 20% compared to last year.\"\n",
-    "   },\n",
-    "   \"Person2\": {\n",
-    "     \"name\": \"Bob\",\n",
-    "     \"statement\": \"That's great news! What about the net profit margin?\"\n",
-    "   }\n",
-    "}\n",
-    "</JSON_FORMAT>\n",
-    "\n",
-    "TEXT: {sec_filing}\n",
-    "\"\"\"\n",
-    "\n",
-    "response = client.chat.completions.create(\n",
-    "    model=\"gpt-3.5-turbo\",\n",
-    "    messages=[{\"role\": \"user\", \"content\": prompt}]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\n",
-      "   \"Person1\": {\n",
-      "     \"name\": \"Alice\",\n",
-      "     \"statement\": \"The revenue for Q1 has increased by 20% compared to last year.\"\n",
-      "   },\n",
-      "   \"Person2\": {\n",
-      "     \"name\": \"Bob\",\n",
-      "     \"statement\": \"That's great news! What about the net profit margin?\"\n",
-      "   }\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "response_content = response.choices[0].message.content\n",
-    "print(response_content)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "is_json(response_content)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Structured Output with Provider-Specific APIs\n",
-    "\n",
-    "One-shot prompting is a simple technique that can lead to material improvements in structured output, though may not be sufficient for complex (e.g. nested) structures and / or when the model's output needs to be restricted to a specific set of options or types.\n",
-    "\n",
-    "Provider-specific APIs can offer ways to handle those challenges. We will explore two approaches here using OpenAI's API:\n",
-    "\n",
-    "* **JSON Mode**: Most LLM APIs today offer features specifically designed for generating JSON output.\n",
-    "* **Structured Outputs**: Some LLM APIs offer features specifically designed for generating structured outputs with type safety.\n",
-    "\n",
-    "#### JSON Mode\n",
-    "\n",
-    "JSON mode is a feature provided by most LLM API providers, such as OpenAI, that allows the model to generate output in JSON format. This is particularly useful when you need structured data as a result, such as when parsing the output programmatically or integrating it with other systems that require JSON input. As depicted in {numref}`json-mode`, JSON mode is implemented by instructing theLLM model to use JSON as response format and optionally defining a target schema.\n",
-    "\n",
-    "\n",
-    "```{figure} ../_static/structured_output/json.png\n",
-    "---\n",
-    "name: json-mode\n",
-    "alt: JSON Mode\n",
-    "scale: 50%\n",
-    "align: center\n",
-    "---\n",
-    "Conceptual overview of JSON mode.\n",
-    "```\n",
-    "\n",
-    "When using JSON mode with OpenAI's API, it is recommended to instruct the model to produce JSON via some message in the conversation, for example via your system message. If you don't include an explicit instruction to generate JSON, the model may generate an unending stream of whitespace and the request may run continually until it reaches the token limit. To help ensure you don't forget, the API will throw an error if the string \"JSON\" does not appear somewhere in the context.\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = f\"\"\"\n",
-    "Generate a two-person discussion about the key financial data from the following text in JSON format.\n",
-    "TEXT: {sec_filing}\n",
-    "\"\"\"\n",
-    "response = client.chat.completions.create(\n",
-    "    model=\"gpt-3.5-turbo\",\n",
-    "    messages=[{\"role\": \"user\", \"content\": prompt}],\n",
-    "response_format = { \"type\": \"json_object\" }\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{\n",
-      "  \"person1\": \"I see that Apple Inc. reported a total market value of approximately $2,628,553,000,000 held by non-affiliates as of March 29, 2024. That's a significant amount!\",\n",
-      "  \"person2\": \"Yes, it definitely shows the scale and value of the company in the market. It's impressive to see the sheer size of the market value.\",\n",
-      "  \"person1\": \"Also, they mentioned having 15,115,823,000 shares of common stock issued and outstanding as of October 18, 2024. That's a large number of shares circulating in the market.\",\n",
-      "  \"person2\": \"Absolutely, the number of shares outstanding plays a crucial role in determining the company's market capitalization and investor interest.\"\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "response_content = response.choices[0].message.content\n",
-    "print(response_content)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 50,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "is_json(response_content)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This example solution is specific to OpenAI's API. Other LLM providers offer similar functionality, for example:\n",
-    "\n",
-    "* Google's Vertex AI offers a `parse` method for structured outputs.\n",
-    "* Anthropic offers a `structured` method for structured outputs."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors. For that purpose, we can leverage a new feature recently released by OpenAI called \"Structured Outputs\" to ensure the output data matches a target schema with type safety.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Structured Output Mode**\n",
-    "\n",
-    "Structured Outputs is a feature that ensures the model will always generate responses that adhere to your supplied JSON Schema, so you don't need to worry about the model omitting a required key, or hallucinating an invalid enum value.\n",
-    "\n",
-    "Some benefits of Structured Outputs include:\n",
-    "- **Reliable type-safety**: No need to validate or retry incorrectly formatted responses.\n",
-    "- **Explicit refusals**: Safety-based model refusals are now programmatically detectable.\n",
-    "- **Simpler prompting**: No need for strongly worded prompts to achieve consistent formatting.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here's a Python example demonstrating how to use the OpenAI API to generate a structured output. In this example, we aim at extracting structured data from our sample SEC filing, in particular: (i) entities and (ii) places mentioned in the input doc. This example uses the `response_format` parameter within the OpenAI API call. This functionality is supported by GPT-4o models, specifically `gpt-4o-mini-2024-07-18`, `gpt-4o-2024-08-06`, and later versions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pydantic import BaseModel\n",
-    "from openai import OpenAI\n",
-    "\n",
-    "class SECExtraction(BaseModel):\n",
-    "    mentioned_entities: list[str]\n",
-    "    mentioned_places: list[str]\n",
-    "\n",
-    "def extract_from_sec_filing(sec_filing_text: str, prompt: str) -> SECExtraction:\n",
-    "    \"\"\"\n",
-    "    Extracts structured data from an input SEC filing text.\n",
-    "    \"\"\"\n",
-    "    client = OpenAI()\n",
-    "    completion = client.beta.chat.completions.parse(\n",
-    "        model=\"gpt-4o-mini\",\n",
-    "        messages=[\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": prompt\n",
-    "            },\n",
-    "            {\"role\": \"user\", \"content\": sec_filing_text}\n",
-    "        ],\n",
-    "        response_format=SECExtraction\n",
-    "    )\n",
-    "    return completion.choices[0].message.parsed"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Explanation:**\n",
-    "\n",
-    "*   **Data Structures:** The code defines one Pydantic model, `SECExtraction`, to represent the structured output of our parser. This model provide type hints and structure for the response.\n",
-    "*   **API Interaction:** The `extract_from_sec_filing` function uses the OpenAI client to send a chat completion request to the `gpt-4o-mini-2024-07-18` model. The prompt instructs the model to extract our target attributes from input text. The `response_format` is set to `SECExtraction`, ensuring the response conforms to the specified Pydantic model.\n",
-    "*   **Output Processing:** The returned response is parsed into the `SECExtraction` model. The code then returns the parsed data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 69,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt_extraction = \"You are an expert at structured data extraction. You will be given unstructured text from a SEC filing and extracted names of mentioned entities and places and should convert the response into the given structure.\"\n",
-    "sec_extraction = extract_from_sec_filing(sec_filing, prompt_extraction)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 70,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracted entities: ['Apple Inc.', 'The Nasdaq Stock Market LLC']\n",
-      "Extracted places: ['Washington, D.C.', 'California', 'Cupertino, California']\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Extracted entities:\", sec_extraction.mentioned_entities)\n",
-    "print(\"Extracted places:\", sec_extraction.mentioned_places)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We observe that the model was able to extract the entities and places from the input text, and return them in the specified format."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Benefits**\n",
-    "\n",
-    "*   **Structured Output:** The use of Pydantic models and the `response_format` parameter enforces the structure of the model's output, making it more reliable and easier to process.\n",
-    "\n",
-    "*   **Schema Adherence:**  Structured Outputs in OpenAI API guarantee that the response adheres to the provided schema.\n",
-    "\n",
-    "This structured approach improves the reliability and usability of your application by ensuring consistent, predictable output from the OpenAI API.\n",
-    "\n",
-    "This example solution is specific to OpenAI's API. That begs the question: How can we solve this problem generally for widely available LLM providers? In the next sections, we will explore how `LangChain` and `Outlines` may serve as general purpose tools that can help us do just that.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### LangChain\n",
-    "\n",
-    "LangChain is a framework designed to simplify the development of LLM applications. It provider an abstraction layer over many LLM providers, including OpenAI, that offers several tools for parsing structured output.\n",
-    "\n",
-    "In particular, LangChain offers the `with_structured_output` method, which can be used with LLMs that support structured output APIs, allowing you to enforce a schema directly within the prompt.\n",
-    "\n",
-    "> `with_structured_output` takes a schema as input which specifies the names, types, and descriptions of the desired output attributes. The method returns a model-like Runnable, except that instead of outputting strings or messages it outputs objects corresponding to the given schema. The schema can be specified as a TypedDict class, JSON Schema or a Pydantic class. If TypedDict or JSON Schema are used then a dictionary will be returned by the Runnable, and if a Pydantic class is used then a Pydantic object will be returned.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```bash\n",
-    "pip install -qU langchain-openai\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 110,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain_openai import ChatOpenAI\n",
-    "from langchain_core.prompts import ChatPromptTemplate\n",
-    "def extract_from_sec_filing_langchain(sec_filing_text: str, prompt: str) -> SECExtraction:\n",
-    "    \"\"\"\n",
-    "    Extracts structured data from an input SEC filing text using LangChain.\n",
-    "    \"\"\"\n",
-    "    llm = ChatOpenAI(model=\"gpt-4o-mini\")\n",
-    "\n",
-    "    structured_llm = llm.with_structured_output(SECExtraction)\n",
-    "\n",
-    "    prompt_template = ChatPromptTemplate.from_messages(\n",
-    "        [\n",
-    "            (\"system\", prompt),\n",
-    "            (\"human\", \"{sec_filing_text}\"),\n",
-    "        ]\n",
-    "    )\n",
-    "\n",
-    "    llm_chain = prompt_template | structured_llm\n",
-    "    \n",
-    "    return llm_chain.invoke(sec_filing_text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 111,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "prompt_extraction = \"You are an expert at structured data extraction. You will be given unstructured text from a SEC filing and extracted names of mentioned entities and places and should convert the response into the given structure.\"\n",
-    "sec_extraction_langchain = extract_from_sec_filing_langchain(sec_filing, prompt_extraction)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 112,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracted entities: ['Apple Inc.']\n",
-      "Extracted places: ['California', 'Cupertino']\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Extracted entities:\", sec_extraction_langchain.mentioned_entities)\n",
-    "print(\"Extracted places:\", sec_extraction_langchain.mentioned_places)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. A full list of models that support `.with_structured_output()` can be found [here](https://python.langchain.com/docs/integrations/chat/#featured-providers)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Outlines\n",
-    "\n",
-    "Outlines {cite}`outlines2024` is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model's output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options. \n",
-    "\n",
-    "The authors solve the general guided generation problem {cite}`willard2023efficientguidedgenerationlarge`, which as a consequence solves the problem of structured output generation, in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).\n",
-    "\n",
-    "They define the next token generation as a random variable:\n",
-    "\n",
-    "$$s_{t+1} \\sim \\text{Categorical}(\\alpha) \\text{ where } \\alpha = \\text{LLM}(S_t, \\theta)$$\n",
-    "\n",
-    "Where:\n",
-    "\n",
-    "- $s_{t+1}$ is the next token to be generated\n",
-    "- $S_t = (s_1...s_t)$ represents a sequence of t tokens with $s_t \\in V$\n",
-    "- $V$ is the vocabulary with size $|V| = N$ (typically around $10^4$ or larger)\n",
-    "- $\\alpha \\in \\mathbb{R}^N$ is the output logits/probabilities over the vocabulary\n",
-    "- $\\theta$ is the set of trained parameters of the LLM\n",
-    "- $\\text{LLM}$ refers to a deep neural network trained on next-token-completion tasks\n",
-    "- $\\text{Categorical}(\\alpha)$ represents sampling from a categorical distribution with probabilities $\\alpha$\n",
-    "\n",
-    "When applying masking for guided generation, this becomes:\n",
-    "\n",
-    "$$\n",
-    "\\tilde{\\alpha} = m(S_t) \\odot \\alpha\n",
-    "$$\n",
-    "\n",
-    "$$\n",
-    "\\tilde{s}_{t+1} \\sim \\text{Categorical}(\\tilde{\\alpha})\n",
-    "$$\n",
-    "\n",
-    "Where:\n",
-    "\n",
-    "- $m: P(V) \\rightarrow {0,1}^N$ is a boolean mask function\n",
-    "- $\\odot$ represents element-wise multiplication\n",
-    "- $\\tilde{\\alpha}$ is the masked (constrained) probability distribution\n",
-    "- $\\tilde{s}_{t+1}$ is the next token sampled under constraints\n",
-    "\n",
-    "This formulation allows the masking operation to guide the generation process by zeroing out probabilities of invalid tokens according to the finite state machine states. But instead of checking the entire vocabulary (size N) at each generation step (O(N) complexity) to enforce output constraints, they convert constraints (regex/grammar) into FSM states and build an index mapping FSM states to valid vocabulary tokens. This achieves O(1) average complexity for token generation.\n",
-    "\n",
-    "In summary, there are two stages in the Outlines framework {cite}`vivien2024regex`:\n",
-    "\n",
-    "1. **Preprocessing Step**: Outlines converts a character-level deterministic finite automaton (DFA) testing whether a string matches a regex into a token-level DFA testing whether a token sequence is decoded in a string matching the regex.\n",
-    "\n",
-    "2. **Decoding Step**: At decoding time, the DFA is used to determine, for each new token, which potential tokens are allowed. Starting from the initial state of the DFA, the allowed tokens are determined by the outgoing transitions from the current state. The corresponding mask is applied to the next token probabilities and these probabilities are renormalized. A new token can then be sampled and the state of the DFA updated.\n",
-    "\n",
-    "At each step, the model's probability distribution is masked and renormalized according to the current state and valid transitions."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As an example, let's suppose we want to constrain the output of an LLM to the following set of options: \n",
-    "- Y/yes\n",
-    "- N/no\n",
-    "- N/never\n",
-    "- A/always\n",
-    "\n",
-    "\n",
-    "This can be done by creating a state machine that has a start state, an end state and a set of valid transitions between states with possible states represented as the following regex string: `r\"\\s*([Yy]es|[Nn]o|[Nn]ever|[Aa]lways)\"`.\n",
-    "\n",
-    "The state machine below illustrates how Outlines works under the hood {numref}`outlines_state_machine`, where:\n",
-    "- Prop: Represents the logit token probability given by the LLM\n",
-    "- Mask: Mask value of the transition as defined by the state machine\n",
-    "- Final: The renormalized token probability post-masking\n",
-    "\n",
-    "```{figure} ../_static/structured_output/outlines_state_machine.png\n",
-    "---\n",
-    "name: outlines_state_machine\n",
-    "alt: Outlines State Machine\n",
-    "scale: 50%\n",
-    "align: center\n",
-    "---\n",
-    "Outlines State Machine.\n",
-    "```\n",
-    "\n",
-    "The initial \"Start\" state contains a masking table that controls which tokens can begin the sequence. In this example, only characters from the set `[YyNnAa]` are allowed as valid first characters, with each having an assigned probability and mask value. The masking mechanism effectively filters out invalid tokens by setting their mask values to 0, ensuring only permitted transitions to the \"First\" state.\n",
-    "\n",
-    "After transitioning to the \"First\" state, the system continues to use probability masking to guide the sequence. For example, when receiving 'Y' as input, the masking table adjusts token probabilities to ensure valid continuations.\n",
-    "\n",
-    "This finite state machine architecture serves multiple purposes in controlling text generation:\n",
-    "\n",
-    "1. Managing token probabilities through strategic masking\n",
-    "2. Preventing invalid token sequences \n",
-    "3. Enforcing specific token patterns\n",
-    "4. Providing fine-grained control over token generation and validation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This provides fine-grained control over the model's generation process. In that way, Outlines, the Python package, provides several powerful controlled generation features:\n",
-    "\n",
-    "* **Regex-based structured generation**: Guide the generation process using regular expressions.\n",
-    "* **Multiple Choice Generation**: Restrict the LLM output to a predefined set of options.\n",
-    "* **Pydantic model**: Ensure the LLM output follows a Pydantic model.\n",
-    "* **JSON Schema**: Ensure the LLM output follows a JSON Schema.\n",
-    "\n",
-    "Outlines can support major proprietary LLM APIs (e.g. OpenAI's via vLLM). However, one of its key advantages is the ability to ensure structured output for Open Source models, which often lack such guarantees by default."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```bash\n",
-    "pip install outlines\n",
-    "pip install transformers\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this example, we will use a `Qwen2.5-0.5B` model, a lightweight open source model from Alibaba Cloud known for its strong performance despite its small size."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import outlines\n",
-    "\n",
-    "model = outlines.models.transformers(\"Qwen/Qwen2.5-0.5B-Instruct\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Negative\n"
-     ]
-    }
-   ],
-   "source": [
-    "TOP = 100\n",
-    "prompt = f\"\"\"You are a sentiment-labelling assistant specialized in Financial Statements.\n",
-    "Is the following document positive or negative?\n",
-    "\n",
-    "Document: {sec_filing[:TOP]}\n",
-    "\"\"\"\n",
-    "\n",
-    "generator = outlines.generate.choice(model, [\"Positive\", \"Negative\"])\n",
-    "answer = generator(prompt)\n",
-    "print(answer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this simple example, we use Outlines' `choice` method to constrain the model output to a predefined set of options (\"Positive\" or \"Negative\"). This ensures the model can only return one of these values, avoiding any unexpected or malformed responses.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Outlines allows to guide the generation process so the output is guaranteed to follow a JSON schema or Pydantic model. Now we will go back to our example of extracting entities and places from a SEC filing. In order to do so, we simply need to pass our Pydantic model to the `json` method in Outlines' `generate` module."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "BASE_PROMPT = \"You are an expert at structured data extraction. You will be given unstructured text from a SEC filing and extracted names of mentioned entities and places and should convert the response into the given structure.\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = f\"{BASE_PROMPT} Document: {sec_filing[:TOP]}\"\n",
-    "generator = outlines.generate.json(model, SECExtraction)\n",
-    "sec_extraction_outlines = generator(prompt)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracted entities: ['Zsp', 'ZiCorp']\n",
-      "Extracted places: ['California']\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Extracted entities:\", sec_extraction_outlines.mentioned_entities)\n",
-    "print(\"Extracted places:\", sec_extraction_outlines.mentioned_places)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. However, it is interesting to see that the model hallucinates a few entities, a phenomenon that is common for smaller Open Source models that were not fine-tuned on the task of entity extraction.\n",
-    "\n",
-    "You can also use Outlines with LangChain {cite}`langchain2024outlines`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Ollama\n",
-    "\n",
-    "Ollama is a popular tool that allows you to run large language models (LLMs) locally. It has recently added support for structured output generation. The current `ollama` implementation leverages llama.cpp GBNF (GGML BNF) grammars {cite}`llama_cpp_grammars` to enable structured output generation. \n",
-    "\n",
-    "llama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It's essentially an extension of BNF (Backus-Naur Form) {cite}`backus_naur_form` with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model's output strictly adheres to the desired format.\n",
-    "\n",
-    "Ollama first introduced structured output generation in version 0.5.1 providing support for JSON output but highlighting additional formats are coming soon.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's replicate our previous structured output generation example with Ollama. First, make sure you have Ollama installed. You can find installation instructions [here](https://ollama.com/docs/installation).\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```bash\n",
-    "curl -fsSL https://ollama.com/install.sh | sh\n",
-    "pip install ollama\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The code below demonstrates how to use Ollama's structured output capabilities with a Pydantic model as we did before with OpenAI, LangChain and Outlines. The SECExtraction model defines the expected structure with two fields: mentioned_entities and mentioned_places as lists of strings we expect the model to return given an input SEC filing. The `extract_entities_from_sec_filing` function uses Ollama's chat API to analyze SEC filings and extract entities in a structured format, with temperature set to 0 for deterministic results. We pass the Pydantic model's JSON schema to Ollama via the `format` parameter. We append a suffix to the prompt instructing the model to return the response as JSON (\"Return as JSON.\") as recommended by Ollama maintainers.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from ollama import chat\n",
-    "from pydantic import BaseModel\n",
-    "\n",
-    "class SECExtraction(BaseModel):\n",
-    "    mentioned_entities: list[str]\n",
-    "    mentioned_places: list[str]\n",
-    "\n",
-    "OLLAMA_STRUCTURED_OUTPUT_PROMPT_SUFFIX = \"Return as JSON.\"\n",
-    "OLLAMA_STRUCTURED_OUTPUT_TEMPERATURE = 0\n",
-    "\n",
-    "def extract_entities_from_sec_filing(doc: str, model: str) -> dict:\n",
-    "    \"\"\"\n",
-    "    Extract entities and places from an SEC filing using Ollama chat.\n",
-    "    \n",
-    "    Args:\n",
-    "        doc: The SEC filing text to analyze\n",
-    "        model: The Ollama model to use for extraction\n",
-    "        \n",
-    "    Returns:\n",
-    "        The raw response from the chat model\n",
-    "    \"\"\"\n",
-    "    response = chat(\n",
-    "        messages=[\n",
-    "            {\n",
-    "                'role': 'user',\n",
-    "                'content': f\"\"\"{BASE_PROMPT}\n",
-    "                {OLLAMA_STRUCTURED_OUTPUT_PROMPT_SUFFIX}\n",
-    "                \n",
-    "                Document: {doc}\"\"\"\n",
-    "            }\n",
-    "        ],\n",
-    "        model=model,  # You can also use other models like 'mistral' or 'llama2-uncensored'\n",
-    "        format=SECExtraction.model_json_schema(),\n",
-    "        options={'temperature': OLLAMA_STRUCTURED_OUTPUT_TEMPERATURE}  # Set to 0 for more deterministic output\n",
-    "    )\n",
-    "    return response\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We can now run the function and print the extracted entities and places. But first we need to start the Ollama server with our target LLM model (Qwen2.5-0.5B) running locally."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```bash\n",
-    "ollama run qwen2.5:0.5b\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "doc = sec_filing[:TOP]\n",
-    "model = \"qwen2.5:0.5b\"\n",
-    "\n",
-    "response = extract_entities_from_sec_filing(doc, model)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "response_json = json.loads(response.message.content)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracted entities: ['United States', 'SECURITIES AND EXCHANGE COMMISSION']\n",
-      "Extracted places: []\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Extracted entities:\", response_json.get('mentioned_entities'))\n",
-    "print(\"Extracted places:\", response_json.get('mentioned_places'))\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The extracted entities and places were quite different from those previously extracted using Outlines and Langchain, as expected since this depends mostly on the underlying model which is quite small. We do observe though that we have successfully obtained results in JSON format as specified, even with such a small underlying model (0.5B parameters).\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Discussion\n",
-    "\n",
-    "### Comparing Solutions\n",
-    "\n",
-    "\n",
-    "The choice of framework for structured LLM output depends heavily on specific constraints, requirements and use cases. LangChain is the most used LLM framework today with a large developer community base however its structured output support depends on the underlying LLM provider support. Ollama enables straightforward local deployment and experimentation democratizing access to LLMs while fostering privacy and control, however today it only offers JSON format with further formats to come. Outlines emerges as a solution with great flexibility and control over output structure while providing support for a wide range of LLMs. {numref}`structured_output_frameworks` provides a summary comparison of the different frameworks.\n",
-    "\n",
-    "```{table} Structured Output Frameworks Comparison\n",
-    ":name: structured_output_frameworks\n",
-    "| Feature | LangChain | Outlines | Ollama |\n",
-    "|---------|-----------|----------|---------|\n",
-    "| **Implementation Approach** | Wrapper around LLM's native structured output APIs using with_structured_output method | Adjusts probability distribution of model's output logits to guide generation | Uses llama.cpp GBNF grammars to constrain output format |\n",
-    "| **Model Support** | Limited to LLMs with built-in structured output APIs | Broad support for open-source models via transformers, llama.cpp, exllama2, mlx-lm and vllm | Focused on running open-source models locally |\n",
-    "| **Output Format Support** | - TypedDict<br>- JSON Schema<br>- Pydantic class | - Multiple choice generation<br>- Regex-based structure<br>- Pydantic model<br>- JSON Schema | - Currently JSON only<br>- Additional formats planned |\n",
-    "| **Key Advantages** | - Simple integration with supported LLMs<br>- Good for production environments using major LLM providers | - Most flexible output structure options<br>- Fine-grained control over generation<br>- Strong open-source model support | - Excellent for local deployment<br>- Simple setup and usage<br>- Built-in model serving |\n",
-    "| **Use Case Focus** | Enterprise applications using commercial LLMs | Applications requiring precise output control or using open-source models | Local deployment and/or experimentation |\n",
-    "| **Complexity Level** | Medium - requires understanding of LangChain abstractions | Low | Low |\n",
-    "```\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Best Practices\n",
-    "\n",
-    "\n",
-    "* **Clear Schema Definition**: Define the desired output structure clearly. This can be done in several ways including schemas, types, or Pydantic models as appropriate. This ensures the LLM knows exactly what format is expected.\n",
-    "\n",
-    "* **Descriptive Naming**: Use meaningful names for fields and elements in your schema. This makes the output more understandable and easier to work with.\n",
-    "\n",
-    "* **Detailed Prompting**: Guide the LLM with well-crafted prompts that include examples and clear instructions.  A well-structured prompt improves the chances of getting the desired output.\n",
-    "\n",
-    "* **Integration**: If you are connecting the model to tools, functions, data, etc. in your system, then you are highly encouraged to use a typed structured output (e.g. Pydantic models) to ensure the model's output can be processed correctly by downstream systems.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Research and Ongoing Debate\n",
-    "\n",
-    "The use of structured output for Large Language Models (LLMs) is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.\n",
-    "\n",
-    "\n",
-    "There is some evidence indicating that LLMs may have bias in their handling of different output formats {cite}`long2024llms`. The study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats.  The researchers attributed these biases to the models' underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON's prevalence in training data, highlighting how a format's popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.\n",
-    "\n",
-    "Recent research \"Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models\" {cite}`tam2024letspeakfreelystudy` suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence {cite}`aider2024codejson` suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:\n",
-    "\n",
-    "* **Potential performance degradation:**  Enforcing structured output, especially through constrained decoding methods like JSON-mode, can negatively impact an LLM's reasoning abilities. This is particularly evident in tasks that require multi-step reasoning or complex thought processes.\n",
-    "\n",
-    "* **Overly restrictive schemas:** Imposing strict schemas can limit the expressiveness of LLM outputs and may hinder their ability to generate creative or nuanced responses.  In certain cases, the strictness of the schema might outweigh the benefits of structured output.\n",
-    "\n",
-    "* **Increased complexity in prompt engineering:** Crafting prompts that effectively guide LLMs to generate structured outputs while maintaining performance can be challenging. It often requires careful consideration of the schema, the task instructions, and the desired level of detail in the response.\n",
-    "\n",
-    "On the other hand, those findings are not without criticism. The .txt team challenges the work of {cite}`tam2024letspeakfreelystudy`. The rebuttal argues that **structured generation, when done correctly, actually *improves* performance**.\n",
-    "\n",
-    "\n",
-    "```{figure} ../_static/structured_output/rebuttal.png\n",
-    "---\n",
-    "name: structured_vs_unstructured\n",
-    "alt: Structured vs Unstructured Results by .txt team\n",
-    "scale: 50%\n",
-    "align: center\n",
-    "---\n",
-    "Structured vs Unstructured Results by .txt team.\n",
-    "```\n",
-    "\n",
-    "The .txt team presents compelling evidence through their reproduction of the paper's experiments. While their unstructured results align with the original paper's findings, their structured results paint a dramatically different picture - demonstrating that structured generation actually improves performance (see {numref}`structured_vs_unstructured`). The team has made their experimental notebooks publicly available on GitHub for independent verification {cite}`dottxt2024demos`.\n",
-    "\n",
-    "\n",
-    ".txt team identifies several flaws in the methodology of \"Let Me Speak Freely?\" that they believe led to inaccurate conclusions:\n",
-    "\n",
-    "*   The paper finds that structured output improves performance on classification tasks but doesn't reconcile this finding with its overall negative conclusion about structured output. \n",
-    "*   The prompts used for unstructured generation were different from those used for structured generation, making the comparison uneven. \n",
-    "*   The prompts used for structured generation, particularly in JSON-mode, didn't provide the LLM with sufficient information to properly complete the task. \n",
-    "*   The paper conflates \"structured generation\" with \"JSON-mode\", when they are not the same thing. \n",
-    "\n",
-    "It is important to note that while .txt provides a compelling and verifiable argument in favor of (proper) structured output generation in LLMs further research and exploration are needed to comprehensively understand the nuances and trade-offs involved in using structured output for various LLM tasks and applications.\n",
-    "\n",
-    "In summary, the debate surrounding structured output highlights the ongoing challenges in balancing LLM capabilities with real-world application requirements. While structured outputs offer clear benefits in parsing, robustness, and integration, their potential impact on performance, particularly in reasoning tasks is a topic of ongoing debate. \n",
-    "\n",
-    "The ideal approach likely involves a nuanced strategy that considers the specific task, the desired level of structure, and the available LLM capabilities. Further research and development efforts are needed to mitigate potential drawbacks and unlock the full potential of LLMs for a wider range of applications. \n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights. \n",
-    "\n",
-    "## Acknowledgements\n",
-    "\n",
-    "We would like to thank [Cameron Pfiffer](https://x.com/cameron_pfiffer) from the .txt team for his insightful review and feedback.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Citation\n",
-    "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
-    "\n",
-    "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
-    "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n",
-    "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n",
-    "\n",
-    "```\n",
-    "@misc{tharsistpsouza2024tamingllms,\n",
-    "  author = {Tharsis T. P. Souza},\n",
-    "  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n",
-    "  year = {2024},\n",
-    "  chapter = {Wrestling with Structured Output},\n",
-    "  journal = {GitHub repository},\n",
-    "  url = {https://github.com/souzatharsis/tamingLLMs)\n",
-    "}\n",
-    "```\n",
-    "## References\n",
-    "```{bibliography}\n",
-    ":filter: docname in docnames\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/tamingllms/notebooks/taming_utils.py b/tamingllms/notebooks/taming_utils.py
index a70db65..4ed012a 100644
--- a/tamingllms/notebooks/taming_utils.py
+++ b/tamingllms/notebooks/taming_utils.py
@@ -350,3 +350,26 @@ def calculate_validator_metrics(scoring_results, scoring_prompts, bad_sources, g
     # Create a DataFrame from the results
     results_df = pd.DataFrame(results)
     return results_df
+
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+def load_gguf(model_name="bartowski/SmolLM2-135M-Instruct-GGUF", 
+              gguf_file="SmolLM2-135M-Instruct-Q2_K.gguf",
+              device_map="auto"):
+    """
+    Load a pre-trained language model.
+    
+    Args:
+        model_name (str): Name/path of the model to load
+        gguf_file (str): Name of the GGUF file to use
+        device_map (str): Device mapping strategy
+        
+    Returns:
+        AutoModelForCausalLM: Loaded model
+    """
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                gguf_file=gguf_file,
+                                                device_map=device_map)
+    return model
diff --git a/tamingllms/references.bib b/tamingllms/references.bib
index 3e50dc3..e5a56fb 100644
--- a/tamingllms/references.bib
+++ b/tamingllms/references.bib
@@ -702,6 +702,16 @@ @misc{deshpande2024glidergradingllminteractions
       url={https://arxiv.org/abs/2412.14140}, 
 }
 
+@misc{unsloth2024llama3,
+    title={Llama-3.3-70B-Instruct-GGUF},
+    author={{Unsloth}},
+    year={2024},
+    howpublished={Hugging Face Model},
+    url={https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF},
+    note={GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model}
+}
+
+
 @misc{nvidia2024logitsprocessorzoo,
     title={Logits Processor Zoo},
     author={{NVIDIA}},
@@ -750,6 +760,26 @@ @misc{salesforce2024wikitext
     note={Large-scale dataset derived from verified Good and Featured articles on Wikipedia}
 }
 
+@misc{wang20241bitaiinfra11,
+      title={1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs}, 
+      author={Jinheng Wang and Hansong Zhou and Ting Song and Shaoguang Mao and Shuming Ma and Hongyu Wang and Yan Xia and Furu Wei},
+      year={2024},
+      eprint={2410.16144},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2410.16144}, 
+}
+
+@misc{a16z2024llmflation,
+    title={LLMflation: Understanding and Mitigating LLM Inference Cost},
+    author={{Andreessen Horowitz}},
+    year={2024},
+    howpublished={Blog Post},
+    url={https://a16z.com/llmflation-llm-inference-cost/},
+    note={Analysis of LLM inference costs and strategies for optimization}
+}
+
+
 @misc{huggingface2024quantization,
     title={GGUF Quantization Types},
     author={{Hugging Face}},