From 689e398872be03ec3bf60f3752cebfb2d7207506 Mon Sep 17 00:00:00 2001
From: Wauplin
Date: Thu, 5 Sep 2024 10:34:19 +0200
Subject: [PATCH 1/3] Update getting started examples

---
 docs/api-inference/getting-started.md | 65 +++++++++++++++++++++------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/docs/api-inference/getting-started.md b/docs/api-inference/getting-started.md
index c0647b935..633cee968 100644
--- a/docs/api-inference/getting-started.md
+++ b/docs/api-inference/getting-started.md
@@ -14,11 +14,15 @@ For more details about user tokens, check out [this guide](https://huggingface.c
 
 ## cURL
 
 ```bash
-curl https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest \
-    -X POST \
-    -d '{"inputs": "Today is a nice day"}' \
-    -H "Authorization: Bearer hf_***" \
-    -H "Content-Type: application/json"
+curl 'https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct/v1/chat/completions' \
+-H "Authorization: Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \
+-H 'Content-Type: application/json' \
+-d '{
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "max_tokens": 500,
+    "stream": false
+}'
 ```
 
 ## Python
@@ -28,21 +32,35 @@ You can use the `requests` library to make a request to the Inference API.
 
 ```python
 import requests
 
-API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest"
-headers = {"Authorization": "Bearer hf_***"}
+API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct/v1/chat/completions"
+headers = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+payload = {
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "max_tokens": 500,
+    "stream": False
+}
 
-payload = {"inputs": "Today is a nice day"}
 response = requests.post(API_URL, headers=headers, json=payload)
 response.json()
 ```
 
-Hugging Face also provides a [`InferenceClient`](https://huggingface.co/docs/huggingface_hub/guides/inference) that handles inference, caching, async, and more. Make sure to install it with `pip install huggingface_hub` first
+Hugging Face also provides an [`InferenceClient`](https://huggingface.co/docs/huggingface_hub/guides/inference) that handles inference for you. Make sure to install it with `pip install huggingface_hub` first.
 
```python from huggingface_hub import InferenceClient -client = InferenceClient(model="cardiffnlp/twitter-roberta-base-sentiment-latest", token="hf_***") -client.text_classification("Today is a nice day") +client = InferenceClient( + "meta-llama/Meta-Llama-3.1-8B-Instruct", + token="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", +) + +for message in client.chat_completion( + messages=[{"role": "user", "content": "What is the capital of France?"}], + max_tokens=500, + stream=True, +): + print(message.choices[0].delta.content, end="") ``` ## JavaScript @@ -52,11 +70,11 @@ import fetch from "node-fetch"; async function query(data) { const response = await fetch( - "https://api-inference.huggingface.co/models/MODEL_ID", + "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct/v1/chat/completions", { method: "POST", headers: { - Authorization: `Bearer cardiffnlp/twitter-roberta-base-sentiment-latest`, + Authorization: `Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx`, "Content-Type": "application/json", }, body: JSON.stringify(data), @@ -67,12 +85,31 @@ async function query(data) { } query({ - inputs: "Today is a nice day" + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "max_tokens": 500, + "stream": false }).then((response) => { console.log(JSON.stringify(response, null, 2)); }); ``` +Hugging Face also provides a [`HfInference`](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference) client that handles inference. Make sure to install it with `npm install @huggingface/inference` first. + +```js +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference("hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); + +for await (const chunk of inference.chatCompletionStream({ + model: "meta-llama/Meta-Llama-3.1-8B-Instruct", + messages: [{ role: "user", content: "What is the capital of France?" }], + max_tokens: 500, +})) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} +``` + ## Next Steps Now that you know the basics, you can explore the [API Reference](./parameters.md) to learn more about task-specific settings and parameters. 
\ No newline at end of file From 72f924b08ef1e0fe6c6e49be79c6c41871794972 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 5 Sep 2024 10:42:31 +0200 Subject: [PATCH 2/3] Move snippets above specification --- .../tasks/audio-classification.md | 62 ++++----- .../tasks/automatic-speech-recognition.md | 96 ++++++------- docs/api-inference/tasks/chat-completion.md | 127 ++++++++--------- .../api-inference/tasks/feature-extraction.md | 61 ++++---- docs/api-inference/tasks/fill-mask.md | 66 ++++----- .../tasks/image-classification.md | 63 ++++----- .../api-inference/tasks/image-segmentation.md | 76 +++++----- docs/api-inference/tasks/image-to-image.md | 13 +- docs/api-inference/tasks/object-detection.md | 70 +++++----- .../api-inference/tasks/question-answering.md | 80 +++++------ docs/api-inference/tasks/summarization.md | 60 ++++---- .../tasks/table-question-answering.md | 66 ++++----- .../tasks/text-classification.md | 62 ++++----- docs/api-inference/tasks/text-generation.md | 130 +++++++++--------- docs/api-inference/tasks/text-to-image.md | 71 +++++----- .../tasks/token-classification.md | 97 ++++++------- docs/api-inference/tasks/translation.md | 64 ++++----- .../tasks/zero-shot-classification.md | 66 ++++----- .../task/audio-classification.handlebars | 8 +- .../automatic-speech-recognition.handlebars | 8 +- .../templates/task/chat-completion.handlebars | 7 +- .../task/feature-extraction.handlebars | 7 +- .../templates/task/fill-mask.handlebars | 8 +- .../task/image-classification.handlebars | 7 +- .../task/image-segmentation.handlebars | 7 +- .../templates/task/image-to-image.handlebars | 7 +- .../task/object-detection.handlebars | 8 +- .../task/question-answering.handlebars | 8 +- .../templates/task/summarization.handlebars | 8 +- .../task/table-question-answering.handlebars | 8 +- .../task/text-classification.handlebars | 8 +- .../templates/task/text-generation.handlebars | 8 +- .../templates/task/text-to-image.handlebars | 8 +- .../task/token-classification.handlebars | 7 +- .../templates/task/translation.handlebars | 8 +- .../task/zero-shot-classification.handlebars | 8 +- 36 files changed, 741 insertions(+), 727 deletions(-) diff --git a/docs/api-inference/tasks/audio-classification.md b/docs/api-inference/tasks/audio-classification.md index 099afedf6..2819beed8 100644 --- a/docs/api-inference/tasks/audio-classification.md +++ b/docs/api-inference/tasks/audio-classification.md @@ -27,37 +27,6 @@ For more details about the `audio-classification` task, check out its [dedicated This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=audio-classification&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload. | -| **parameters** | _object_ | Additional inference parameters for Audio Classification | -| **        function_to_apply** | _enum_ | Possible values: sigmoid, softmax, none. | -| **        top_k** | _integer_ | When specified, limits the output to the top K most probable classes. | - - -Some options can be configured by passing headers to the Inference API. 
Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        label** | _string_ | The predicted class label. | -| **        score** | _number_ | The corresponding probability. | - - ### Using the API @@ -122,3 +91,34 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload. | +| **parameters** | _object_ | Additional inference parameters for Audio Classification | +| **        function_to_apply** | _enum_ | Possible values: sigmoid, softmax, none. | +| **        top_k** | _integer_ | When specified, limits the output to the top K most probable classes. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). 
| + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        label** | _string_ | The predicted class label. | +| **        score** | _number_ | The corresponding probability. | + diff --git a/docs/api-inference/tasks/automatic-speech-recognition.md b/docs/api-inference/tasks/automatic-speech-recognition.md index 88e98003a..81a456f00 100644 --- a/docs/api-inference/tasks/automatic-speech-recognition.md +++ b/docs/api-inference/tasks/automatic-speech-recognition.md @@ -28,54 +28,6 @@ For more details about the `automatic-speech-recognition` task, check out its [d This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=automatic-speech-recognition&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload. | -| **parameters** | _object_ | Additional inference parameters for Automatic Speech Recognition | -| **        return_timestamps** | _boolean_ | Whether to output corresponding timestamps with the generated text | -| **        generate** | _object_ | Ad-hoc parametrization of the text generation process | -| **                temperature** | _number_ | The value used to modulate the next token probabilities. | -| **                top_k** | _integer_ | The number of highest probability vocabulary tokens to keep for top-k-filtering. | -| **                top_p** | _number_ | If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. | -| **                typical_p** | _number_ | Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details. | -| **                epsilon_cutoff** | _number_ | If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details. | -| **                eta_cutoff** | _number_ | Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details. | -| **                max_length** | _integer_ | The maximum length (in tokens) of the generated text, including the input. 
| -| **                max_new_tokens** | _integer_ | The maximum number of tokens to generate. Takes precedence over maxLength. | -| **                min_length** | _integer_ | The minimum length (in tokens) of the generated text, including the input. | -| **                min_new_tokens** | _integer_ | The minimum number of tokens to generate. Takes precedence over maxLength. | -| **                do_sample** | _boolean_ | Whether to use sampling instead of greedy decoding when generating new tokens. | -| **                early_stopping** | _enum_ | Possible values: never, true, false. | -| **                num_beams** | _integer_ | Number of beams to use for beam search. | -| **                num_beam_groups** | _integer_ | Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details. | -| **                penalty_alpha** | _number_ | The value balances the model confidence and the degeneration penalty in contrastive search decoding. | -| **                use_cache** | _boolean_ | Whether the model should use the past last key/values attentions to speed up decoding | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **text** | _string_ | The recognized text. | -| **chunks** | _object[]_ | When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model. | -| **        text** | _string_ | A chunk of text identified by the model | -| **        timestamps** | _number[]_ | The start and end timestamps corresponding with the text | - - ### Using the API @@ -140,3 +92,51 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload. 
| +| **parameters** | _object_ | Additional inference parameters for Automatic Speech Recognition | +| **        return_timestamps** | _boolean_ | Whether to output corresponding timestamps with the generated text | +| **        generate** | _object_ | Ad-hoc parametrization of the text generation process | +| **                temperature** | _number_ | The value used to modulate the next token probabilities. | +| **                top_k** | _integer_ | The number of highest probability vocabulary tokens to keep for top-k-filtering. | +| **                top_p** | _number_ | If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. | +| **                typical_p** | _number_ | Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details. | +| **                epsilon_cutoff** | _number_ | If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details. | +| **                eta_cutoff** | _number_ | Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details. | +| **                max_length** | _integer_ | The maximum length (in tokens) of the generated text, including the input. | +| **                max_new_tokens** | _integer_ | The maximum number of tokens to generate. Takes precedence over maxLength. | +| **                min_length** | _integer_ | The minimum length (in tokens) of the generated text, including the input. | +| **                min_new_tokens** | _integer_ | The minimum number of tokens to generate. Takes precedence over maxLength. | +| **                do_sample** | _boolean_ | Whether to use sampling instead of greedy decoding when generating new tokens. | +| **                early_stopping** | _enum_ | Possible values: never, true, false. | +| **                num_beams** | _integer_ | Number of beams to use for beam search. | +| **                num_beam_groups** | _integer_ | Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details. | +| **                penalty_alpha** | _number_ | The value balances the model confidence and the degeneration penalty in contrastive search decoding. 
| +| **                use_cache** | _boolean_ | Whether the model should use the past last key/values attentions to speed up decoding | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **text** | _string_ | The recognized text. | +| **chunks** | _object[]_ | When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model. | +| **        text** | _string_ | A chunk of text identified by the model | +| **        timestamps** | _number[]_ | The start and end timestamps corresponding with the text | + diff --git a/docs/api-inference/tasks/chat-completion.md b/docs/api-inference/tasks/chat-completion.md index 561081309..213cf5cae 100644 --- a/docs/api-inference/tasks/chat-completion.md +++ b/docs/api-inference/tasks/chat-completion.md @@ -29,6 +29,70 @@ This is a subtask of [`text-generation`](./text_generation) designed to generate +### Using the API + + + + + +```bash +curl 'https://api-inference.huggingface.co/models/google/gemma-2-2b-it/v1/chat/completions' \ +-H "Authorization: Bearer hf_***" \ +-H 'Content-Type: application/json' \ +-d '{ + "model": "google/gemma-2-2b-it", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "max_tokens": 500, + "stream": false +}' + +``` + + + +```py +from huggingface_hub import InferenceClient + +client = InferenceClient( + "google/gemma-2-2b-it", + token="hf_***", +) + +for message in client.chat_completion( + messages=[{"role": "user", "content": "What is the capital of France?"}], + max_tokens=500, + stream=True, +): + print(message.choices[0].delta.content, end="") + +``` + +To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion). + + + +```js +import { HfInference } from "@huggingface/inference"; + +const inference = new HfInference("hf_***"); + +for await (const chunk of inference.chatCompletionStream({ + model: "google/gemma-2-2b-it", + messages: [{ role: "user", content: "What is the capital of France?" 
}], + max_tokens: 500, +})) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} + +``` + +To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#chatcompletion). + + + + + + ### API specification #### Request @@ -150,66 +214,3 @@ For more information about streaming, check out [this guide](https://huggingface | **system_fingerprint** | _string_ | | -### Using the API - - - - - -```bash -curl 'https://api-inference.huggingface.co/models/google/gemma-2-2b-it/v1/chat/completions' \ --H "Authorization: Bearer hf_***" \ --H 'Content-Type: application/json' \ --d '{ - "model": "google/gemma-2-2b-it", - "messages": [{"role": "user", "content": "What is the capital of France?"}], - "max_tokens": 500, - "stream": false -}' - -``` - - - -```py -from huggingface_hub import InferenceClient - -client = InferenceClient( - "google/gemma-2-2b-it", - token="hf_***", -) - -for message in client.chat_completion( - messages=[{"role": "user", "content": "What is the capital of France?"}], - max_tokens=500, - stream=True, -): - print(message.choices[0].delta.content, end="") - -``` - -To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion). - - - -```js -import { HfInference } from "@huggingface/inference"; - -const inference = new HfInference("hf_***"); - -for await (const chunk of inference.chatCompletionStream({ - model: "google/gemma-2-2b-it", - messages: [{ role: "user", content: "What is the capital of France?" }], - max_tokens: 500, -})) { - process.stdout.write(chunk.choices[0]?.delta?.content || ""); -} - -``` - -To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#chatcompletion). - - - - - diff --git a/docs/api-inference/tasks/feature-extraction.md b/docs/api-inference/tasks/feature-extraction.md index 2762418b8..5c37b4e9c 100644 --- a/docs/api-inference/tasks/feature-extraction.md +++ b/docs/api-inference/tasks/feature-extraction.md @@ -29,36 +29,6 @@ For more details about the `feature-extraction` task, check out its [dedicated p This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=feature-extraction&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The text to embed. | -| **normalize** | _boolean_ | | -| **prompt_name** | _string_ | The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. Must be a key in the `Sentence Transformers` configuration `prompts` dictionary. For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text to encode. | -| **truncate** | _boolean_ | | -| **truncation_direction** | _enum_ | Possible values: Left, Right. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. 
You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _array[]_ | Output is an array of arrays. | - - ### Using the API @@ -123,3 +93,34 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The text to embed. | +| **normalize** | _boolean_ | | +| **prompt_name** | _string_ | The name of the prompt that should be used by for encoding. If not set, no prompt will be applied. Must be a key in the `Sentence Transformers` configuration `prompts` dictionary. For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text to encode. | +| **truncate** | _boolean_ | | +| **truncation_direction** | _enum_ | Possible values: Left, Right. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _array[]_ | Output is an array of arrays. 
| + + diff --git a/docs/api-inference/tasks/fill-mask.md b/docs/api-inference/tasks/fill-mask.md index d9bdacc2a..54b93832e 100644 --- a/docs/api-inference/tasks/fill-mask.md +++ b/docs/api-inference/tasks/fill-mask.md @@ -29,39 +29,6 @@ For more details about the `fill-mask` task, check out its [dedicated page](http This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=fill-mask&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The text with masked tokens | -| **parameters** | _object_ | Additional inference parameters for Fill Mask | -| **        top_k** | _integer_ | When passed, overrides the number of predictions to return. | -| **        targets** | _string[]_ | When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower). | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        sequence** | _string_ | The corresponding input with the mask token prediction. | -| **        score** | _number_ | The corresponding probability | -| **        token** | _integer_ | The predicted token id (to replace the masked one). | -| **        token_str** | _string_ | The predicted token (to replace the masked one). | - - ### Using the API @@ -126,3 +93,36 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The text with masked tokens | +| **parameters** | _object_ | Additional inference parameters for Fill Mask | +| **        top_k** | _integer_ | When passed, overrides the number of predictions to return. | +| **        targets** | _string[]_ | When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. 
If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower). | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        sequence** | _string_ | The corresponding input with the mask token prediction. | +| **        score** | _number_ | The corresponding probability | +| **        token** | _integer_ | The predicted token id (to replace the masked one). | +| **        token_str** | _string_ | The predicted token (to replace the masked one). | + diff --git a/docs/api-inference/tasks/image-classification.md b/docs/api-inference/tasks/image-classification.md index 3f418adc9..c906acd75 100644 --- a/docs/api-inference/tasks/image-classification.md +++ b/docs/api-inference/tasks/image-classification.md @@ -28,37 +28,6 @@ For more details about the `image-classification` task, check out its [dedicated This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-classification&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. | -| **parameters** | _object_ | Additional inference parameters for Image Classification | -| **        function_to_apply** | _enum_ | Possible values: sigmoid, softmax, none. | -| **        top_k** | _integer_ | When specified, limits the output to the top K most probable classes. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). 
| -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        label** | _string_ | The predicted class label. | -| **        score** | _number_ | The corresponding probability. | - - ### Using the API @@ -123,3 +92,35 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. | +| **parameters** | _object_ | Additional inference parameters for Image Classification | +| **        function_to_apply** | _enum_ | Possible values: sigmoid, softmax, none. | +| **        top_k** | _integer_ | When specified, limits the output to the top K most probable classes. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        label** | _string_ | The predicted class label. | +| **        score** | _number_ | The corresponding probability. 
| + + diff --git a/docs/api-inference/tasks/image-segmentation.md b/docs/api-inference/tasks/image-segmentation.md index 610da6fa5..d5956506b 100644 --- a/docs/api-inference/tasks/image-segmentation.md +++ b/docs/api-inference/tasks/image-segmentation.md @@ -24,44 +24,11 @@ For more details about the `image-segmentation` task, check out its [dedicated p ### Recommended models +- [facebook/detr-resnet-50-panoptic](https://huggingface.co/facebook/detr-resnet-50-panoptic): Solid panoptic segmentation model trained on the COCO 2017 benchmark dataset. - [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512): Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution. This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-segmentation&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. | -| **parameters** | _object_ | Additional inference parameters for Image Segmentation | -| **        mask_threshold** | _number_ | Threshold to use when turning the predicted masks into binary values. | -| **        overlap_mask_area_threshold** | _number_ | Mask overlap threshold to eliminate small, disconnected segments. | -| **        subtask** | _enum_ | Possible values: instance, panoptic, semantic. | -| **        threshold** | _number_ | Probability threshold to filter out predicted masks. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | A predicted mask / segment | -| **        label** | _string_ | The label of the predicted segment. | -| **        mask** | _string_ | The corresponding mask as a black-and-white image (base64-encoded). | -| **        score** | _number_ | The score or confidence degree the model has. 
| - - ### Using the API @@ -69,7 +36,7 @@ For more information about Inference API headers, check out the parameters [guid ```bash -curl https://api-inference.huggingface.co/models/nvidia/segformer-b0-finetuned-ade-512-512 \ +curl https://api-inference.huggingface.co/models/facebook/detr-resnet-50-panoptic \ -X POST \ --data-binary '@cats.jpg' \ -H "Authorization: Bearer hf_***" @@ -81,7 +48,7 @@ curl https://api-inference.huggingface.co/models/nvidia/segformer-b0-finetuned-a ```py import requests -API_URL = "https://api-inference.huggingface.co/models/nvidia/segformer-b0-finetuned-ade-512-512" +API_URL = "https://api-inference.huggingface.co/models/facebook/detr-resnet-50-panoptic" headers = {"Authorization": "Bearer hf_***"} def query(filename): @@ -101,7 +68,7 @@ To use the Python client, see `huggingface_hub`'s [package reference](https://hu async function query(filename) { const data = fs.readFileSync(filename); const response = await fetch( - "https://api-inference.huggingface.co/models/nvidia/segformer-b0-finetuned-ade-512-512", + "https://api-inference.huggingface.co/models/facebook/detr-resnet-50-panoptic", { headers: { Authorization: "Bearer hf_***" @@ -126,3 +93,38 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. | +| **parameters** | _object_ | Additional inference parameters for Image Segmentation | +| **        mask_threshold** | _number_ | Threshold to use when turning the predicted masks into binary values. | +| **        overlap_mask_area_threshold** | _number_ | Mask overlap threshold to eliminate small, disconnected segments. | +| **        subtask** | _enum_ | Possible values: instance, panoptic, semantic. | +| **        threshold** | _number_ | Probability threshold to filter out predicted masks. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). 
+ +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | A predicted mask / segment | +| **        label** | _string_ | The label of the predicted segment. | +| **        mask** | _string_ | The corresponding mask as a black-and-white image (base64-encoded). | +| **        score** | _number_ | The score or confidence degree the model has. | + + diff --git a/docs/api-inference/tasks/image-to-image.md b/docs/api-inference/tasks/image-to-image.md index 078c6f0f3..9605c0a64 100644 --- a/docs/api-inference/tasks/image-to-image.md +++ b/docs/api-inference/tasks/image-to-image.md @@ -35,6 +35,13 @@ For more details about the `image-to-image` task, check out its [dedicated page] This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-to-image&sort=trending). +### Using the API + + +No snippet available for this task. + + + ### API specification #### Request @@ -68,9 +75,3 @@ For more information about Inference API headers, check out the parameters [guid | **image** | _unknown_ | The output image returned as raw bytes in the payload. | -### Using the API - - -No snippet available for this task. - - diff --git a/docs/api-inference/tasks/object-detection.md b/docs/api-inference/tasks/object-detection.md index 57cf86143..c3ba578f1 100644 --- a/docs/api-inference/tasks/object-detection.md +++ b/docs/api-inference/tasks/object-detection.md @@ -29,41 +29,6 @@ For more details about the `object-detection` task, check out its [dedicated pag This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=object-detection&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. | -| **parameters** | _object_ | Additional inference parameters for Object Detection | -| **        threshold** | _number_ | The probability necessary to make a prediction. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). 
- -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        label** | _string_ | The predicted label for the bounding box. | -| **        score** | _number_ | The associated score / probability. | -| **        box** | _object_ | | -| **                xmin** | _integer_ | The x-coordinate of the top-left corner of the bounding box. | -| **                xmax** | _integer_ | The x-coordinate of the bottom-right corner of the bounding box. | -| **                ymin** | _integer_ | The y-coordinate of the top-left corner of the bounding box. | -| **                ymax** | _integer_ | The y-coordinate of the bottom-right corner of the bounding box. | - - ### Using the API @@ -128,3 +93,38 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload. | +| **parameters** | _object_ | Additional inference parameters for Object Detection | +| **        threshold** | _number_ | The probability necessary to make a prediction. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        label** | _string_ | The predicted label for the bounding box. | +| **        score** | _number_ | The associated score / probability. | +| **        box** | _object_ | | +| **                xmin** | _integer_ | The x-coordinate of the top-left corner of the bounding box. | +| **                xmax** | _integer_ | The x-coordinate of the bottom-right corner of the bounding box. | +| **                ymin** | _integer_ | The y-coordinate of the top-left corner of the bounding box. | +| **                ymax** | _integer_ | The y-coordinate of the bottom-right corner of the bounding box. 
| + diff --git a/docs/api-inference/tasks/question-answering.md b/docs/api-inference/tasks/question-answering.md index 5fc2b9766..0a62e9a33 100644 --- a/docs/api-inference/tasks/question-answering.md +++ b/docs/api-inference/tasks/question-answering.md @@ -28,46 +28,6 @@ For more details about the `question-answering` task, check out its [dedicated p This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=question-answering&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _object_ | One (context, question) pair to answer | -| **        context*** | _string_ | The context to be used for answering the question | -| **        question*** | _string_ | The question to be answered | -| **parameters** | _object_ | Additional inference parameters for Question Answering | -| **        top_k** | _integer_ | The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context. | -| **        doc_stride** | _integer_ | If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap. | -| **        max_answer_len** | _integer_ | The maximum length of predicted answers (e.g., only answers with a shorter length are considered). | -| **        max_seq_len** | _integer_ | The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using docStride as overlap) if needed. | -| **        max_question_len** | _integer_ | The maximum length of the question after tokenization. It will be truncated if needed. | -| **        handle_impossible_answer** | _boolean_ | Whether to accept impossible as an answer. | -| **        align_to_words** | _boolean_ | Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on non-space-separated languages (like Japanese or Chinese) | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). 
- -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        answer** | _string_ | The answer to the question. | -| **        score** | _number_ | The probability associated to the answer. | -| **        start** | _integer_ | The character position in the input where the answer begins. | -| **        end** | _integer_ | The character position in the input where the answer ends. | - - ### Using the API @@ -138,3 +98,43 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _object_ | One (context, question) pair to answer | +| **        context*** | _string_ | The context to be used for answering the question | +| **        question*** | _string_ | The question to be answered | +| **parameters** | _object_ | Additional inference parameters for Question Answering | +| **        top_k** | _integer_ | The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context. | +| **        doc_stride** | _integer_ | If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap. | +| **        max_answer_len** | _integer_ | The maximum length of predicted answers (e.g., only answers with a shorter length are considered). | +| **        max_seq_len** | _integer_ | The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using docStride as overlap) if needed. | +| **        max_question_len** | _integer_ | The maximum length of the question after tokenization. It will be truncated if needed. | +| **        handle_impossible_answer** | _boolean_ | Whether to accept impossible as an answer. | +| **        align_to_words** | _boolean_ | Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on non-space-separated languages (like Japanese or Chinese) | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). 
| + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        answer** | _string_ | The answer to the question. | +| **        score** | _number_ | The probability associated to the answer. | +| **        start** | _integer_ | The character position in the input where the answer begins. | +| **        end** | _integer_ | The character position in the input where the answer ends. | + diff --git a/docs/api-inference/tasks/summarization.md b/docs/api-inference/tasks/summarization.md index 4cf5d706e..c10a1828b 100644 --- a/docs/api-inference/tasks/summarization.md +++ b/docs/api-inference/tasks/summarization.md @@ -28,36 +28,6 @@ For more details about the `summarization` task, check out its [dedicated page]( This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=summarization&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input text to summarize. | -| **parameters** | _object_ | Additional inference parameters for summarization. | -| **        clean_up_tokenization_spaces** | _boolean_ | Whether to clean up the potential extra spaces in the text output. | -| **        truncation** | _enum_ | Possible values: do_not_truncate, longest_first, only_first, only_second. | -| **        generate_parameters** | _object_ | Additional parametrization of the text generation algorithm. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **summary_text** | _string_ | The summarized text. | - - ### Using the API @@ -122,3 +92,33 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input text to summarize. | +| **parameters** | _object_ | Additional inference parameters for summarization. 
| +| **        clean_up_tokenization_spaces** | _boolean_ | Whether to clean up the potential extra spaces in the text output. | +| **        truncation** | _enum_ | Possible values: do_not_truncate, longest_first, only_first, only_second. | +| **        generate_parameters** | _object_ | Additional parametrization of the text generation algorithm. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **summary_text** | _string_ | The summarized text. | + diff --git a/docs/api-inference/tasks/table-question-answering.md b/docs/api-inference/tasks/table-question-answering.md index d1d115f3b..3eb659892 100644 --- a/docs/api-inference/tasks/table-question-answering.md +++ b/docs/api-inference/tasks/table-question-answering.md @@ -27,39 +27,6 @@ For more details about the `table-question-answering` task, check out its [dedic This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=table-question-answering&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _object_ | One (table, question) pair to answer | -| **        table*** | _object_ | The table to serve as context for the questions | -| **        question*** | _string_ | The question to be answered about the table | -| **parameters** | _object_ | Additional inference parameters for Table Question Answering | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). 
However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        answer** | _string_ | The answer of the question given the table. If there is an aggregator, the answer will be preceded by `AGGREGATOR >`. | -| **        coordinates** | _array[]_ | Coordinates of the cells of the answers. | -| **        cells** | _string[]_ | List of strings made up of the answer cell values. | -| **        aggregator** | _string_ | If the model has an aggregator, this returns the aggregator. | - - ### Using the API @@ -148,3 +115,36 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _object_ | One (table, question) pair to answer | +| **        table*** | _object_ | The table to serve as context for the questions | +| **        question*** | _string_ | The question to be answered about the table | +| **parameters** | _object_ | Additional inference parameters for Table Question Answering | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        answer** | _string_ | The answer of the question given the table. If there is an aggregator, the answer will be preceded by `AGGREGATOR >`. | +| **        coordinates** | _array[]_ | Coordinates of the cells of the answers. 
| +| **        cells** | _string[]_ | List of strings made up of the answer cell values. | +| **        aggregator** | _string_ | If the model has an aggregator, this returns the aggregator. | + diff --git a/docs/api-inference/tasks/text-classification.md b/docs/api-inference/tasks/text-classification.md index 7f99c5cfd..bf932c4f3 100644 --- a/docs/api-inference/tasks/text-classification.md +++ b/docs/api-inference/tasks/text-classification.md @@ -28,37 +28,6 @@ For more details about the `text-classification` task, check out its [dedicated This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-classification&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The text to classify | -| **parameters** | _object_ | Additional inference parameters for Text Classification | -| **        function_to_apply** | _enum_ | Possible values: sigmoid, softmax, none. | -| **        top_k** | _integer_ | When specified, limits the output to the top K most probable classes. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        label** | _string_ | The predicted class label. | -| **        score** | _number_ | The corresponding probability. | - - ### Using the API @@ -123,3 +92,34 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The text to classify | +| **parameters** | _object_ | Additional inference parameters for Text Classification | +| **        function_to_apply** | _enum_ | Possible values: sigmoid, softmax, none. | +| **        top_k** | _integer_ | When specified, limits the output to the top K most probable classes. | + + +Some options can be configured by passing headers to the Inference API. 
Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        label** | _string_ | The predicted class label. | +| **        score** | _number_ | The corresponding probability. | + diff --git a/docs/api-inference/tasks/text-generation.md b/docs/api-inference/tasks/text-generation.md index eca329d91..22ee84e1a 100644 --- a/docs/api-inference/tasks/text-generation.md +++ b/docs/api-inference/tasks/text-generation.md @@ -35,6 +35,71 @@ For more details about the `text-generation` task, check out its [dedicated page This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending). +### Using the API + + + + + +```bash +curl https://api-inference.huggingface.co/models/google/gemma-2-2b-it \ + -X POST \ + -d '{"inputs": "Can you please let us know more details about your "}' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer hf_***" + +``` + + + +```py +import requests + +API_URL = "https://api-inference.huggingface.co/models/google/gemma-2-2b-it" +headers = {"Authorization": "Bearer hf_***"} + +def query(payload): + response = requests.post(API_URL, headers=headers, json=payload) + return response.json() + +output = query({ + "inputs": "Can you please let us know more details about your ", +}) +``` + +To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). 
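The package reference linked above documents the full client; as a minimal sketch of the same request made through `huggingface_hub` (assuming the package is installed, and reusing the placeholder token and the `google/gemma-2-2b-it` model from the snippets above):

```py
from huggingface_hub import InferenceClient

# Point the client at the same model used in the curl and requests examples.
client = InferenceClient("google/gemma-2-2b-it", token="hf_***")

# text_generation() sends the prompt to the model and returns the generated text.
output = client.text_generation(
    "Can you please let us know more details about your ",
    max_new_tokens=50,  # keep the completion short for a quick test
)
print(output)
```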
+ + + +```js +async function query(data) { + const response = await fetch( + "https://api-inference.huggingface.co/models/google/gemma-2-2b-it", + { + headers: { + Authorization: "Bearer hf_***" + "Content-Type": "application/json", + }, + method: "POST", + body: JSON.stringify(data), + } + ); + const result = await response.json(); + return result; +} + +query({"inputs": "Can you please let us know more details about your "}).then((response) => { + console.log(JSON.stringify(response)); +}); +``` + +To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#textgeneration). + + + + + + ### API specification #### Request @@ -149,68 +214,3 @@ For more information about streaming, check out [this guide](https://huggingface | **        special** | _boolean_ | | | **        text** | _string_ | | - -### Using the API - - - - - -```bash -curl https://api-inference.huggingface.co/models/google/gemma-2-2b-it \ - -X POST \ - -d '{"inputs": "Can you please let us know more details about your "}' \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer hf_***" - -``` - - - -```py -import requests - -API_URL = "https://api-inference.huggingface.co/models/google/gemma-2-2b-it" -headers = {"Authorization": "Bearer hf_***"} - -def query(payload): - response = requests.post(API_URL, headers=headers, json=payload) - return response.json() - -output = query({ - "inputs": "Can you please let us know more details about your ", -}) -``` - -To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). - - - -```js -async function query(data) { - const response = await fetch( - "https://api-inference.huggingface.co/models/google/gemma-2-2b-it", - { - headers: { - Authorization: "Bearer hf_***" - "Content-Type": "application/json", - }, - method: "POST", - body: JSON.stringify(data), - } - ); - const result = await response.json(); - return result; -} - -query({"inputs": "Can you please let us know more details about your "}).then((response) => { - console.log(JSON.stringify(response)); -}); -``` - -To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#textgeneration). - - - - - diff --git a/docs/api-inference/tasks/text-to-image.md b/docs/api-inference/tasks/text-to-image.md index 0ac92293c..77d525d41 100644 --- a/docs/api-inference/tasks/text-to-image.md +++ b/docs/api-inference/tasks/text-to-image.md @@ -26,45 +26,11 @@ For more details about the `text-to-image` task, check out its [dedicated page]( - [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev): One of the most powerful image generation models that can generate realistic outputs. - [latent-consistency/lcm-lora-sdxl](https://huggingface.co/latent-consistency/lcm-lora-sdxl): A powerful yet fast image generation model. +- [Kwai-Kolors/Kolors](https://huggingface.co/Kwai-Kolors/Kolors): Text-to-image model for photorealistic generation. - [stabilityai/stable-diffusion-3-medium-diffusers](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers): A powerful text-to-image model. This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-to-image&sort=trending). 
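As a quick illustration of calling one of the models listed above, here is a minimal sketch using `requests` (the endpoint pattern and placeholder token follow the other examples in these docs, and the model id is taken from the list above; per the response specification, the generated image is returned as raw bytes):

```py
import requests

API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-dev"
headers = {"Authorization": "Bearer hf_***"}

# Send the prompt; the API responds with the generated image as raw bytes.
response = requests.post(API_URL, headers=headers, json={"inputs": "Astronaut riding a horse"})
response.raise_for_status()

# Write the bytes to disk so the result can be opened with any image viewer.
# The file extension is illustrative; the actual format depends on the model.
with open("generated_image.png", "wb") as f:
    f.write(response.content)
```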
-### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input text data (sometimes called "prompt") | -| **parameters** | _object_ | Additional inference parameters for Text To Image | -| **        guidance_scale** | _number_ | A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts. | -| **        negative_prompt** | _string[]_ | One or several prompt to guide what NOT to include in image generation. | -| **        num_inference_steps** | _integer_ | The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. | -| **        target_size** | _object_ | The size in pixel of the output image | -| **                width*** | _integer_ | | -| **                height*** | _integer_ | | -| **        scheduler** | _string_ | Override the scheduler with a compatible one. | -| **        seed** | _integer_ | Seed for the random number generator. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **image** | _unknown_ | The generated image returned as raw bytes in the payload. | - - ### Using the API @@ -131,3 +97,38 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input text data (sometimes called "prompt") | +| **parameters** | _object_ | Additional inference parameters for Text To Image | +| **        guidance_scale** | _number_ | A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts. | +| **        negative_prompt** | _string[]_ | One or several prompt to guide what NOT to include in image generation. | +| **        num_inference_steps** | _integer_ | The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
| +| **        target_size** | _object_ | The size in pixel of the output image | +| **                width*** | _integer_ | | +| **                height*** | _integer_ | | +| **        scheduler** | _string_ | Override the scheduler with a compatible one. | +| **        seed** | _integer_ | Seed for the random number generator. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **image** | _unknown_ | The generated image returned as raw bytes in the payload. | + diff --git a/docs/api-inference/tasks/token-classification.md b/docs/api-inference/tasks/token-classification.md index 888ef8093..9da5edcb2 100644 --- a/docs/api-inference/tasks/token-classification.md +++ b/docs/api-inference/tasks/token-classification.md @@ -29,54 +29,6 @@ For more details about the `token-classification` task, check out its [dedicated This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=token-classification&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The input text data | -| **parameters** | _object_ | Additional inference parameters for Token Classification | -| **        ignore_labels** | _string[]_ | A list of labels to ignore | -| **        stride** | _integer_ | The number of overlapping tokens between chunks when splitting the input text. | -| **        aggregation_strategy** | _string_ | One of the following: | -| **                 (#1)** | _'none'_ | Do not aggregate tokens | -| **                 (#2)** | _'simple'_ | Group consecutive tokens with the same label in a single entity. | -| **                 (#3)** | _'first'_ | Similar to "simple", also preserves word integrity (use the label predicted for the first token in a word). | -| **                 (#4)** | _'average'_ | Similar to "simple", also preserves word integrity (uses the label with the highest score, averaged across the word's tokens). | -| **                 (#5)** | _'max'_ | Similar to "simple", also preserves word integrity (uses the label with the highest score across the word's tokens). 
| - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -Output type depends on the `stream` input parameter. -If `stream` is `false` (default), the response will be a JSON object with the following fields: - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        entity_group** | _string_ | The predicted label for that group of tokens | -| **        score** | _number_ | The associated score / probability | -| **        word** | _string_ | The corresponding text | -| **        start** | _integer_ | The character position in the input where this group begins. | -| **        end** | _integer_ | The character position in the input where this group ends. | - - -If `stream` is `true`, generated tokens are returned as a stream, using Server-Sent Events (SSE). -For more information about streaming, check out [this guide](https://huggingface.co/docs/token-classification-inference/conceptual/streaming). - - - ### Using the API @@ -141,3 +93,52 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The input text data | +| **parameters** | _object_ | Additional inference parameters for Token Classification | +| **        ignore_labels** | _string[]_ | A list of labels to ignore | +| **        stride** | _integer_ | The number of overlapping tokens between chunks when splitting the input text. | +| **        aggregation_strategy** | _string_ | One of the following: | +| **                 (#1)** | _'none'_ | Do not aggregate tokens | +| **                 (#2)** | _'simple'_ | Group consecutive tokens with the same label in a single entity. | +| **                 (#3)** | _'first'_ | Similar to "simple", also preserves word integrity (use the label predicted for the first token in a word). | +| **                 (#4)** | _'average'_ | Similar to "simple", also preserves word integrity (uses the label with the highest score, averaged across the word's tokens). 
| +| **                 (#5)** | _'max'_ | Similar to "simple", also preserves word integrity (uses the label with the highest score across the word's tokens). | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +Output type depends on the `stream` input parameter. +If `stream` is `false` (default), the response will be a JSON object with the following fields: + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        entity_group** | _string_ | The predicted label for that group of tokens | +| **        score** | _number_ | The associated score / probability | +| **        word** | _string_ | The corresponding text | +| **        start** | _integer_ | The character position in the input where this group begins. | +| **        end** | _integer_ | The character position in the input where this group ends. | + + +If `stream` is `true`, generated tokens are returned as a stream, using Server-Sent Events (SSE). +For more information about streaming, check out [this guide](https://huggingface.co/docs/token-classification-inference/conceptual/streaming). + + + diff --git a/docs/api-inference/tasks/translation.md b/docs/api-inference/tasks/translation.md index c924a8bd0..1b6284abe 100644 --- a/docs/api-inference/tasks/translation.md +++ b/docs/api-inference/tasks/translation.md @@ -29,38 +29,6 @@ For more details about the `translation` task, check out its [dedicated page](ht This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=translation&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _string_ | The text to translate. | -| **parameters** | _object_ | Additional inference parameters for Translation | -| **        src_lang** | _string_ | The source language of the text. Required for models that can translate from multiple languages. | -| **        tgt_lang** | _string_ | Target language to translate to. Required for models that can translate to multiple languages. 
| -| **        clean_up_tokenization_spaces** | _boolean_ | Whether to clean up the potential extra spaces in the text output. | -| **        truncation** | _enum_ | Possible values: do_not_truncate, longest_first, only_first, only_second. | -| **        generate_parameters** | _object_ | Additional parametrization of the text generation algorithm. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **translation_text** | _string_ | The translated text. | - - ### Using the API @@ -125,3 +93,35 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _string_ | The text to translate. | +| **parameters** | _object_ | Additional inference parameters for Translation | +| **        src_lang** | _string_ | The source language of the text. Required for models that can translate from multiple languages. | +| **        tgt_lang** | _string_ | Target language to translate to. Required for models that can translate to multiple languages. | +| **        clean_up_tokenization_spaces** | _boolean_ | Whether to clean up the potential extra spaces in the text output. | +| **        truncation** | _enum_ | Possible values: do_not_truncate, longest_first, only_first, only_second. | +| **        generate_parameters** | _object_ | Additional parametrization of the text generation algorithm. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). 
However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **translation_text** | _string_ | The translated text. | + diff --git a/docs/api-inference/tasks/zero-shot-classification.md b/docs/api-inference/tasks/zero-shot-classification.md index ab3404d62..89b505be1 100644 --- a/docs/api-inference/tasks/zero-shot-classification.md +++ b/docs/api-inference/tasks/zero-shot-classification.md @@ -28,39 +28,6 @@ For more details about the `zero-shot-classification` task, check out its [dedic This is only a subset of the supported models. Find the model that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=zero-shot-classification&sort=trending). -### API specification - -#### Request - -| Payload | | | -| :--- | :--- | :--- | -| **inputs*** | _object_ | The input text data, with candidate labels | -| **        text*** | _string_ | The text to classify | -| **        candidateLabels*** | _string[]_ | The set of possible class labels to classify the text into. | -| **parameters** | _object_ | Additional inference parameters for Zero Shot Classification | -| **        hypothesis_template** | _string_ | The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels. | -| **        multi_label** | _boolean_ | Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate. | - - -Some options can be configured by passing headers to the Inference API. Here are the available headers: - -| Headers | | | -| :--- | :--- | :--- | -| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | -| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | -| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. 
Read more about model availability [here](../overview#eligibility]). | - -For more information about Inference API headers, check out the parameters [guide](../parameters). - -#### Response - -| Body | | -| :--- | :--- | :--- | -| **(array)** | _object[]_ | Output is an array of objects. | -| **        label** | _string_ | The predicted class label. | -| **        score** | _number_ | The corresponding probability. | - - ### Using the API @@ -126,3 +93,36 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/ + +### API specification + +#### Request + +| Payload | | | +| :--- | :--- | :--- | +| **inputs*** | _object_ | The input text data, with candidate labels | +| **        text*** | _string_ | The text to classify | +| **        candidateLabels*** | _string[]_ | The set of possible class labels to classify the text into. | +| **parameters** | _object_ | Additional inference parameters for Zero Shot Classification | +| **        hypothesis_template** | _string_ | The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels. | +| **        multi_label** | _boolean_ | Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate. | + + +Some options can be configured by passing headers to the Inference API. Here are the available headers: + +| Headers | | | +| :--- | :--- | :--- | +| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). | +| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching]). | +| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility]). | + +For more information about Inference API headers, check out the parameters [guide](../parameters). + +#### Response + +| Body | | +| :--- | :--- | :--- | +| **(array)** | _object[]_ | Output is an array of objects. | +| **        label** | _string_ | The predicted class label. | +| **        score** | _number_ | The corresponding probability. | + diff --git a/scripts/api-inference/templates/task/audio-classification.handlebars b/scripts/api-inference/templates/task/audio-classification.handlebars index 5f866f728..9567f39ca 100644 --- a/scripts/api-inference/templates/task/audio-classification.handlebars +++ b/scripts/api-inference/templates/task/audio-classification.handlebars @@ -12,6 +12,10 @@ Audio classification is the task of assigning a label or class to a given audio. 
{{{tips.listModelsLink.audio-classification}}} +### Using the API + +{{{snippets.audio-classification}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Audio classification is the task of assigning a label or class to a given audio. #### Response {{{specs.audio-classification.output}}} - -### Using the API - -{{{snippets.audio-classification}}} diff --git a/scripts/api-inference/templates/task/automatic-speech-recognition.handlebars b/scripts/api-inference/templates/task/automatic-speech-recognition.handlebars index 008c65030..8e200fd2a 100644 --- a/scripts/api-inference/templates/task/automatic-speech-recognition.handlebars +++ b/scripts/api-inference/templates/task/automatic-speech-recognition.handlebars @@ -11,6 +11,10 @@ Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the t {{{tips.listModelsLink.automatic-speech-recognition}}} +### Using the API + +{{{snippets.automatic-speech-recognition}}} + ### API specification #### Request @@ -22,7 +26,3 @@ Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the t #### Response {{{specs.automatic-speech-recognition.output}}} - -### Using the API - -{{{snippets.automatic-speech-recognition}}} diff --git a/scripts/api-inference/templates/task/chat-completion.handlebars b/scripts/api-inference/templates/task/chat-completion.handlebars index f1274f5c5..fd2d189fa 100644 --- a/scripts/api-inference/templates/task/chat-completion.handlebars +++ b/scripts/api-inference/templates/task/chat-completion.handlebars @@ -13,6 +13,10 @@ This is a subtask of [`text-generation`](./text_generation) designed to generate {{{tips.listModelsLink.chat-completion}}} +### Using the API + +{{{snippets.chat-completion}}} + ### API specification #### Request @@ -33,6 +37,3 @@ For more information about streaming, check out [this guide](https://huggingface {{{specs.chat-completion.stream_output}}} -### Using the API - -{{{snippets.chat-completion}}} diff --git a/scripts/api-inference/templates/task/feature-extraction.handlebars b/scripts/api-inference/templates/task/feature-extraction.handlebars index 7e6f1b4be..adc28262d 100644 --- a/scripts/api-inference/templates/task/feature-extraction.handlebars +++ b/scripts/api-inference/templates/task/feature-extraction.handlebars @@ -13,6 +13,10 @@ Extracting features is useful for subtasks like sentence similarity, reranking a {{{tips.listModelsLink.feature-extraction}}} +### Using the API + +{{{snippets.feature-extraction}}} + ### API specification #### Request @@ -25,6 +29,3 @@ Extracting features is useful for subtasks like sentence similarity, reranking a {{{specs.feature-extraction.output}}} -### Using the API - -{{{snippets.feature-extraction}}} diff --git a/scripts/api-inference/templates/task/fill-mask.handlebars b/scripts/api-inference/templates/task/fill-mask.handlebars index 663d2ab9f..c9c131e22 100644 --- a/scripts/api-inference/templates/task/fill-mask.handlebars +++ b/scripts/api-inference/templates/task/fill-mask.handlebars @@ -12,6 +12,10 @@ Mask filling is the task of predicting the right word (token to be precise) in t {{{tips.listModelsLink.fill-mask}}} +### Using the API + +{{{snippets.fill-mask}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Mask filling is the task of predicting the right word (token to be precise) in t #### Response {{{specs.fill-mask.output}}} - -### Using the API - -{{{snippets.fill-mask}}} diff --git a/scripts/api-inference/templates/task/image-classification.handlebars 
b/scripts/api-inference/templates/task/image-classification.handlebars index abfa0a147..88461e7be 100644 --- a/scripts/api-inference/templates/task/image-classification.handlebars +++ b/scripts/api-inference/templates/task/image-classification.handlebars @@ -12,6 +12,10 @@ Image classification is the task of assigning a label or class to an entire imag {{{tips.listModelsLink.image-classification}}} +### Using the API + +{{{snippets.image-classification}}} + ### API specification #### Request @@ -24,6 +28,3 @@ Image classification is the task of assigning a label or class to an entire imag {{{specs.image-classification.output}}} -### Using the API - -{{{snippets.image-classification}}} diff --git a/scripts/api-inference/templates/task/image-segmentation.handlebars b/scripts/api-inference/templates/task/image-segmentation.handlebars index 8f81ad5d2..e4cec3a01 100644 --- a/scripts/api-inference/templates/task/image-segmentation.handlebars +++ b/scripts/api-inference/templates/task/image-segmentation.handlebars @@ -12,6 +12,10 @@ Image Segmentation divides an image into segments where each pixel in the image {{{tips.listModelsLink.image-segmentation}}} +### Using the API + +{{{snippets.image-segmentation}}} + ### API specification #### Request @@ -24,6 +28,3 @@ Image Segmentation divides an image into segments where each pixel in the image {{{specs.image-segmentation.output}}} -### Using the API - -{{{snippets.image-segmentation}}} diff --git a/scripts/api-inference/templates/task/image-to-image.handlebars b/scripts/api-inference/templates/task/image-to-image.handlebars index 258dec814..93d5f6f00 100644 --- a/scripts/api-inference/templates/task/image-to-image.handlebars +++ b/scripts/api-inference/templates/task/image-to-image.handlebars @@ -19,6 +19,10 @@ Use cases heavily depend on the model and the dataset it was trained on, but som {{{tips.listModelsLink.image-to-image}}} +### Using the API + +{{{snippets.image-to-image}}} + ### API specification #### Request @@ -31,6 +35,3 @@ Use cases heavily depend on the model and the dataset it was trained on, but som {{{specs.image-to-image.output}}} -### Using the API - -{{{snippets.image-to-image}}} diff --git a/scripts/api-inference/templates/task/object-detection.handlebars b/scripts/api-inference/templates/task/object-detection.handlebars index 5e90a3092..f3b4e085b 100644 --- a/scripts/api-inference/templates/task/object-detection.handlebars +++ b/scripts/api-inference/templates/task/object-detection.handlebars @@ -12,6 +12,10 @@ Object Detection models allow users to identify objects of certain defined class {{{tips.listModelsLink.object-detection}}} +### Using the API + +{{{snippets.object-detection}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Object Detection models allow users to identify objects of certain defined class #### Response {{{specs.object-detection.output}}} - -### Using the API - -{{{snippets.object-detection}}} diff --git a/scripts/api-inference/templates/task/question-answering.handlebars b/scripts/api-inference/templates/task/question-answering.handlebars index 101d00fcc..3ca4e93d3 100644 --- a/scripts/api-inference/templates/task/question-answering.handlebars +++ b/scripts/api-inference/templates/task/question-answering.handlebars @@ -12,6 +12,10 @@ Question Answering models can retrieve the answer to a question from a given tex {{{tips.listModelsLink.question-answering}}} +### Using the API + +{{{snippets.question-answering}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Question Answering models can 
retrieve the answer to a question from a given tex #### Response {{{specs.question-answering.output}}} - -### Using the API - -{{{snippets.question-answering}}} diff --git a/scripts/api-inference/templates/task/summarization.handlebars b/scripts/api-inference/templates/task/summarization.handlebars index 890487215..1df382189 100644 --- a/scripts/api-inference/templates/task/summarization.handlebars +++ b/scripts/api-inference/templates/task/summarization.handlebars @@ -12,6 +12,10 @@ Summarization is the task of producing a shorter version of a document while pre {{{tips.listModelsLink.summarization}}} +### Using the API + +{{{snippets.summarization}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Summarization is the task of producing a shorter version of a document while pre #### Response {{{specs.summarization.output}}} - -### Using the API - -{{{snippets.summarization}}} diff --git a/scripts/api-inference/templates/task/table-question-answering.handlebars b/scripts/api-inference/templates/task/table-question-answering.handlebars index 4ae8b53fc..087ff53bf 100644 --- a/scripts/api-inference/templates/task/table-question-answering.handlebars +++ b/scripts/api-inference/templates/task/table-question-answering.handlebars @@ -12,6 +12,10 @@ Table Question Answering (Table QA) is the answering a question about an informa {{{tips.listModelsLink.table-question-answering}}} +### Using the API + +{{{snippets.table-question-answering}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Table Question Answering (Table QA) is the answering a question about an informa #### Response {{{specs.table-question-answering.output}}} - -### Using the API - -{{{snippets.table-question-answering}}} diff --git a/scripts/api-inference/templates/task/text-classification.handlebars b/scripts/api-inference/templates/task/text-classification.handlebars index 99c3cabe8..123d1f92a 100644 --- a/scripts/api-inference/templates/task/text-classification.handlebars +++ b/scripts/api-inference/templates/task/text-classification.handlebars @@ -12,6 +12,10 @@ Text Classification is the task of assigning a label or class to a given text. S {{{tips.listModelsLink.text-classification}}} +### Using the API + +{{{snippets.text-classification}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Text Classification is the task of assigning a label or class to a given text. S #### Response {{{specs.text-classification.output}}} - -### Using the API - -{{{snippets.text-classification}}} diff --git a/scripts/api-inference/templates/task/text-generation.handlebars b/scripts/api-inference/templates/task/text-generation.handlebars index 85bbba97a..9720cc175 100644 --- a/scripts/api-inference/templates/task/text-generation.handlebars +++ b/scripts/api-inference/templates/task/text-generation.handlebars @@ -14,6 +14,10 @@ If you are interested in a Chat Completion task, which generates a response base {{{tips.listModelsLink.text-generation}}} +### Using the API + +{{{snippets.text-generation}}} + ### API specification #### Request @@ -33,7 +37,3 @@ If `stream` is `true`, generated tokens are returned as a stream, using Server-S For more information about streaming, check out [this guide](https://huggingface.co/docs/text-generation-inference/conceptual/streaming). 
{{{specs.text-generation.stream_output}}} - -### Using the API - -{{{snippets.text-generation}}} diff --git a/scripts/api-inference/templates/task/text-to-image.handlebars b/scripts/api-inference/templates/task/text-to-image.handlebars index 6e6ffd0c6..ac65056e6 100644 --- a/scripts/api-inference/templates/task/text-to-image.handlebars +++ b/scripts/api-inference/templates/task/text-to-image.handlebars @@ -12,6 +12,10 @@ Generate an image based on a given text prompt. {{{tips.listModelsLink.text-to-image}}} +### Using the API + +{{{snippets.text-to-image}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Generate an image based on a given text prompt. #### Response {{{specs.text-to-image.output}}} - -### Using the API - -{{{snippets.text-to-image}}} diff --git a/scripts/api-inference/templates/task/token-classification.handlebars b/scripts/api-inference/templates/task/token-classification.handlebars index 44f682145..4a627783f 100644 --- a/scripts/api-inference/templates/task/token-classification.handlebars +++ b/scripts/api-inference/templates/task/token-classification.handlebars @@ -12,6 +12,10 @@ Token classification is a task in which a label is assigned to some tokens in a {{{tips.listModelsLink.token-classification}}} +### Using the API + +{{{snippets.token-classification}}} + ### API specification #### Request @@ -32,6 +36,3 @@ For more information about streaming, check out [this guide](https://huggingface {{{specs.token-classification.stream_output}}} -### Using the API - -{{{snippets.token-classification}}} diff --git a/scripts/api-inference/templates/task/translation.handlebars b/scripts/api-inference/templates/task/translation.handlebars index 02892102b..7cbede05d 100644 --- a/scripts/api-inference/templates/task/translation.handlebars +++ b/scripts/api-inference/templates/task/translation.handlebars @@ -12,6 +12,10 @@ Translation is the task of converting text from one language to another. {{{tips.listModelsLink.translation}}} +### Using the API + +{{{snippets.translation}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Translation is the task of converting text from one language to another. 
#### Response {{{specs.translation.output}}} - -### Using the API - -{{{snippets.translation}}} diff --git a/scripts/api-inference/templates/task/zero-shot-classification.handlebars b/scripts/api-inference/templates/task/zero-shot-classification.handlebars index fd631a656..e0e830e93 100644 --- a/scripts/api-inference/templates/task/zero-shot-classification.handlebars +++ b/scripts/api-inference/templates/task/zero-shot-classification.handlebars @@ -12,6 +12,10 @@ Zero-shot text classification is super useful to try out classification with zer {{{tips.listModelsLink.zero-shot-classification}}} +### Using the API + +{{{snippets.zero-shot-classification}}} + ### API specification #### Request @@ -23,7 +27,3 @@ Zero-shot text classification is super useful to try out classification with zer #### Response {{{specs.zero-shot-classification.output}}} - -### Using the API - -{{{snippets.zero-shot-classification}}} From 51d5e41c14926c360a5f4afefea1cce07ff2bee9 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 5 Sep 2024 10:43:47 +0200 Subject: [PATCH 3/3] custom link for finegrained token --- docs/api-inference/getting-started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api-inference/getting-started.md b/docs/api-inference/getting-started.md index 633cee968..31a77e4b3 100644 --- a/docs/api-inference/getting-started.md +++ b/docs/api-inference/getting-started.md @@ -6,7 +6,7 @@ We'll do a minimal example using a [sentiment classification model](https://hugg ## Getting a Token -Using the Serverless Inference API requires passing a user token in the request headers. You can get a token by signing up on the Hugging Face website and then going to the [tokens page](https://huggingface.co/settings/tokens). We recommend creating a `Fine-grained` token with the scope to `Make calls to the serverless Inference API`. +Using the Serverless Inference API requires passing a user token in the request headers. You can get a token by signing up on the Hugging Face website and then going to the [tokens page](https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained). We recommend creating a `Fine-grained` token with the scope to `Make calls to the serverless Inference API`. TODO: add screenshot For more details about user tokens, check out [this guide](https://huggingface.co/docs/hub/en/security-tokens).
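
As a quick sanity check before dropping a freshly created fine-grained token into the request headers, you can confirm that the token is accepted at all. This is a hedged sketch, not part of the patch above; it assumes the `whoami-v2` account endpoint is reachable with token authentication.

```bash
# Sketch: verify that a newly created fine-grained token is accepted.
# Replace hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx with your own token.
curl 'https://huggingface.co/api/whoami-v2' \
  -H "Authorization: Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
```

A 200 response with your account details means the token is valid; a 401 means it was mistyped or has been revoked. Note that this only checks authentication, not whether the token carries the `Make calls to the serverless Inference API` permission.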