huggingface · merveenoyan · Jan 15, 2025 · Jan 13, 2025 · Jan 15, 2025 · Jan 15, 2025
@@ -39,8 +39,8 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "A solid model of audio source separation.",
-			id: "speechbrain/sepformer-wham",
+			description: "A robust model to compress speech.",
+			id: "stabilityai/stable-codec-speech-16k",
 		},
 		{
 			description: "A speech enhancement model.",

@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "The famous BERT model.",
-			id: "google-bert/bert-base-uncased",
+			description: "State-of-the-art masked language model.",
+			id: "answerdotai/ModernBERT-large",
 		},
 		{
 			description: "A multilingual model trained on 100 languages.",

@@ -74,9 +74,8 @@ const taskData: TaskDataCustom = {
 	],
 	spaces: [
 		{
-			// TO DO: write description
-			description: "An application that classifies what a given image is about.",
-			id: "nielsr/perceiver-image-classification",
+			description: "A leaderboard to evaluate different image classification models.",
+			id: "timm/leaderboard",
 		},
 	],
 	summary:

@@ -43,15 +43,20 @@ const taskData: TaskDataCustom = {
 			id: "facebook/dino-vitb16",
 		},
 		{
-			description: "Strong image feature extraction model made for information retrieval from documents.",
-			id: "vidore/colpali",
+			description: "Cutting-edge image feature extraction model.",
+			id: "apple/aimv2-large-patch14-336-distilled",
 		},
 		{
 			description: "Strong image feature extraction model that can be used on images and documents.",
 			id: "OpenGVLab/InternViT-6B-448px-V1-2",
 		},
 	],
-	spaces: [],
+	spaces: [
+		{
+			description: "A leaderboard to evaluate different image-feature-extraction models on classification performance.",
+			id: "timm/leaderboard",
+		},
+	],
 	summary: "Image feature extraction is the task of extracting features learnt in a computer vision model.",
 	widgetModels: [],
 };

@@ -24,12 +24,16 @@ Vision language models trained on image-text pairs can be used for visual questi
 
 ### Document Question Answering and Retrieval
 
-Documents often consist of different layouts, charts, tables, images, and more. Vision language models trained on formatted documents can extract information from them. This is an OCR-free approach; the inputs skip OCR, and documents are directly fed to vision language models.
+Documents often consist of different layouts, charts, tables, images, and more. Vision language models trained on formatted documents can extract information from them. This is an OCR-free approach; the inputs skip OCR, and documents are directly fed to vision language models. To find the relevant documents to feed to the vision language model, retrieval models like [ColPali](https://huggingface.co/blog/manu/colpali) are used. An example workflow can be found [here](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb).
 
 ### Image Recognition with Instructions
 
 Vision language models can recognize images through descriptions. When given detailed descriptions of specific entities, they can classify the entities in an image.
 
+### Computer Use
+
+Image-text-to-text models can be used to control computers with agentic workflows. Models like [ShowUI](https://huggingface.co/showlab/ShowUI-2B) and [OmniParser](https://huggingface.co/microsoft/OmniParser) parse screenshots so that actions can then be taken on the computer autonomously.
+
 ## Inference
 
 You can use the Transformers library to interact with [vision-language models](https://huggingface.co/models?pipeline_tag=image-text-to-text&transformers). Specifically, `pipeline` makes it easy to run inference with these models.
@@ -82,7 +86,8 @@ curl https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision
 ## Useful Resources
 
 - [Vision Language Models Explained](https://huggingface.co/blog/vlms)
-- [Open-source Multimodality and How to Achieve it using Hugging Face](https://www.youtube.com/watch?v=IoGaGfU1CIg&t=601s)
-- [Introducing Idefics2: A Powerful 8B Vision-Language Model for the community](https://huggingface.co/blog/idefics2)
+- [Welcome PaliGemma 2 – New vision language models by Google](https://huggingface.co/blog/paligemma2)
+- [SmolVLM - small yet mighty Vision Language Model](https://huggingface.co/blog/smolvlm)
+- [Multimodal RAG using ColPali and Qwen2-VL](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb)
 - [Image-text-to-text task guide](https://huggingface.co/tasks/image-text-to-text)
 - [Preference Optimization for Vision Language Models with TRL](https://huggingface.co/blog/dpo_vlm)
@@ -7,8 +7,8 @@ const taskData: TaskDataCustom = {
 			id: "liuhaotian/LLaVA-Instruct-150K",
 		},
 		{
-			description: "Conversation turns where questions involve image and text.",
-			id: "liuhaotian/LLaVA-Pretrain",
+			description: "Collection of image-text pairs on scientific topics.",
+			id: "DAMO-NLP-SG/multimodal_textbook",
 		},
 		{
 			description: "A collection of datasets made for model fine-tuning.",
@@ -43,11 +43,15 @@ const taskData: TaskDataCustom = {
 	metrics: [],
 	models: [
 		{
-			description: "Powerful vision language model with great visual understanding and reasoning capabilities.",
-			id: "meta-llama/Llama-3.2-11B-Vision-Instruct",
+			description: "Small and efficient yet powerful vision language model.",
+			id: "HuggingFaceTB/SmolVLM-Instruct",
 		},
 		{
-			description: "Cutting-edge vision language models.",
+			description: "A screenshot understanding model used to control computers.",
+			id: "showlab/ShowUI-2B",
+		},
+		{
+			description: "Cutting-edge vision language model.",
 			id: "allenai/Molmo-7B-D-0924",
 		},
 		{
@@ -59,8 +63,8 @@ const taskData: TaskDataCustom = {
 			id: "Qwen/Qwen2-VL-7B-Instruct",
 		},
 		{
-			description: "Strong image-text-to-text model.",
-			id: "mistralai/Pixtral-12B-2409",
+			description: "Image-text-to-text model with reasoning capabilities.",
+			id: "Qwen/QVQ-72B-Preview",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
@@ -84,14 +88,18 @@ const taskData: TaskDataCustom = {
 			description: "An image-text-to-text application focused on documents.",
 			id: "stepfun-ai/GOT_official_online_demo",
 		},
-		{
-			description: "An application to compare outputs of different vision language models.",
-			id: "merve/compare_VLMs",
-		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",
 			id: "GanymedeNil/Qwen2-VL-7B",
 		},
+		{
+			description: "An application that parses screenshots into actions.",
+			id: "showlab/ShowUI",
+		},
+		{
+			description: "An application that detects gaze.",
+			id: "smoondream/gaze-demo",
+		},
 	],
 	summary:
 		"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",

@@ -41,8 +41,8 @@ const taskData: TaskDataCustom = {
 			id: "hwjiang/Real3D",
 		},
 		{
-			description: "Generative 3D gaussian splatting model.",
-			id: "ashawkey/LGM",
+			description: "Consistent image-to-3d generation model.",
+			id: "stabilityai/stable-point-aware-3d",
 		},
 	],
 	spaces: [
@@ -55,8 +55,8 @@ const taskData: TaskDataCustom = {
 			id: "TencentARC/InstantMesh",
 		},
 		{
-			description: "Image-to-3D demo with mesh outputs.",
-			id: "stabilityai/TripoSR",
+			description: "Image-to-3D demo.",
+			id: "stabilityai/stable-point-aware-3d",
 		},
 		{
 			description: "Image-to-3D demo with mesh outputs.",

@@ -10,6 +10,10 @@ const taskData: TaskDataCustom = {
 			description: "Multiple images of celebrities, used for facial expression translation",
 			id: "huggan/CelebA-faces",
 		},
+		{
+			description: "12M image-caption pairs.",
+			id: "Spawning/PD12M",
+		},
 	],
 	demo: {
 		inputs: [
@@ -54,16 +58,20 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description:
-				"A model that creates a set of variations of the input image in the style of DALL-E using Stable Diffusion.",
-			id: "lambdalabs/sd-image-variations-diffusers",
+				"A model for applying edits to images through image controls.",
+			id: "Yuanshi/OminiControl",
 		},
 		{
 			description: "A model that generates images based on segments in the input image and the text prompt.",
 			id: "mfidabel/controlnet-segment-anything",
 		},
 		{
-			description: "A model that takes an image and an instruction to edit the image.",
-			id: "timbrooks/instruct-pix2pix",
+			description: "Strong model for inpainting and outpainting.",
+			id: "black-forest-labs/FLUX.1-Fill-dev",
+		},
+		{
+			description: "Strong model for image editing using depth maps.",
+			id: "black-forest-labs/FLUX.1-Depth-dev-lora",
 		},
 	],
 	spaces: [

@@ -132,7 +132,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 	"video-classification": ["transformers"],
 	"mask-generation": ["transformers"],
 	"multiple-choice": ["transformers"],
-	"object-detection": ["transformers", "transformers.js"],
+	"object-detection": ["transformers", "transformers.js", "ultralytics"],
 	other: [],
 	"question-answering": ["adapter-transformers", "allennlp", "transformers", "transformers.js"],
 	robotics: [],

@@ -31,6 +31,10 @@ const taskData: TaskDataCustom = {
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",
 		},
+		{
+			description: "Powerful keypoint detection model used to detect human pose.",
+			id: "usyd-community/vitpose-plus-base",
+		},
 	],
 	spaces: [
 		{

@@ -54,6 +54,10 @@ const taskData: TaskDataCustom = {
 			description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
 			id: "PekingU/rtdetr_r18vd_coco_o365",
 		},
+		{
+			description: "Object detection model for low-lying objects.",
+			id: "StephanST/WALDO30",
+		},
 	],
 	spaces: [
 		{
@@ -65,8 +69,8 @@ const taskData: TaskDataCustom = {
 			id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
 		},
 		{
-			description: "An application that shows multiple cutting edge techniques for object detection and tracking.",
-			id: "kadirnar/torchyolo",
+			description: "A cutting-edge object detection application.",
+			id: "Ultralytics/YOLO11",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",

@@ -69,9 +69,13 @@ const taskData: TaskDataCustom = {
 			id: "sentence-transformers/all-mpnet-base-v2",
 		},
 		{
-			description: "A multilingual robust sentence similarity model..",
+			description: "A multilingual robust sentence similarity model.",
 			id: "BAAI/bge-m3",
 		},
+		{
+			description: "A robust sentence similarity model.",
+			id: "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5",
+		},
 	],
 	spaces: [
 		{

@@ -3,25 +3,25 @@ import type { TaskDataCustom } from "../index.js";
 const taskData: TaskDataCustom = {
 	datasets: [
 		{
-			description: "A large multilingual dataset of text crawled from the web.",
-			id: "mc4",
+			description: "Multilingual dataset used to evaluate text generation models.",
+			id: "CohereForAI/Global-MMLU",
 		},
 		{
 			description:
-				"Diverse open-source data consisting of 22 smaller high-quality datasets. It was used to train GPT-Neo.",
-			id: "the_pile",
+				"High quality multilingual data used to train text-generation models.",
+			id: "HuggingFaceFW/fineweb-2",
 		},
 		{
 			description: "Truly open-source, curated and cleaned dialogue dataset.",
 			id: "HuggingFaceH4/ultrachat_200k",
 		},
 		{
-			description: "An instruction dataset with preference ratings on responses.",
-			id: "openbmb/UltraFeedback",
+			description: "A multilingual instruction dataset with preference ratings on responses.",
+			id: "allenai/llama-3.1-tulu-3-8b-preference-mixture",
 		},
 		{
 			description: "A large synthetic dataset for alignment of text generation models.",
-			id: "argilla/magpie-ultra-v0.1",
+			id: "HuggingFaceTB/smoltalk",
 		},
 	],
 	demo: {
@@ -63,20 +63,20 @@ const taskData: TaskDataCustom = {
 			id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
 		},
 		{
-			description: "Small yet powerful text generation model.",
-			id: "microsoft/Phi-3-mini-4k-instruct",
+			description: "Powerful text generation model.",
+			id: "microsoft/phi-4",
 		},
 		{
-			description: "A very powerful model that can solve mathematical problems.",
-			id: "AI-MO/NuminaMath-7B-TIR",
+			description: "A very powerful model with reasoning capabilities.",
+			id: "PowerInfer/SmallThinker-3B-Preview",
 		},
 		{
 			description: "Strong text generation model to follow instructions.",
 			id: "Qwen/Qwen2.5-7B-Instruct",
 		},
 		{
-			description: "Very strong open-source large language model.",
-			id: "nvidia/Llama-3.1-Nemotron-70B-Instruct",
+			description: "Text generation model used to write code.",
+			id: "Qwen/Qwen2.5-Coder-32B-Instruct",
 		},
 	],
 	spaces: [

@@ -10,6 +10,10 @@ const taskData: TaskDataCustom = {
 			description: "Conceptual Captions is a dataset consisting of ~3.3M images annotated with captions.",
 			id: "conceptual_captions",
 		},
+		{
+			description: "12M image-caption pairs.",
+			id: "Spawning/PD12M",
+		},
 	],
 	demo: {
 		inputs: [

@@ -12,7 +12,7 @@ const taskData: TaskDataCustom = {
 			id: "mythicinfinity/libritts_r",
 		},
 		{
-			description: "Mulit-lingual dataset.",
+			description: "Multi-lingual dataset.",
 			id: "facebook/multilingual_librispeech",
 		},
 	],
@@ -48,26 +48,26 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A massively multi-lingual TTS model.",
-			id: "coqui/XTTS-v2",
+			id: "fishaudio/fish-speech-1.5",
 		},
 		{
 			description: "A powerful TTS model.",
-			id: "amphion/MaskGCT",
+			id: "OuteAI/OuteTTS-0.1-350M",
 		},
 		{
-			description: "A Llama based TTS model.",
-			id: "OuteAI/OuteTTS-0.1-350M",
+			description: "Small yet powerful TTS model.",
+			id: "hexgrad/Kokoro-82M",
 		},
 	],
 	spaces: [
 		{
-			description: "An application for generate highly realistic, multilingual speech.",
-			id: "suno/bark",
+			description: "An application for generating high-quality speech in different languages.",
+			id: "hexgrad/Kokoro-TTS",
 		},
 		{
 			description:
-				"An application on XTTS, a voice generation model that lets you clone voices into different languages.",
-			id: "coqui/xtts",
+				"A multilingual text-to-speech application.",
+			id: "fishaudio/fish-speech-1",
 		},
 		{
 			description: "An application that generates speech in different styles in English and Chinese.",