From 3fd87ec30a6541d984cbceffd8cfcdcae4fb3ac9 Mon Sep 17 00:00:00 2001 From: Omar Sanseviero Date: Tue, 2 Apr 2024 13:40:39 +0200 Subject: [PATCH] Update download count explanation (#1261) * Update models-download-stats.md * Update docs/hub/models-download-stats.md Co-authored-by: Julien Chaumond * Update docs/hub/models-download-stats.md Co-authored-by: Julien Chaumond * Update models-download-stats.md * Apply suggestions from code review Co-authored-by: Pedro Cuenca --------- Co-authored-by: Julien Chaumond Co-authored-by: Pedro Cuenca --- docs/hub/models-download-stats.md | 187 +++++++----------------------- 1 file changed, 41 insertions(+), 146 deletions(-) diff --git a/docs/hub/models-download-stats.md b/docs/hub/models-download-stats.md index 4acfa9785..c7e993acd 100644 --- a/docs/hub/models-download-stats.md +++ b/docs/hub/models-download-stats.md @@ -2,156 +2,51 @@ ## How are download stats generated for models? -Counting the number of downloads for models is not a trivial task as a single model repository might contain multiple files, including multiple model weight files (e.g., with sharded models), and different formats depending on the library. To avoid double counting downloads (e.g., counting a single download of a model as multiple downloads), the Hub uses a set of query files that are employed for download counting. No information is sent from the user, and no additional calls are made for this. The count is done server-side as we serve files for downloads. +Counting the number of downloads for models is not a trivial task, as a single model repository might contain multiple files, including multiple model weight files (e.g., with sharded models) and different formats depending on the library (GGUF, PyTorch, TensorFlow, etc.). To avoid double counting downloads (e.g., counting a single download of a model as multiple downloads), the Hub uses a set of query files that are employed for download counting. 
No information is sent from the user, and no additional calls are made for this. The count is done server-side as the Hub serves files for downloads. -Every HTTP request to these files, including `GET` and `HEAD` will be counted as a download. By default, when no library is specified, the Hub uses `config.json` as the default query file. Otherwise, the query file depends on each library, and the Hub might examine files such as `pytorch_model.bin` and `adapter_config.json`. +Every HTTP request to these files, including `GET` and `HEAD`, will be counted as a download. By default, when no library is specified, the Hub uses `config.json` as the default query file. Otherwise, the query file depends on each library, and the Hub might examine files such as `pytorch_model.bin` or `adapter_config.json`. ## Which are the query files for different libraries? -By default, the Hub looks at `config.json`, `config.yaml`, `hyperparams.yaml`, and `meta.yaml`. For the following set of libraries, there are specific query files +By default, the Hub looks at `config.json`, `config.yaml`, `hyperparams.yaml`, and `meta.yaml`. Some libraries override these defaults by specifying their own filter (via `countDownloads`). The code that defines these overrides is [open-source](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries.ts). For example, for the `nemo` library, all files with the `.nemo` extension are used to count downloads. 
-```json -{ - "adapter-transformers": { - filter: [ - { - term: { path: "adapter_config.json" }, - }, - ], - }, - "asteroid": { - filter: [ - { - term: { path: "pytorch_model.bin" }, - }, - ], - }, - "flair": { - filter: [ - { - term: { path: "pytorch_model.bin" }, - }, - ], - }, - "keras": { - filter: [ - { - term: { path: "saved_model.pb" }, - }, - ], - }, - "ml-agents": { - filter: [ - { - wildcard: { path: "*.onnx" }, - }, - ], - }, - "nemo": { - filter: [ - { - wildcard: { path: "*.nemo" }, - }, - ], - }, - "open_clip": { - filter: [ - { - wildcard: { path: "*pytorch_model.bin" }, - }, - ], - }, - "sample-factory": { - filter: [ - { - term: { path: "cfg.json" }, - }, - ], - }, - "paddlenlp": { - filter: [ - { - term: { path: "model_config.json" }, - }, - ], - }, - "speechbrain": { - filter: [ - { - term: { path: "hyperparams.yaml" }, - }, - ], - }, - "sklearn": { - filter: [ - { - term: { path: "sklearn_model.joblib" }, - }, - ], - }, - "spacy": { - filter: [ - { - wildcard: { path: "*.whl" }, - }, - ], - }, - "stanza": { - filter: [ - { - term: { path: "models/default.zip" }, - }, - ], - }, - "stable-baselines3": { - filter: [ - { - wildcard: { path: "*.zip" }, - }, - ], - }, - "timm": { - filter: [ - { - terms: { path: ["pytorch_model.bin", "model.safetensors"] }, - }, - ], - }, - "diffusers": { - /// Filter out nested safetensors and pickle weights to avoid double counting downloads from the diffusers lib - must_not: [ - { - wildcard: { path: "*/*.safetensors" }, - }, - { - wildcard: { path: "*/*.bin" }, - }, - ], - /// Include documents that match at least one of the following rules - should: [ - /// Downloaded from diffusers lib - { - term: { path: "model_index.json" }, - }, - /// Direct downloads (LoRa, Auto1111 and others) - { - wildcard: { path: "*.safetensors" }, - }, - { - wildcard: { path: "*.ckpt" }, - }, - { - wildcard: { path: "*.bin" }, - }, - ], - minimum_should_match: 1, - }, - "peft": { - filter: [ - { - term: { path: "adapter_config.json" }, 
- }, - ], - } +## Can I add my query files for my library? + +Yes, you can open a Pull Request [here](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries.ts). Here is a minimal [example](https://github.com/huggingface/huggingface.js/pull/561/files) adding download metrics for Grok-1. + +## How are `GGUF` files handled? + +GGUF files are self-contained and are not tied to a single library, so all of them are counted for downloads. This will double count downloads if a user clones a whole repository, but most users and interfaces download a single GGUF file for a given repo. + +## How is `diffusers` handled? + +The `diffusers` library is an edge case and has its filter configured in the internal codebase. The filter ensures repos tagged as `diffusers` count both files loaded via the library and files downloaded through UIs that require users to manually download the top-level safetensors. + +``` +filter: [ + { + bool: { + /// Include documents that match at least one of the following rules + should: [ + /// Downloaded from diffusers lib + { + term: { path: "model_index.json" }, + }, + /// Direct downloads (LoRa, Auto1111 and others) + /// Filter out nested safetensors and pickle weights to avoid double counting downloads from the diffusers lib + { + regexp: { path: "[^/]*\\.safetensors" }, + }, + { + regexp: { path: "[^/]*\\.ckpt" }, + }, + { + regexp: { path: "[^/]*\\.bin" }, + }, + ], + minimum_should_match: 1, + }, + }, + ] } ```