Add florence2 model
* Add florence2-base model for all tasks

* Update annotator.rs
jamjamjon authored Sep 21, 2024
1 parent 5057c20 commit f0fd493
Showing 22 changed files with 2,124 additions and 110 deletions.
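For orientation before the full diff: a minimal sketch of driving the new Florence2 API for a single captioning task, distilled from examples/florence2/main.rs below. The model file names, Options setters, and Task variants are exactly those used in that example; trimming the dynamic-axis option lists down to the essentials is an assumption made here for brevity.

use usls::{models::Florence2, DataLoader, Options, Task};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One Options per ONNX sub-module (axis bounds omitted here; see the
    // complete example below for the real settings).
    let vision = Options::default().with_model("florence2/base-vision-encoder.onnx")?;
    let embed = Options::default()
        .with_model("florence2/base-embed-tokens.onnx")?
        .with_tokenizer("florence2/tokenizer.json")?;
    let encoder = Options::default().with_model("florence2/base-encoder.onnx")?;
    let decoder = Options::default().with_model("florence2/base-decoder.onnx")?;
    let decoder_merged = Options::default().with_model("florence2/base-decoder-merged.onnx")?;

    let mut model = Florence2::new(vision, embed, encoder, decoder, decoder_merged)?;
    let xs = [DataLoader::try_read("assets/bus.jpg")?];

    // Caption levels 0..=2 appear in the full example; 2 presumably
    // requests the most detailed caption.
    let ys = model.run_with_tasks(&xs, &[Task::Caption(2)])?;
    for (task, y) in ys.iter() {
        println!("{:?}: {:?}", task, y);
    }
    Ok(())
}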
3 changes: 1 addition & 2 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "usls"
version = "0.0.14"
version = "0.0.15"
edition = "2021"
description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
repository = "https://github.com/jamjamjon/usls"
@@ -22,7 +22,6 @@ dirs = { version = "5.0.1" }
ureq = { version = "2.9.1", default-features = true, features = [
"socks-proxy",
] }
-walkdir = { version = "2.5.0" } # TODO: remove
tokenizers = { version = "0.15.2" }
rayon = "1.10.0"
indicatif = "0.17.8"
5 changes: 4 additions & 1 deletion README.md
@@ -37,7 +37,7 @@
- **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10)
- **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
- **Vision Models**: [RTDETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [DB](https://arxiv.org/abs/1911.08947), [SVTR](https://arxiv.org/abs/2205.00159), [Depth-Anything-v1-v2](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569)
-- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World)
+- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242)

<details>
<summary>Click to expand Supported Models</summary>
@@ -71,6 +71,9 @@
| [MODNet](https://github.com/ZHKKKe/MODNet) | Image Matting | [demo](examples/modnet) |||||
| [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) ||| | |
| [Sapiens](https://github.com/facebookresearch/sapiens/tree/main) | Body Part Segmentation | [demo](examples/sapiens) ||| | |
+| [Florence2](https://arxiv.org/abs/2311.06242) | A Variety of Vision Tasks | [demo](examples/florence2) ||| | |

</details>

2 changes: 1 addition & 1 deletion examples/blip/main.rs
@@ -10,7 +10,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// textual
let options_textual = Options::default()
.with_model("blip/textual-base.onnx")?
-    // .with_tokenizer("blip/tokenizer.json")?
+    .with_tokenizer("blip/tokenizer.json")?
.with_i00((1, 1, 4).into()) // input_id: batch
.with_i01((1, 1, 4).into()) // input_id: seq_len
.with_i10((1, 1, 4).into()) // attention_mask: batch
16 changes: 7 additions & 9 deletions examples/dataloader/main.rs
@@ -18,26 +18,24 @@ fn main() -> anyhow::Result<()> {

// build dataloader
let dl = DataLoader::new(
// "images/bus.jpg", // remote image
// "../images", // image folder
// "../demo.mp4", // local video
// "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", // remote video
// "rtsp://admin:[email protected]:554/h265/ch1/", // rtsp h264 stream
"./assets/bus.jpg", // local image
// "images/bus.jpg", // remote image
// "../images", // image folder
// "../demo.mp4", // local video
// "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", // remote video
// "rtsp://admin:[email protected]:554/h265/ch1/", // rtsp h264 stream
)?
.with_batch(1)
.with_progress_bar(true)
.with_bound(100)
.build()?;
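// Note on the builder settings above: with_batch(1) yields one image per
// iteration, and with_bound(100) appears to cap the loader's internal
// frame queue (an inference from the name; this diff does not define it).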

-// // build annotator
+// build annotator
let annotator = Annotator::new()
.with_bboxes_thickness(4)
.with_saveout("YOLO-DataLoader");

// run
for (xs, _) in dl {
-    // std::thread::sleep(std::time::Duration::from_millis(1000));
+    // std::thread::sleep(std::time::Duration::from_millis(100));
let ys = model.forward(&xs, false)?;
annotator.annotate(&xs, &ys);
}
252 changes: 252 additions & 0 deletions examples/florence2/main.rs
@@ -0,0 +1,252 @@
use usls::{models::Florence2, Annotator, DataLoader, Options, Task};

fn main() -> Result<(), Box<dyn std::error::Error>> {
// vision encoder
let options_vision_encoder = Options::default()
.with_model("florence2/base-vision-encoder.onnx")?
.with_i00((1, 2, 4).into())
.with_i02((512, 768, 800).into())
.with_i03((512, 768, 800).into())
.with_profile(false)
.with_cuda(0);
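// Note: the with_iNM(...) setters used throughout this example pin dynamic
// ONNX input axes: N is the input index, M the axis index, and the
// (min, opt, max) triple bounds that axis. E.g. with_i02((512, 768, 800))
// constrains input 0, axis 2 (image height) to 512..=800, preferring 768.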

// text embed
let options_text_embed = Options::default()
.with_model("florence2/base-embed-tokens.onnx")?
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 20).into()) // seq_length
.with_tokenizer("florence2/tokenizer.json")?
.with_profile(false);

// transformer encoder
let options_encoder = Options::default()
.with_model("florence2/base-encoder.onnx")?
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 300).into()) // encoder_sequence_length
.with_i10((1, 2, 4).into())
.with_i11((1, 2, 300).into()) // encoder_sequence_length
.with_profile(false);

// transformer decoder
let options_decoder = Options::default()
.with_model("florence2/base-decoder.onnx")?
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 300).into()) // encoder_sequence_length
.with_i10((1, 2, 4).into())
.with_i11((1, 2, 300).into()) // encoder_sequence_length
.with_i20((1, 2, 4).into())
.with_i21((1, 2, 300).into()) // encoder_sequence_length
.with_profile(false);
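// The merged decoder below also carries a KV cache: each of the 6 decoder
// layers contributes four past_key_values inputs (decoder.key / decoder.value
// from self-attention, encoder.key / encoder.value from cross-attention),
// hence the long run of paired setters bounding the batch and sequence axes.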

// transformer decoder merged
let options_decoder_merged = Options::default()
.with_model("florence2/base-decoder-merged.onnx")?
// encoder_attention_mask
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 300).into()) // encoder_sequence_length
// encoder_hidden_states
.with_i10((1, 2, 4).into())
.with_i11((1, 2, 300).into()) // encoder_sequence_length
// inputs_embeds
.with_i20((1, 2, 4).into())
.with_i21((1, 2, 300).into()) // encoder_sequence_length
// past_key_values.0.decoder.key
.with_i30((1, 2, 4).into())
.with_i32_((1, 2, 1).into())
// past_key_values.0.decoder.value
.with_i40((1, 2, 4).into())
.with_i42((1, 2, 1).into())
// past_key_values.0.encoder.key
.with_i50((1, 2, 4).into())
.with_i52((1, 2, 1).into())
// past_key_values.0.encoder.value
.with_i60((1, 2, 4).into())
.with_i62((1, 2, 1).into())
// past_key_values.1.decoder.key
.with_i70((1, 2, 4).into())
.with_i72((1, 2, 1).into())
// past_key_values.1.decoder.value
.with_i80((1, 2, 4).into())
.with_i82((1, 2, 1).into())
// past_key_values.1.encoder.key
.with_i90((1, 2, 4).into())
.with_i92((1, 2, 1).into())
// past_key_values.1.encoder.value
.with_i100((1, 2, 4).into())
.with_i102((1, 2, 1).into())
// past_key_values.2.decoder.key
.with_i110((1, 2, 4).into())
.with_i112((1, 2, 1).into())
// past_key_values.2.decoder.value
.with_i120((1, 2, 4).into())
.with_i122((1, 2, 1).into())
// past_key_values.2.encoder.key
.with_i130((1, 2, 4).into())
.with_i132((1, 2, 1).into())
// past_key_values.2.encoder.value
.with_i140((1, 2, 4).into())
.with_i142((1, 2, 1).into())
// past_key_values.3.decoder.key
.with_i150((1, 2, 4).into())
.with_i152((1, 2, 1).into())
// past_key_values.3.decoder.value
.with_i160((1, 2, 4).into())
.with_i162((1, 2, 1).into())
// past_key_values.3.encoder.key
.with_i170((1, 2, 4).into())
.with_i172((1, 2, 1).into())
// past_key_values.3.encoder.value
.with_i180((1, 2, 4).into())
.with_i182((1, 2, 1).into())
// past_key_values.4.decoder.key
.with_i190((1, 2, 4).into())
.with_i192((1, 2, 1).into())
// past_key_values.4.decoder.value
.with_i200((1, 2, 4).into())
.with_i202((1, 2, 1).into())
// past_key_values.4.encoder.key
.with_i210((1, 2, 4).into())
.with_i212((1, 2, 1).into())
// past_key_values.4.encoder.value
.with_i220((1, 2, 4).into())
.with_i222((1, 2, 1).into())
// past_key_values.5.decoder.key
.with_i230((1, 2, 4).into())
.with_i232((1, 2, 1).into())
// past_key_values.5.decoder.value
.with_i240((1, 2, 4).into())
.with_i242((1, 2, 1).into())
// past_key_values.5.encoder.key
.with_i250((1, 2, 4).into())
.with_i252((1, 2, 1).into())
// past_key_values.5.encoder.value
.with_i260((1, 2, 4).into())
.with_i262((1, 2, 1).into())
// use_cache_branch
.with_i270((1, 2, 1).into())
.with_profile(false);

// build model
let mut model = Florence2::new(
options_vision_encoder,
options_text_embed,
options_encoder,
options_decoder,
options_decoder_merged,
)?;

// load images
let xs = [
// DataLoader::try_read("florence2/car.jpg")?, // for testing region-related tasks
DataLoader::try_read("florence2/car.jpg")?,
// DataLoader::try_read("images/db.png")?,
DataLoader::try_read("assets/bus.jpg")?,
];

// region-related tasks
let quantizer = usls::Quantizer::default();
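// Florence2's region tasks take box coordinates as discrete location bins
// rather than raw pixels; the Quantizer maps a pixel-space (x1, y1, x2, y2)
// box into that bin space using the image's width and height.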
// let coords = [449., 270., 556., 372.]; // wheel
let coords = [31., 156., 581., 373.]; // car
let (width_car, height_car) = (xs[0].width(), xs[0].height());
let quantized_coords = quantizer.quantize(&coords, (width_car as _, height_car as _));

// run with tasks
let ys = model.run_with_tasks(
&xs,
&[
// tasks without extra inputs
Task::Caption(0),
Task::Caption(1),
Task::Caption(2),
Task::Ocr,
Task::OcrWithRegion,
Task::RegionProposal,
Task::ObjectDetection,
Task::DenseRegionCaption,
// tasks with text or region inputs
Task::OpenSetDetection("a vehicle".into()),
Task::CaptionToPhraseGrounding(
"A vehicle with two wheels parked in front of a building.".into(),
),
Task::ReferringExpressionSegmentation("a vehicle".into()),
Task::RegionToSegmentation(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
Task::RegionToCategory(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
Task::RegionToDescription(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
],
)?;

// annotator
let annotator = Annotator::new()
.without_bboxes_conf(true)
.with_bboxes_thickness(3)
.with_saveout_subs(&["Florence2"]);
for (task, ys_) in ys.iter() {
match task {
Task::Caption(_)
| Task::Ocr
| Task::RegionToCategory(..)
| Task::RegionToDescription(..) => {
println!("Task: {:?}\n{:?}\n", task, ys_)
}
Task::DenseRegionCaption => {
let annotator = annotator.clone().with_saveout("Dense-Region-Caption");
annotator.annotate(&xs, ys_);
}
Task::RegionProposal => {
let annotator = annotator
.clone()
.without_bboxes_name(false)
.with_saveout("Region-Proposal");

annotator.annotate(&xs, ys_);
}
Task::ObjectDetection => {
let annotator = annotator.clone().with_saveout("Object-Detection");
annotator.annotate(&xs, ys_);
}
Task::OpenSetDetection(_) => {
let annotator = annotator.clone().with_saveout("Open-Set-Detection");
annotator.annotate(&xs, ys_);
}
Task::CaptionToPhraseGrounding(_) => {
let annotator = annotator
.clone()
.with_saveout("Caption-To-Phrase-Grounding");
annotator.annotate(&xs, ys_);
}
Task::ReferringExpressionSegmentation(_) => {
let annotator = annotator
.clone()
.with_saveout("Referring-Expression-Segmentation");
annotator.annotate(&xs, ys_);
}
Task::RegionToSegmentation(..) => {
let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
annotator.annotate(&xs, ys_);
}
Task::OcrWithRegion => {
let annotator = annotator.clone().with_saveout("Ocr-With-Region");
annotator.annotate(&xs, ys_);
}

_ => (),
}
}

Ok(())
}
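To try it after checkout, the repository's usual pattern for examples should apply: cargo run -r --example florence2 (assuming this example is registered like the existing ones, e.g. examples/blip, with the ONNX and tokenizer files resolved via with_model and with_tokenizer as shown above).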