Add florence2 model
* Add florence2-base model for all tasks

* Update annotator.rs
jamjamjon authored Sep 21, 2024
1 parent 5057c20 commit f0fd493
Showing 22 changed files with 2,124 additions and 110 deletions.
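For orientation before the full diff: a minimal sketch of driving the new Florence2 API for a single captioning task, distilled from examples/florence2/main.rs below. The model file names, Options setters, and Task variants are exactly those used in that example; trimming the dynamic-axis option lists down to the essentials is an assumption made here for brevity.

use usls::{models::Florence2, DataLoader, Options, Task};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One Options per ONNX sub-module (axis bounds omitted here; see the
    // complete example below for the real settings).
    let vision = Options::default().with_model("florence2/base-vision-encoder.onnx")?;
    let embed = Options::default()
        .with_model("florence2/base-embed-tokens.onnx")?
        .with_tokenizer("florence2/tokenizer.json")?;
    let encoder = Options::default().with_model("florence2/base-encoder.onnx")?;
    let decoder = Options::default().with_model("florence2/base-decoder.onnx")?;
    let decoder_merged = Options::default().with_model("florence2/base-decoder-merged.onnx")?;

    let mut model = Florence2::new(vision, embed, encoder, decoder, decoder_merged)?;
    let xs = [DataLoader::try_read("assets/bus.jpg")?];

    // Caption levels 0..=2 appear in the full example; 2 presumably
    // requests the most detailed caption.
    let ys = model.run_with_tasks(&xs, &[Task::Caption(2)])?;
    for (task, y) in ys.iter() {
        println!("{:?}: {:?}", task, y);
    }
    Ok(())
}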
3 changes: 1 addition & 2 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "usls"
version = "0.0.14"
version = "0.0.15"
edition = "2021"
description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
repository = "https://github.com/jamjamjon/usls"
@@ -22,7 +22,6 @@ dirs = { version = "5.0.1" }
ureq = { version = "2.9.1", default-features = true, features = [
"socks-proxy",
] }
-walkdir = { version = "2.5.0" } # TODO: remove
tokenizers = { version = "0.15.2" }
rayon = "1.10.0"
indicatif = "0.17.8"
5 changes: 4 additions & 1 deletion README.md
@@ -37,7 +37,7 @@
- **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10)
- **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
- **Vision Models**: [RTDETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [DB](https://arxiv.org/abs/1911.08947), [SVTR](https://arxiv.org/abs/2205.00159), [Depth-Anything-v1-v2](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569)
-- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World)
+- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242)

<details>
<summary>Click to expand Supported Models</summary>
@@ -71,6 +71,9 @@
| [MODNet](https://github.com/ZHKKKe/MODNet) | Image Matting | [demo](examples/modnet) |||||
| [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) ||| | |
| [Sapiens](https://github.com/facebookresearch/sapiens/tree/main) | Body Part Segmentation | [demo](examples/sapiens) ||| | |
+| [Florence2](https://arxiv.org/abs/2311.06242) | A Variety of Vision Tasks | [demo](examples/florence2) ||| | |

</details>

2 changes: 1 addition & 1 deletion examples/blip/main.rs
@@ -10,7 +10,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// textual
let options_textual = Options::default()
.with_model("blip/textual-base.onnx")?
-    // .with_tokenizer("blip/tokenizer.json")?
+    .with_tokenizer("blip/tokenizer.json")?
.with_i00((1, 1, 4).into()) // input_id: batch
.with_i01((1, 1, 4).into()) // input_id: seq_len
.with_i10((1, 1, 4).into()) // attention_mask: batch
16 changes: 7 additions & 9 deletions examples/dataloader/main.rs
@@ -18,26 +18,24 @@ fn main() -> anyhow::Result<()> {

// build dataloader
let dl = DataLoader::new(
// "images/bus.jpg", // remote image
// "../images", // image folder
// "../demo.mp4", // local video
// "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", // remote video
// "rtsp://admin:[email protected]:554/h265/ch1/", // rtsp h264 stream
"./assets/bus.jpg", // local image
// "images/bus.jpg", // remote image
// "../images", // image folder
// "../demo.mp4", // local video
// "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", // remote video
// "rtsp://admin:[email protected]:554/h265/ch1/", // rtsp h264 stream
)?
.with_batch(1)
.with_progress_bar(true)
.with_bound(100)
.build()?;
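// Note on the builder settings above: with_batch(1) yields one image per
// iteration, and with_bound(100) appears to cap the loader's internal
// frame queue (an inference from the name; this diff does not define it).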

-// // build annotator
+// build annotator
let annotator = Annotator::new()
.with_bboxes_thickness(4)
.with_saveout("YOLO-DataLoader");

// run
for (xs, _) in dl {
-    // std::thread::sleep(std::time::Duration::from_millis(1000));
+    // std::thread::sleep(std::time::Duration::from_millis(100));
let ys = model.forward(&xs, false)?;
annotator.annotate(&xs, &ys);
}
252 changes: 252 additions & 0 deletions examples/florence2/main.rs
@@ -0,0 +1,252 @@
use usls::{models::Florence2, Annotator, DataLoader, Options, Task};

fn main() -> Result<(), Box<dyn std::error::Error>> {
// vision encoder
let options_vision_encoder = Options::default()
.with_model("florence2/base-vision-encoder.onnx")?
.with_i00((1, 2, 4).into())
.with_i02((512, 768, 800).into())
.with_i03((512, 768, 800).into())
.with_profile(false)
.with_cuda(0);
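// Note: the with_iNM(...) setters used throughout this example pin dynamic
// ONNX input axes: N is the input index, M the axis index, and the
// (min, opt, max) triple bounds that axis. E.g. with_i02((512, 768, 800))
// constrains input 0, axis 2 (image height) to 512..=800, preferring 768.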

// text embed
let options_text_embed = Options::default()
.with_model("florence2/base-embed-tokens.onnx")?
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 20).into()) // seq_length
.with_tokenizer("florence2/tokenizer.json")?
.with_profile(false);

// transformer encoder
let options_encoder = Options::default()
.with_model("florence2/base-encoder.onnx")?
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 300).into()) // encoder_sequence_length
.with_i10((1, 2, 4).into())
.with_i11((1, 2, 300).into()) // encoder_sequence_length
.with_profile(false);

// transformer decoder
let options_decoder = Options::default()
.with_model("florence2/base-decoder.onnx")?
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 300).into()) // encoder_sequence_length
.with_i10((1, 2, 4).into())
.with_i11((1, 2, 300).into()) // encoder_sequence_length
.with_i20((1, 2, 4).into())
.with_i21((1, 2, 300).into()) // encoder_sequence_length
.with_profile(false);
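// The merged decoder below also carries a KV cache: each of the 6 decoder
// layers contributes four past_key_values inputs (decoder.key / decoder.value
// from self-attention, encoder.key / encoder.value from cross-attention),
// hence the long run of paired setters bounding the batch and sequence axes.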

// transformer decoder merged
let options_decoder_merged = Options::default()
.with_model("florence2/base-decoder-merged.onnx")?
// encoder_attention_mask
.with_i00((1, 2, 4).into())
.with_i01((1, 2, 300).into()) // encoder_sequence_length
// encoder_hidden_states
.with_i10((1, 2, 4).into())
.with_i11((1, 2, 300).into()) // encoder_sequence_length
// inputs_embeds
.with_i20((1, 2, 4).into())
.with_i21((1, 2, 300).into()) // encoder_sequence_length
// past_key_values.0.decoder.key
.with_i30((1, 2, 4).into())
.with_i32_((1, 2, 1).into())
// past_key_values.0.decoder.value
.with_i40((1, 2, 4).into())
.with_i42((1, 2, 1).into())
// past_key_values.0.encoder.key
.with_i50((1, 2, 4).into())
.with_i52((1, 2, 1).into())
// past_key_values.0.encoder.value
.with_i60((1, 2, 4).into())
.with_i62((1, 2, 1).into())
// past_key_values.1.decoder.key
.with_i70((1, 2, 4).into())
.with_i72((1, 2, 1).into())
// past_key_values.1.decoder.value
.with_i80((1, 2, 4).into())
.with_i82((1, 2, 1).into())
// past_key_values.1.encoder.key
.with_i90((1, 2, 4).into())
.with_i92((1, 2, 1).into())
// past_key_values.1.encoder.value
.with_i100((1, 2, 4).into())
.with_i102((1, 2, 1).into())
// past_key_values.2.decoder.key
.with_i110((1, 2, 4).into())
.with_i112((1, 2, 1).into())
// past_key_values.2.decoder.value
.with_i120((1, 2, 4).into())
.with_i122((1, 2, 1).into())
// past_key_values.2.encoder.key
.with_i130((1, 2, 4).into())
.with_i132((1, 2, 1).into())
// past_key_values.2.encoder.value
.with_i140((1, 2, 4).into())
.with_i142((1, 2, 1).into())
// past_key_values.3.decoder.key
.with_i150((1, 2, 4).into())
.with_i152((1, 2, 1).into())
// past_key_values.3.decoder.value
.with_i160((1, 2, 4).into())
.with_i162((1, 2, 1).into())
// past_key_values.3.encoder.key
.with_i170((1, 2, 4).into())
.with_i172((1, 2, 1).into())
// past_key_values.3.encoder.value
.with_i180((1, 2, 4).into())
.with_i182((1, 2, 1).into())
// past_key_values.4.decoder.key
.with_i190((1, 2, 4).into())
.with_i192((1, 2, 1).into())
// past_key_values.4.decoder.value
.with_i200((1, 2, 4).into())
.with_i202((1, 2, 1).into())
// past_key_values.4.encoder.key
.with_i210((1, 2, 4).into())
.with_i212((1, 2, 1).into())
// past_key_values.4.encoder.value
.with_i220((1, 2, 4).into())
.with_i222((1, 2, 1).into())
// past_key_values.5.decoder.key
.with_i230((1, 2, 4).into())
.with_i232((1, 2, 1).into())
// past_key_values.5.decoder.value
.with_i240((1, 2, 4).into())
.with_i242((1, 2, 1).into())
// past_key_values.5.encoder.key
.with_i250((1, 2, 4).into())
.with_i252((1, 2, 1).into())
// past_key_values.5.encoder.value
.with_i260((1, 2, 4).into())
.with_i262((1, 2, 1).into())
// use_cache_branch
.with_i270((1, 2, 1).into())
.with_profile(false);

// build model
let mut model = Florence2::new(
options_vision_encoder,
options_text_embed,
options_encoder,
options_decoder,
options_decoder_merged,
)?;

// load images
let xs = [
// DataLoader::try_read("florence2/car.jpg")?, // for testing region-related tasks
DataLoader::try_read("florence2/car.jpg")?,
// DataLoader::try_read("images/db.png")?,
DataLoader::try_read("assets/bus.jpg")?,
];

// region-related tasks
let quantizer = usls::Quantizer::default();
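// Florence2's region tasks take box coordinates as discrete location bins
// rather than raw pixels; the Quantizer maps a pixel-space (x1, y1, x2, y2)
// box into that bin space using the image's width and height.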
// let coords = [449., 270., 556., 372.]; // wheel
let coords = [31., 156., 581., 373.]; // car
let (width_car, height_car) = (xs[0].width(), xs[0].height());
let quantized_coords = quantizer.quantize(&coords, (width_car as _, height_car as _));

// run with tasks
let ys = model.run_with_tasks(
&xs,
&[
// tasks without extra inputs
Task::Caption(0),
Task::Caption(1),
Task::Caption(2),
Task::Ocr,
Task::OcrWithRegion,
Task::RegionProposal,
Task::ObjectDetection,
Task::DenseRegionCaption,
// tasks with text or region inputs
Task::OpenSetDetection("a vehicle".into()),
Task::CaptionToPhraseGrounding(
"A vehicle with two wheels parked in front of a building.".into(),
),
Task::ReferringExpressionSegmentation("a vehicle".into()),
Task::RegionToSegmentation(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
Task::RegionToCategory(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
Task::RegionToDescription(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
],
)?;

// annotator
let annotator = Annotator::new()
.without_bboxes_conf(true)
.with_bboxes_thickness(3)
.with_saveout_subs(&["Florence2"]);
for (task, ys_) in ys.iter() {
match task {
Task::Caption(_)
| Task::Ocr
| Task::RegionToCategory(..)
| Task::RegionToDescription(..) => {
println!("Task: {:?}\n{:?}\n", task, ys_)
}
Task::DenseRegionCaption => {
let annotator = annotator.clone().with_saveout("Dense-Region-Caption");
annotator.annotate(&xs, ys_);
}
Task::RegionProposal => {
let annotator = annotator
.clone()
.without_bboxes_name(false)
.with_saveout("Region-Proposal");

annotator.annotate(&xs, ys_);
}
Task::ObjectDetection => {
let annotator = annotator.clone().with_saveout("Object-Detection");
annotator.annotate(&xs, ys_);
}
Task::OpenSetDetection(_) => {
let annotator = annotator.clone().with_saveout("Open-Set-Detection");
annotator.annotate(&xs, ys_);
}
Task::CaptionToPhraseGrounding(_) => {
let annotator = annotator
.clone()
.with_saveout("Caption-To-Phrase-Grounding");
annotator.annotate(&xs, ys_);
}
Task::ReferringExpressionSegmentation(_) => {
let annotator = annotator
.clone()
.with_saveout("Referring-Expression-Segmentation");
annotator.annotate(&xs, ys_);
}
Task::RegionToSegmentation(..) => {
let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
annotator.annotate(&xs, ys_);
}
Task::OcrWithRegion => {
let annotator = annotator.clone().with_saveout("Ocr-With-Region");
annotator.annotate(&xs, ys_);
}

_ => (),
}
}

Ok(())
}
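To try it after checkout, the repository's usual pattern for examples should apply: cargo run -r --example florence2 (assuming this example is registered like the existing ones, e.g. examples/blip, with the ONNX and tokenizer files resolved via with_model and with_tokenizer as shown above).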