Skip to content

Commit

Permalink
Updated data from scicom run
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Dec 13, 2024
1 parent dfeb31f commit 9aae2bf
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 47 deletions.
100 changes: 55 additions & 45 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ stages:
outs:
- path: data/eidc_metadata.json
hash: md5
md5: fb338ea98ce71bf7f002be952b6db0e1
size: 12275265
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
prepare:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
Expand All @@ -33,140 +33,150 @@ stages:
deps:
- path: data/eidc_metadata.json
hash: md5
md5: fb338ea98ce71bf7f002be952b6db0e1
size: 12275265
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
- path: scripts/extract_metadata.py
hash: md5
md5: e66f21369c5106eaaad4476612c6fb5e
size: 1313
outs:
- path: data/extracted_metadata.json
hash: md5
md5: 9f4fc9cb1e8af8e0f2d1c95b311989fc
size: 4616342
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json data/supporting-docs.json -m
0
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 9f4fc9cb1e8af8e0f2d1c95b311989fc
size: 4616342
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
- path: data/supporting-docs.json
hash: md5
md5: 0b14da8f2e73dc8e15747f693c0f70ce
size: 72383140
md5: e2581aff9abe25942e8009214b88d0a5
size: 72680213
- path: scripts/chunk_data.py
hash: md5
md5: 3ad449140b03e1c2904b22a5b401a12e
size: 2705
outs:
- path: data/chunked_data.json
hash: md5
md5: b107dfb052c12ea47b04a5176e8bab4a
size: 176342129
md5: 9893d839409c8cf4561e99ab5f747f20
size: 177068127
create-embeddings:
cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
-m all-MiniLM-L6-v2
deps:
- path: data/chunked_data.json
hash: md5
md5: b107dfb052c12ea47b04a5176e8bab4a
size: 176342129
md5: 9893d839409c8cf4561e99ab5f747f20
size: 177068127
- path: scripts/create_embeddings.py
hash: md5
md5: 87bd2ed6373552bea229c9f3465fd3db
size: 1594
md5: b0d8f7cb90f244e709656b1f38723e2d
size: 1552
outs:
- path: data/embeddings.json
hash: md5
md5: 68a9de7fcf765be8ae2f4d3ff6537228
size: 3739724900
md5: 5c8ca3cdde4d5bc559fa2e701ff090a8
size: 3754990368
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em
all-MiniLM-L6-v2 -c eidc-data
deps:
- path: data/embeddings.json
hash: md5
md5: 68a9de7fcf765be8ae2f4d3ff6537228
size: 3739724900
md5: 5c8ca3cdde4d5bc559fa2e701ff090a8
size: 3754990368
- path: scripts/upload_to_docstore.py
hash: md5
md5: 930456cedd43723c1d643ad90c146952
size: 2793
outs:
- path: data/chroma-data
hash: md5
md5: 486d560a81dc951bdd85772996e62f00.dir
size: 1815042692
md5: c06796220fbfe9db3b08b8439edf87b4.dir
size: 3081399131
nfiles: 6
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py -i data/eidc_rag_testset.csv -o data/evaluation_data.csv -ds
data/chroma-data -c eidc-data -m llama3.1 -p data/pipeline.yml
deps:
- path: data/chroma-data
hash: md5
md5: 486d560a81dc951bdd85772996e62f00.dir
size: 1815042692
md5: c06796220fbfe9db3b08b8439edf87b4.dir
size: 3081399131
nfiles: 6
- path: data/eidc_rag_testset.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
md5: 90d23c9bfcaddf9f152109a7b51e3151
size: 149155
- path: scripts/run_rag_pipeline.py
hash: md5
md5: 2d6dc886728d4bd46676ecd1882f1fd1
size: 5838
md5: 35eb5f65605242094a1581b92e9b2ef4
size: 5862
outs:
- path: data/evaluation_data.csv
hash: md5
md5: a473732be874c8256f7178ef3f4dc7a9
size: 9576
md5: 1b5f226c52d70bda7e2551d7778c1e89
size: 385945
- path: data/pipeline.yml
hash: md5
md5: 8e3c4e49d4d97f613e83468d010a96e9
size: 3440
md5: 70385a724cdf687c287596b8360e1448
size: 3334
generate-testset:
cmd: head -n 101 data/synthetic-datasets/eidc_rag_test_sample.csv > data/eidc_rag_testset.csv
cmd: python scripts/generate_synthetic_testset.py data/extracted_metadata.json
data/eidc_rag_testset.csv 200
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
- path: scripts/generate_synthetic_testset.py
hash: md5
md5: fdac8b2f28de8f3b4e5025ca47bb94ca
size: 2175
outs:
- path: data/eidc_rag_testset.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
md5: 90d23c9bfcaddf9f152109a7b51e3151
size: 149155
fetch-supporting-docs:
cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: fb338ea98ce71bf7f002be952b6db0e1
size: 12275265
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
- path: scripts/fetch_supporting_docs.py
hash: md5
md5: 02b94a2cc7bff711784cbdec3650b618
size: 1718
outs:
- path: data/supporting-docs.json
hash: md5
md5: 0b14da8f2e73dc8e15747f693c0f70ce
size: 72383140
md5: e2581aff9abe25942e8009214b88d0a5
size: 72680213
evaluate:
cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
-img data/eval.png
deps:
- path: data/evaluation_data.csv
hash: md5
md5: a473732be874c8256f7178ef3f4dc7a9
size: 9576
md5: 1b5f226c52d70bda7e2551d7778c1e89
size: 385945
- path: scripts/evaluate.py
hash: md5
md5: 4154acf8e74c1d8bcd0b0da72af038e0
size: 2728
outs:
- path: data/eval.png
hash: md5
md5: 7bfd424fa4c9a3550d6e9605bb2f6af2
size: 89143
md5: 3308b984c5168a996805443d25697026
size: 83001
- path: data/metrics.json
hash: md5
md5: f768092fe2696328ff4da565e763e743
size: 270
md5: 709909482614d6cb47c160506088f53e
size: 287
2 changes: 1 addition & 1 deletion params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ files:
pipeline: data/pipeline.yml
sub-sample: 0 # sample n datasets for testing (0 will use all datasets)
max-length: 0 # truncate longer texts for testing (0 will use all data)
test-set-size: 101 # reduce the size of the test set for faster testing
test-set-size: 200 # reduce the size of the test set for faster testing
rag:
model: llama3.1
prompt: >-
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ dependencies = [
"nbformat == 5.10.4",
"langchain == 0.2.7",
"pygit2 == 1.14.1",
"pysqlite3 == 0.5.4",
"pysqlite3-binary == 0.5.4",
]

[project.optional-dependencies]
Expand Down

0 comments on commit 9aae2bf

Please sign in to comment.