diff --git a/dvc.lock b/dvc.lock index dc7c598..28837ad 100644 --- a/dvc.lock +++ b/dvc.lock @@ -10,8 +10,8 @@ stages: outs: - path: data/eidc_metadata.json hash: md5 - md5: fb338ea98ce71bf7f002be952b6db0e1 - size: 12275265 + md5: 413f59888bf033c30cc27b84b1a3f40b + size: 12313041 prepare: cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json deps: @@ -33,8 +33,8 @@ stages: deps: - path: data/eidc_metadata.json hash: md5 - md5: fb338ea98ce71bf7f002be952b6db0e1 - size: 12275265 + md5: 413f59888bf033c30cc27b84b1a3f40b + size: 12313041 - path: scripts/extract_metadata.py hash: md5 md5: e66f21369c5106eaaad4476612c6fb5e @@ -42,20 +42,20 @@ stages: outs: - path: data/extracted_metadata.json hash: md5 - md5: 9f4fc9cb1e8af8e0f2d1c95b311989fc - size: 4616342 + md5: 226225c5bd64e15d803ba88560810c5a + size: 4629991 chunk-data: cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json data/supporting-docs.json -m 0 deps: - path: data/extracted_metadata.json hash: md5 - md5: 9f4fc9cb1e8af8e0f2d1c95b311989fc - size: 4616342 + md5: 226225c5bd64e15d803ba88560810c5a + size: 4629991 - path: data/supporting-docs.json hash: md5 - md5: 0b14da8f2e73dc8e15747f693c0f70ce - size: 72383140 + md5: e2581aff9abe25942e8009214b88d0a5 + size: 72680213 - path: scripts/chunk_data.py hash: md5 md5: 3ad449140b03e1c2904b22a5b401a12e @@ -63,33 +63,33 @@ stages: outs: - path: data/chunked_data.json hash: md5 - md5: b107dfb052c12ea47b04a5176e8bab4a - size: 176342129 + md5: 9893d839409c8cf4561e99ab5f747f20 + size: 177068127 create-embeddings: cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json -m all-MiniLM-L6-v2 deps: - path: data/chunked_data.json hash: md5 - md5: b107dfb052c12ea47b04a5176e8bab4a - size: 176342129 + md5: 9893d839409c8cf4561e99ab5f747f20 + size: 177068127 - path: scripts/create_embeddings.py hash: md5 - md5: 87bd2ed6373552bea229c9f3465fd3db - size: 1594 + md5: b0d8f7cb90f244e709656b1f38723e2d + size: 1552 outs: - path: data/embeddings.json hash: md5 - md5: 68a9de7fcf765be8ae2f4d3ff6537228 - size: 3739724900 + md5: 5c8ca3cdde4d5bc559fa2e701ff090a8 + size: 3754990368 upload-to-docstore: cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em all-MiniLM-L6-v2 -c eidc-data deps: - path: data/embeddings.json hash: md5 - md5: 68a9de7fcf765be8ae2f4d3ff6537228 - size: 3739724900 + md5: 5c8ca3cdde4d5bc559fa2e701ff090a8 + size: 3754990368 - path: scripts/upload_to_docstore.py hash: md5 md5: 930456cedd43723c1d643ad90c146952 @@ -97,8 +97,8 @@ stages: outs: - path: data/chroma-data hash: md5 - md5: 486d560a81dc951bdd85772996e62f00.dir - size: 1815042692 + md5: c06796220fbfe9db3b08b8439edf87b4.dir + size: 3081399131 nfiles: 6 run-rag-pipeline: cmd: python scripts/run_rag_pipeline.py -i data/eidc_rag_testset.csv -o data/evaluation_data.csv -ds @@ -106,40 +106,50 @@ stages: deps: - path: data/chroma-data hash: md5 - md5: 486d560a81dc951bdd85772996e62f00.dir - size: 1815042692 + md5: c06796220fbfe9db3b08b8439edf87b4.dir + size: 3081399131 nfiles: 6 - path: data/eidc_rag_testset.csv hash: md5 - md5: a371d83c5822d256286e80d64d58c3fe - size: 7524 + md5: 90d23c9bfcaddf9f152109a7b51e3151 + size: 149155 - path: scripts/run_rag_pipeline.py hash: md5 - md5: 2d6dc886728d4bd46676ecd1882f1fd1 - size: 5838 + md5: 35eb5f65605242094a1581b92e9b2ef4 + size: 5862 outs: - path: data/evaluation_data.csv hash: md5 - md5: a473732be874c8256f7178ef3f4dc7a9 - size: 9576 + md5: 1b5f226c52d70bda7e2551d7778c1e89 + size: 385945 - path: data/pipeline.yml hash: md5 - md5: 8e3c4e49d4d97f613e83468d010a96e9 - size: 3440 + md5: 70385a724cdf687c287596b8360e1448 + size: 3334 generate-testset: - cmd: head -n 101 data/synthetic-datasets/eidc_rag_test_sample.csv > data/eidc_rag_testset.csv + cmd: python scripts/generate_synthetic_testset.py data/extracted_metadata.json + data/eidc_rag_testset.csv 200 + deps: + - path: data/extracted_metadata.json + hash: md5 + md5: 226225c5bd64e15d803ba88560810c5a + size: 4629991 + - path: scripts/generate_synthetic_testset.py + hash: md5 + md5: fdac8b2f28de8f3b4e5025ca47bb94ca + size: 2175 outs: - path: data/eidc_rag_testset.csv hash: md5 - md5: a371d83c5822d256286e80d64d58c3fe - size: 7524 + md5: 90d23c9bfcaddf9f152109a7b51e3151 + size: 149155 fetch-supporting-docs: cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json deps: - path: data/eidc_metadata.json hash: md5 - md5: fb338ea98ce71bf7f002be952b6db0e1 - size: 12275265 + md5: 413f59888bf033c30cc27b84b1a3f40b + size: 12313041 - path: scripts/fetch_supporting_docs.py hash: md5 md5: 02b94a2cc7bff711784cbdec3650b618 @@ -147,16 +157,16 @@ stages: outs: - path: data/supporting-docs.json hash: md5 - md5: 0b14da8f2e73dc8e15747f693c0f70ce - size: 72383140 + md5: e2581aff9abe25942e8009214b88d0a5 + size: 72680213 evaluate: cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json -img data/eval.png deps: - path: data/evaluation_data.csv hash: md5 - md5: a473732be874c8256f7178ef3f4dc7a9 - size: 9576 + md5: 1b5f226c52d70bda7e2551d7778c1e89 + size: 385945 - path: scripts/evaluate.py hash: md5 md5: 4154acf8e74c1d8bcd0b0da72af038e0 @@ -164,9 +174,9 @@ stages: outs: - path: data/eval.png hash: md5 - md5: 7bfd424fa4c9a3550d6e9605bb2f6af2 - size: 89143 + md5: 3308b984c5168a996805443d25697026 + size: 83001 - path: data/metrics.json hash: md5 - md5: f768092fe2696328ff4da565e763e743 - size: 270 + md5: 709909482614d6cb47c160506088f53e + size: 287 diff --git a/params.yaml b/params.yaml index b2947b8..b3dc785 100644 --- a/params.yaml +++ b/params.yaml @@ -19,7 +19,7 @@ files: pipeline: data/pipeline.yml sub-sample: 0 # sample n datasets for testing (0 will use all datasets) max-length: 0 # truncate longer texts for testing (0 will use all data) -test-set-size: 101 # reduce the size of the test set for faster testing +test-set-size: 200 # reduce the size of the test set for faster testing rag: model: llama3.1 prompt: >- diff --git a/pyproject.toml b/pyproject.toml index f1b6413..6e7914d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "nbformat == 5.10.4", "langchain == 0.2.7", "pygit2 == 1.14.1", - "pysqlite3 == 0.5.4", + "pysqlite3-binary == 0.5.4", ] [project.optional-dependencies]