From 23e1d9fd0294804f484543ff7d6c4be9793e8ac5 Mon Sep 17 00:00:00 2001 From: thanasornsawan Date: Thu, 19 Dec 2024 11:51:47 +0200 Subject: [PATCH] add code and update readme --- .github/workflows/ci_cd_pipeline.yml | 54 +++++++ .gitignore | 4 + Dockerfile | 22 +++ README.md | 63 ++++++++- docker-compose.yml | 16 +++ orders_test_data.xlsx | Bin 0 -> 9750 bytes requirements.txt | 5 + sql/Dockerfile | 7 + sql/sqlite_db/db_queries.py | 102 ++++++++++++++ sql/sqlite_db/setup_db.py | 39 ++++++ tests/load_data.py | 68 +++++++++ tests/test_etl.py | 201 +++++++++++++++++++++++++++ 12 files changed, 574 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/ci_cd_pipeline.yml create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 orders_test_data.xlsx create mode 100644 requirements.txt create mode 100644 sql/Dockerfile create mode 100644 sql/sqlite_db/db_queries.py create mode 100644 sql/sqlite_db/setup_db.py create mode 100644 tests/load_data.py create mode 100644 tests/test_etl.py diff --git a/.github/workflows/ci_cd_pipeline.yml b/.github/workflows/ci_cd_pipeline.yml new file mode 100644 index 0000000..5e547ba --- /dev/null +++ b/.github/workflows/ci_cd_pipeline.yml @@ -0,0 +1,54 @@ +name: ETL CI/CD Pipeline + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout the code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install Docker and Docker Compose + run: | + sudo apt-get update + sudo apt-get install -y docker.io + sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + + - name: Build and start Docker containers with docker-compose + run: | + docker-compose -f docker-compose.yml up -d + 
sleep 10 # Wait for the DB to start properly (adjust if needed) + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run database setup + run: python sql/sqlite_db/setup_db.py + + - name: Load data into database + run: python tests/load_data.py + + - name: Run tests + run: | + pytest tests/test_etl.py + continue-on-error: true + + - name: Clean up Docker containers + run: | + docker-compose -f docker-compose.yml down \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cd215b4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +etl.db +.venv +.pytest_cache \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..361dedd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +# Dockerfile + +# Use a base Python image +FROM python:3.9-slim + +# Set the working directory +WORKDIR /app + +# Copy the requirements file into the container +COPY requirements.txt . + +# Install the dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the project files into the container +COPY . . 
+ +# Set environment variables (if necessary) +ENV DB_PATH="/opt/airflow/sqlite_db/etl.db" + +# Command to run when the container starts +CMD ["pytest", "tests/test_etl.py"] diff --git a/README.md b/README.md index 0083a36..68ce4f5 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,59 @@ Handle missing or invalid values for Quantity (e.g., replace negative values wit ## Test Plan -| **Test Case ID** | **Test Case Description** | **Steps to Execute** | **Expected Result** | **Business Rule Compliance** | **Risk Level** | **Test Data** | -|------------------|-----------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------|------------------------------------------------------------------|----------------------------------|-----------------------------------------------------------| -| TC_01 | **Validate Customer_ID Uniqueness** | - Insert two orders with the same Customer_ID.
- Check if the system raises an error or rejects the second order. | **Failure**: The system should reject the second order with the same Customer_ID. | Duplicate Customer_ID violates uniqueness in the orders table. | **Critical** – Affects data integrity. | Customer_ID: 1234 (used for two orders)
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 | -| TC_02 | **Validate Correct Date Format** | - Insert an order with an invalid date format (e.g., `12/01/2024` for `Order_Date`).
- Attempt to save the order. | **Failure**: The system should reject the order due to incorrect date format. | The `Order_Date` must follow a standardized format. | **High** – Incorrect data can cause parsing issues and errors in reporting. | Customer_ID: 1234
Order_Date: "12/01/2024" (invalid format)
Product_ID: 567
Quantity: 2 | -| TC_03 | **Validate Missing Customer_Name** | - Insert an order with a missing `Customer_Name` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to missing customer name. | The `Customer_Name` field is mandatory for all orders. | **High** – Missing customer information affects order processing and analysis. | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 (Customer_Name: NULL) | -| TC_04 | **Validate Negative Quantity** | - Insert an order with a negative `Quantity` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to invalid quantity. | `Quantity` must always be a positive number. | **High** – Negative quantity violates business logic and can affect financial calculations. | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: -5 | -| TC_05 | **Validate Missing Order Date** | - Insert an order with a missing `Order_Date` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to missing order date. | `Order_Date` cannot be missing. | **Critical** – Missing order dates make the data unusable for time-based analysis. | Customer_ID: 1234
Customer_Name: "John Doe"
Product_ID: 567
Quantity: 2 (Order_Date: NULL) | \ No newline at end of file +| Test Case ID | Test Case Description | Steps to Execute | Expected Result | Risk Level | Test Data | +|--------------|------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------|----------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| TC_001 | **Validate Customer ID Uniqueness** | - Execute `validate_customer_id_unique` query.
- Fetch the results into a DataFrame.
- Check for any duplicate `Customer_ID`s. | **Failure**: The DataFrame should be empty, indicating no duplicates. | **Critical** – Affects data integrity | Customer_ID: 1234 (used for two orders)
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 | +| TC_002 | **Validate Correct Date Format** | - Execute `validate_order_date_format` query.
- Fetch the results into a DataFrame.
- Validate if the `Order_Date` is in the correct format (`dd/mm/yyyy`). | **Failure**: The DataFrame should have no invalid date formats. | **High** – Affects date parsing and reporting | Customer_ID: 1234
Order_Date: "12/01/2024" (invalid format)
Product_ID: 567
Quantity: 2 | +| TC_003 | **Validate Missing Customer Name** | - Execute `get_orders_with_missing_customer_name` query.
- Fetch the results into a DataFrame.
- Check for any missing `Customer_Name` values. | **Failure**: There should be no missing customer names. | **High** – Affects order processing | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 (Customer_Name: NULL) | +| TC_004 | **Validate Negative Quantity Orders** | - Execute `get_orders_with_negative_quantity` query.
- Fetch the results into a DataFrame.
- Check for negative `Quantity` values. | **Failure**: The DataFrame should have no rows with negative quantities. | **High** – Affects business logic and financial calculations | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: -5 | +| TC_005 | **Validate Order Date Range (December 2024 only)** | - Execute the query to fetch all `Order_ID` and `Order_Date` from the `Orders` table.
- Check each order's date format and ensure it's within the range `2024-12-01` to `2024-12-31`.
- Identify invalid or out-of-range dates. | **Failure**: Orders with `Order_Date` outside the range `2024-12-01` to `2024-12-31` should be flagged.
**Failure**: Orders with invalid date formats should be flagged. | **High** – Invalid or out-of-range dates can affect reporting and processing. | Customer_ID: 1234
Order_Date: "01/12/2024"
Product_ID: 567
Quantity: 10 (Valid date)
Customer_ID: 5678
Order_Date: "01/11/2024" (Out of range)
Customer_ID: 91011
Order_Date: "InvalidDate" (Invalid format) | | +| TC_006 | **Validate Invalid Email Format** | - Execute `get_invalid_email_customers` query.
- Fetch the results into a DataFrame.
- Check for invalid email formats. | **Failure**: The DataFrame should have no rows with invalid emails. | **High** – Affects customer communication | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2
Customer_Email: "invalid_email" | +| TC_007 | **Ensure Unique Product_ID in Order** | - Execute `get_orders_with_duplicate_product_id` query.
- Fetch the results into a DataFrame.
- Check for duplicate `Product_ID`s in orders. | **Failure**: The DataFrame should be empty, indicating no duplicates. | **Critical** – Affects data integrity | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567 (duplicate)
Quantity: 2 | +| TC_008 | **Ensure Product_Name Cannot Be NULL** | - Execute `get_orders_with_null_product_name` query.
- Fetch the results into a DataFrame.
- Check for any `NULL` values in `Product_Name`. | **Failure**: The DataFrame should have no rows with NULL `Product_Name`. | **High** – Affects order completeness | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2
Product_Name: NULL | +| TC_009 | **Validate Referential Integrity Between Orders and Products** | - Execute `get_invalid_product_references` query.
- Fetch the results into a DataFrame.
- Check for any `Product_ID` references that do not exist in Products. | **Failure**: The DataFrame should have no rows indicating invalid `Product_ID` references. | **Critical** – Affects data integrity | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 999 (non-existing)
Quantity: 2 | + +## To run a specific test case: +**Run by exact function name:** +```sh +pytest -s tests/test_etl.py::test_invalid_product_id +``` +This will run only the test_invalid_product_id test case in tests/test_etl.py. + +## Running the Project Locally + +**1.Create and Activate a Virtual Environment** +```sh +python3 -m venv venv +``` + +Activate the virtual environment: +```sh +source venv/bin/activate +``` + +**2.Install Project Dependencies** +```sh +pip install -r requirements.txt +``` + +**3.Docker step** +```sh +docker-compose down +docker-compose up -d +``` + +**4.Set Up the Database** +```sh +python sql/sqlite_db/setup_db.py +``` + +**5.Load Data into the Database** +```sh +python tests/load_data.py +``` + +**6.Run the test** +```sh +pytest tests/test_etl.py +``` \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..3e6ab56 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,16 @@ +version: '3.8' + +services: + sqlite_db: + build: ./sql # Path where your Dockerfile is located + container_name: sqlite_db + volumes: + - ./sql/sqlite_db:/opt/sqlite_db # Map the local folder to the container's folder + ports: + - "8081:8080" # Adjust if needed + networks: + - sqlite_network + +networks: + sqlite_network: + driver: bridge diff --git a/orders_test_data.xlsx b/orders_test_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c15865891c6d9fa159009346019e8f211d092935 GIT binary patch literal 9750 zcmeHtbySq!x<4JlpoD~yUvQA_E@coQwLc+qrLb`<=sg87wNH4CP zEZGe0^c>BsO~8M<;9zyKvV`<$SvRudVr~M;Z~2eed&ROKA{`!dLaS2%kAQCTC{juHbw96^LvfzdbdexhP7B9 z7nAYi_Ex9=vEiznUWo7hu1Vv`GZDdAPTbYOV!$GGT7+OGHuUkUXVC2RM>`L&;#`}{ zlYsN<4XAOb3%h4eM}10pZtOpVkRS2W_z>$Kq3;{d=44{#DT*kZAh&vWaRs9I@kTS2 z!x_e<#ZQH?`OoekrrMQiJBM}Jr-<5+Ddd}&D?l#oeg#17+DIcw$4$>NQ&bAiuJ&^N zgN3OA#tz<+`%IPv+S9q+MSqm(D9B)9Vd|TCqa!0BVWJ`-Dg3*t^$ zG+>0Dhb^sDs#QaK9ieeUcyk!Ct5eHtBPZH38sD=1{V-Dm0!i$)$$vH)1S{*6KR;c8 
z(>JYk7`r;Vx;qzDLd3*Q1!oe@?7oHW_GTlFd#VoITnpo8U+tdkO}#pv8y_EEUY~Mv zu5hkAVy~ML*)cBB*0y%^(5w@IxjpY?%ipRqa|SOLsXLZdYfdJLc5hpra)EQEtJO7o zbaL}VSy#$neve0X7150#uA&x_-62GRMd=Uz)BjyX*7(HH?kl$T9Tx&yY6xo1<$EjMU_Jv_0 z=SA{e@pYIU1t%2?iOuA_t==i;`iS!$!y>Mtl*}EwiTRbA@Evop!JU<~@pFmV8VOeD zR{Dv%`^(y+@*4F?sE1CHr6{Z8x4PPxnH*6uv8fu{I`_HI7MLTfiY-AaA!(y->n!q| zc{z!1=dfd{>mYx4{d9`zb(QD=kmC60471i1njIRyVSbo@*rYtR)`2+bS!#>l=o;?v zAQ^=@R^=ZKvVPY&H$XUxl_!38;5$8im(N=Efn)_fyb_6+5++$ub4)q`^DQjDo}S{f ze5G@0&mkmsTrBiC+lE$KR%&d{|HDmMP052ff3-n@dE4gvL40Tac|?cB+Pioh>{meiHc~$W>6;*ruru+C8OT0hK{7kD`|P0kk_p*ygpflui@j)=%{{G zyB(U#NU5bNT$$x$sEV8|$iF;E1*VkalMNA|Zxg1Z0E@bGAJLt%%Y~>@g?7m6yP#oo zWt|p2G48gFXUYdp!|GufyBz@vqS#yTI3X(B9ZeRST^J<8Gf7s2>M8q~&i+wHcZn<8y9(d$9_O z@opLJpT|pFbIleaW!IYC4@vPxY@E++4^W2Q+OfC7NYgp`ThW3$c!6fJB^-|o zDtXgVINa|J4#p%0lbLppwZ3C--OmsIAezFFTcm7a1Az#-!oLoUzc5LMP74Y-d>rZapCvth8bYi4-(}CkzSvO(bumA_H6keFVvNU|rYD1uL$Y zLxKd=;F?1f8Y`cm!-6CZe~DC$RHQ();FF3gc@CQy;88)xp$ZF#(E4Nvl)_p=OJnAM2*jJsTl49yNUR>z`RMy6 zuke-(LqjgcJLueccY7_?h;(%k^LuuH2fEqItInc|vu))3r z;3~IDI<^pC$_zyw@{Xe(Qb-2k^4Mo!O91Yf ziz|YH;&g`;Qh?+$CqcSKd z4ry2+11Q16orSFgcyHdQ0!oj2FszU@0?%(PsL>r($OdZi>}6p;2eg=rKLZuTeH>QE z1zPghXJcyumdwS~Kvi+G!wUI8H=ez0fQvo~3`Y+Du<&{gYKdbX=_>$+@kHfd8v$4> zywpKGake9UMZk|dM7h{zfTtE-8ld5cjP$n$)nSRGja;C44XM$}@{#@$);tIM+#m}I zE%@(P)-D=&uzx9afh9_nMK5vqE+diPv_U{ogduPPQG7p%WSB6;%~(Y~c&9>9M4;0| z3bA)TiE5bevzxDqeE25|Elv2U2Fa*k~FWeqkx|q3OE* zOQ}ov9|PH@LOeHqX8pB@A5uTFY6g4Q|Lw)OP6hh>yA!$h1?&FX6G=_^wjFQV(^76W z6OLk3@-~-v<^f(>eY5~+-d&xz^*h#}RoKOJ|4Qm-EKR;-eb=8^e>daS>2Ob7^|4yn z`OcZn6w-+|MSUeYDGUzOokq`ry>X`cVvV*8sjs{pvyTEd)}03Az&!g)Xz~E|R{j>K zT6gCs9*p{y^Y)d5S0dlNID^+&|3KRt!C_f{;T0GEf3q8Z0+p}XAz-RyR6FQvXrfj zC{(N6RAqBpV$wkjsQt&@@BeITqRUJBO77*c*joiy;Ap*=pAaAYU(2dZkDML~*joQw z<}iOOxE2gKfXca8(ASnlPEW33Iyw%(g!Ei%8VG0R9@~6t-gg>^7yhP|xW_BMIzcc> zTv^&&@-~mf5e}XPteiPpIlqW+-luqR~B_uCo#}4p-s*}LZS|06S~z5)M$(J4V{9|V@qVV6T2G@*ruVt=kySAloqba#vcMr zu}A>m+%@$~ed8O?4P~2ixmwA;lo}+eh!?d53>CsDRnM&IZ4Cl*hr`NBF$;v~kAf)L 
z`pG6Fz*x-)C>9SNgKOKduYO{NwtAb?9+Y>&*-XD&(Co4f(4roHpe?Dcw1pnHb3B%y783_YIbXsrp~ z#b?+JcN;az6NK4zOqZAlBFB5U7zeN8pkeL{e`|0op#a*(8jSMco!O4WcZ}T)`IfG9 zJhz-%X&yIH8^T6D@)=U_feIQV80!B|sIMC!w2sm6db0in~=mP9zLn$VMOz7wO<1o~I>RFW2Si^8aqlIkLH z+Ks6qR{Qnh5-BeWed(BqKIZW*#RA{00dZ4sm}!90r26)lG+uX1TkCUaBO{As8sCl&w|MU&zxwA+oTwyrT0W0VJFl% zCViO=Kv#UPuUg=Od}%drkrB6xFRX^~w^l>=r;o8WHL@~dyZXLrmG!8>tS6~&oqC<2 z&et@I)-Y5l7RIYmX{fH3u5%T9OKliK;Way}C3UrOllf#!Yovp+z~#rqRUg)sT}Os~ z(Ti+Y{&JSPb7xihk7Sj^j)<##!Ag!tR$yK4mbhT@zkQ9^`F)xH5y zEjMWJ?{g88I6sUzB3=@1QhMtAPCSI_wv}YKfjU>zTvWqEQeCT9=qGOKDq((OpN>Eu z!obuB)`;R;x7ZUHg8h7MG0ZO0?L0C@@KtT(c#nScN&EWxO{vw) zjJ1V5YJlC*l95x8cq=sXBsO6}itH?Mw+3Z(K5L`njT@u159-^hWrnnSBHH-E);F7D zC=mWdQ8j~%EF7Gu!+IuD8Fg^l%#;wNCpUKjBJ#y$2n3!s)o`u{yfTl_q%(icHrmia z4(FJyIDvF7hRuqz7{G7qD=N345LGDnVawUUKa&^PI6E=I*CrV-nO{^b@x6L4uC8vf z`#p?z&xIIA`Az;~9k#~jIZTZd_#BQp^o^y^RDFR@`g&sNC}gUNnCP>G76;m?SpG=L z1knyzC-TE0uBqt};{X^Lw#^zOcYp%F^%Xpjy;RQ&G{BcsG19f}J5Xx}{-Q&4kF=mxpx}9%5b^no)|U-u;XIe(ps;|L5U}=ublfo9fva87kY` zT|P6p?C%FhDq6tUiQalNoO$=TGQ-I9h+u1RmVJfR6pzOYDxQ}W_&U6QGJ5ZDKN9xd z6A~kGB73_LA{oo` zQL)l}qb9dHcu% zOuB%6jN0k7_NFz-)2E4w5@&5a^%@Ee5N>4t! 
zCkMl85gHA9UZv=kRw(?cTMVyZ%T&6!$wJsmq&Q{>8Y1qgV3{*#ycj}k%PMj zx@=cWr&TPwG+ld}ct50-WGWq6c*xx2+x)!^Hh4~#uU9OQUtuO1pAPz@$GPjS(`$9e zma^YIzl(ml>{WY|o_!FJEq1wan>SpU94{Zvp#5&+0RO#l_Rf|@S35TRtiu|{e&rh* zTxDSnL4sjKXy__?5tQ>NHA)0V_;Y`RsW2|Vg@{t0`%aDE#ifcqPk6gkLl$lzrIt&y znn>1F68KG@x6MCOlSI?X;KV&r+pV!(FzW_mu|CIUOp97aRePn}7g!*NAJW5dQZ?xC z2cNNO{fLR!yjU;k%;3*>vPJhwdEXI_*hF|Suk40ODCOBcD7PeGilds9PP}X7ux4Bl zE%D)A?>NmsTn=`R6rD2`SR&-}a$Y7IQfXR8P-JaW1)nE6J|StRuZE|8D_sgeiSQ43xR_7=?yXH?){G@n z)9{pJ`Xqw)9ex|w1C4VdA)54*yLkFb>CIz}i5MYo2EYxeLAYvx zTS!_`!}%G@H?iFp!l03LtFonD5o$Ka9Y(7I)PvKf?F|jMXIA5eF`pvUYxz2;YJ?ch z_}E(cYB@a>M{MWHZxqzf0Ae0h#rM)w9v4NOpj@7_I|@?dEGS4wFR_1j&TjsB&Mxy; zeH$B#t7m@^Q`TPWq`vvO`wv|5&{Kj48X}DX=;JYA=tJK@caA|5joX|1p5l|T#D%n` zYsY&XZB>!A0bJZ#sgtYG!Dw7rT6Ll zNV+TlKusDvA{^5#8TCWYG#F+=^rX5?5c|rr8EMmchGKG;WW-IHo^(>PWvBxW$nY(( zRHkA1o;Gb-hxCT5i$S+!Fl*%esaeXfZTZva!D=esisX2z?^cCJR15)T5hvm-;|t7b z7964xd)(azQr5RS+A1l?d&)#+W)ii8P>GAewJC3hwX+9zInsX_8k^ee*48a~yr>kw zX3ZAua|RLRXELYU+zqNa$?6(>2zo#R@aL0tnOu?mrX7hlaL9MLCz)$e57jPg-}tu= zX7BvDCx4b**LUVBGpvbtYTm?t>#cmBkpK8-A*3{lKp>u=z4Yl#>YM^2eHrn%vxDyt zUTz2K7*<)ec@d5+chll&H{2ShrWnC3A|iD>6s?V?D;nr*#~TO{e~AgQDObN6lCobi z^G4*yEaqbktHRLy%0Jf=yv1ZtwtCseu!~GF!!3#z_q_EPh8C55P+|A35Lr5z0>mf~ zqIktlZ~97IbMwe#aH8u{8i_LVT`wQ=&| zU#~bC{)BiY#obHf+@KtoDS`D-Ugecy zJNJurF7zV#Tg~BINc^d`|4yPG;(4*(LhrjsqK&^R+3LMZ{=xyY1WgR}n4Jw0lskL`B4rnlD`?jVnTuNFbVQPg0eFTAhYg5*%bz{cMZ zV=XRLUQeL!{YF&MaZxyf(u9oD&U7%1R;(*V(xyDpJPPBGwReBA?yiD+Oin+FTMCZu z#`-o@e5JL5(;p5={jF5Dbpq&QY}ju+2cFT1P5ZYpC)a03cnCkQxrpSLSf6c@sS7V| z@*pTMm&V(o0y9V9xC|3b5u^H_M5B~6-B*_iScYEL+jGG& zz_f%m9y6BDaNe`1D7bZO7(xYNak*lxf>}pMr3GFT;!_o(SS@`n42p1v>R{~We8hU@ z^@>=g6r^9Y)?-+^EB)w3sY)SG@|+q?6>x`oQ+UE>h?Uge>@IO5sKf_7@SCj@J);Tj z1iwEEX1rYD7vJ|GEe4L$N*{2mN3*JmcnBjrm=2j>{6IV zrNaz7eU!4dqd5tqnj`5nwX$!dW9~qJBPyHYKwxvdR6OUYK@8 zB_xwYH%SGf@|#y`qKWl`@jCO=@uPFbp5%eqI^)mS9WrY#^3izW}@+->qI`ZdbHC*JhKi}N7W&J3ie+9grFJG3Q zKdANg72vPM=&u0R^R~;h`UkO*T><=ElKm^f^@x0#-~FJ~3zvU!PvF0@yIXQeF-ICz0k?oa>J1($oGRO6s5d?ym!{Th!0G$EHC- 
Z`UhT2K?WV;5*={yalG*O0e@Wn`!6kFkbeLG literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ef47ed9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +# For setting up the environment +apache-airflow==2.5.0 # If using Airflow for orchestration +pandas==1.5.3 # For handling data manipulation +pytest==7.2.2 # For running tests +openpyxl==3.0.10 # For reading and writing Excel files (e.g., orders_test_data.xlsx) \ No newline at end of file diff --git a/sql/Dockerfile b/sql/Dockerfile new file mode 100644 index 0000000..f87ca23 --- /dev/null +++ b/sql/Dockerfile @@ -0,0 +1,7 @@ +FROM nouchka/sqlite3:latest + +# Set working directory to where the database will reside +WORKDIR /opt/sqlite_db + +# Initialize or create the SQLite database +RUN sqlite3 /opt/sqlite_db/etl.db "CREATE TABLE IF NOT EXISTS Orders (Order_ID INTEGER PRIMARY KEY AUTOINCREMENT, Product_Name TEXT, Quantity INTEGER);" diff --git a/sql/sqlite_db/db_queries.py b/sql/sqlite_db/db_queries.py new file mode 100644 index 0000000..f6b4372 --- /dev/null +++ b/sql/sqlite_db/db_queries.py @@ -0,0 +1,102 @@ +# Query to Validate Customer_ID Uniqueness +def validate_customer_id_unique(): + return """ + SELECT Customer_ID, Order_Date, COUNT(*) AS Order_Count + FROM Orders + GROUP BY Customer_ID, Order_Date + HAVING COUNT(*) > 1 + """ + +# Query to Validate Correct Date Format +def validate_order_date_format(): + return """ + SELECT Order_ID, Order_Date + FROM Orders + WHERE Order_Date IS NULL + OR NOT (Order_Date GLOB '????-??-??' 
+ AND LENGTH(Order_Date) = 10 + AND CAST(substr(Order_Date, 1, 4) AS INTEGER) > 0 + AND substr(Order_Date, 6, 2) BETWEEN '01' AND '12' + AND CASE + WHEN substr(Order_Date, 6, 2) IN ('01', '03', '05', '07', '08', '10', '12') THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '31' + WHEN substr(Order_Date, 6, 2) IN ('04', '06', '09', '11') THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '30' + WHEN substr(Order_Date, 6, 2) = '02' THEN ( + CASE + WHEN (CAST(substr(Order_Date, 1, 4) AS INTEGER) % 4 = 0 + AND CAST(substr(Order_Date, 1, 4) AS INTEGER) % 100 != 0) + OR CAST(substr(Order_Date, 1, 4) AS INTEGER) % 400 = 0 THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '29' + ELSE substr(Order_Date, 9, 2) BETWEEN '01' AND '28' + END + ) + ELSE 0 + END = 1 + ); + """ + +# Query to find orders with negative quantities +def get_orders_with_negative_quantity(): + return """ + SELECT Order_ID, Customer_ID, Product_ID, Quantity + FROM Orders + WHERE Quantity < 0 + """ + +# Query to find orders with missing Customer_Name +def get_orders_with_missing_customer_name(): + return """ + SELECT Order_ID, Customer_ID, Customer_Name, Product_ID, Quantity + FROM Orders + WHERE Customer_Name IS NULL + """ + +# Query to ensure unique Product_ID (no duplicates allowed in Orders) +def get_orders_with_duplicate_product_id(): + return """ + SELECT Product_ID, COUNT(*) + FROM Orders + GROUP BY Product_ID + HAVING COUNT(*) > 1 + """ + +# Query to ensure Product_Name cannot be NULL in Products +def get_orders_with_null_product_name(): + return """ + SELECT * + FROM Products + WHERE Product_Name IS NULL + """ + +# Query to get email customer in Orders +def get_invalid_email_customers(): + """ + Query to find customers with invalid email format. + Returns rows where the email does not match the expected pattern. 
+ """ + query = """ + SELECT * + FROM Orders + WHERE Email NOT LIKE '%_@__%.__%'; + """ + return query + +def get_orders_with_invalid_date_range(): + """ + Query to find orders where the Order_Date is outside the range '2024-01-01' to '2024-12-31'. + """ + query = """ + SELECT * + FROM Orders + WHERE Order_Date < '2024-01-01' OR Order_Date > '2024-12-31'; + """ + return query + +def get_invalid_product_references(): + """ + Returns the SQL query to check for invalid Product_ID references in the Orders table. + """ + return """ + SELECT o.Order_ID, o.Product_ID + FROM Orders o + LEFT JOIN Products p ON o.Product_ID = p.Product_ID + WHERE p.Product_ID IS NULL; + """ \ No newline at end of file diff --git a/sql/sqlite_db/setup_db.py b/sql/sqlite_db/setup_db.py new file mode 100644 index 0000000..6224754 --- /dev/null +++ b/sql/sqlite_db/setup_db.py @@ -0,0 +1,39 @@ +import sqlite3 + +# Path to SQLite database +DB_PATH = 'sql/sqlite_db/etl.db' + +# Establish a connection +conn = sqlite3.connect(DB_PATH) +cursor = conn.cursor() + +# Drop tables if they exist to ensure schema updates +cursor.execute('DROP TABLE IF EXISTS Orders;') +cursor.execute('DROP TABLE IF EXISTS Products;') + +# Create the Orders table with the updated schema (including Email column) +cursor.execute(''' + CREATE TABLE Orders ( + Order_ID INTEGER PRIMARY KEY, + Customer_ID INTEGER, + Customer_Name TEXT, + Order_Date TEXT, + Product_ID INTEGER, + Quantity INTEGER, + Email TEXT + ); +''') + +# Create the Products table +cursor.execute(''' + CREATE TABLE Products ( + Product_ID INTEGER PRIMARY KEY, + Product_Name TEXT + ); +''') + +# Commit changes and close the connection +conn.commit() +conn.close() + +print("Database and tables set up successfully.") diff --git a/tests/load_data.py b/tests/load_data.py new file mode 100644 index 0000000..ddb214b --- /dev/null +++ b/tests/load_data.py @@ -0,0 +1,68 @@ +import sqlite3 +import os +from openpyxl import load_workbook +from datetime import datetime +# Path to the SQLite database 
+DB_PATH = 'sql/sqlite_db/etl.db' +# Path to the Excel file (dynamically resolve the absolute path) +EXCEL_FILE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '../orders_test_data.xlsx')) + +def load_data_to_db(): + # Load the workbook and the 'Products' and 'Orders' sheets + wb = load_workbook(EXCEL_FILE_PATH) + + # Access the 'Products' and 'Orders' sheets + products_sheet = wb['Products'] + orders_sheet = wb['Orders'] + + # Establish a database connection + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + + # Insert data into Products table + for row in products_sheet.iter_rows(min_row=2, values_only=True): + cursor.execute(''' + INSERT OR IGNORE INTO Products (Product_ID, Product_Name) + VALUES (?, ?) + ''', (row[0], row[1])) + + # Insert data into Orders table + for row in orders_sheet.iter_rows(min_row=2, values_only=True): + # Check if the row is empty (all fields are empty or None) + if all(cell is None or cell == '' for cell in row): + continue # Skip the row if it's empty + + customer_id = row[0] + customer_name = row[1] + order_date = row[2] # Order_Date is assumed to be in the third column (index 2) + product_id = row[3] + quantity = row[4] + email = row[5] + + # Ensure that 'Order_Date' stays as a string, not a date object + if isinstance(order_date, str): + # If the order_date is in string format (like '12/01/2024'), keep it as is + order_date = order_date.strip() # Remove leading/trailing whitespace and newlines + elif isinstance(order_date, datetime): + # If the order_date is a datetime object, convert it to string + order_date = order_date.strftime('%d/%m/%Y') if order_date else None + else: + order_date = None # Set to None if the date format is invalid + + # Skip inserting rows where required data (such as order_date or customer_id) is invalid + if not customer_id or not order_date: + continue # Skip this row if customer_id or order_date is missing or invalid + + cursor.execute(''' + INSERT INTO Orders (Customer_ID, 
Customer_Name, Order_Date, Product_ID, Quantity, Email) + VALUES (?, ?, ?, ?, ?, ?) + ''', (customer_id, customer_name, order_date, product_id, quantity, email)) + + # Commit the changes and close the connection + conn.commit() + conn.close() + + print("Data loaded successfully from Excel to database.") + +if __name__ == '__main__': + load_data_to_db() diff --git a/tests/test_etl.py b/tests/test_etl.py new file mode 100644 index 0000000..1f235ea --- /dev/null +++ b/tests/test_etl.py @@ -0,0 +1,201 @@ +import pandas as pd +import sqlite3 +import pytest +import sys +import os +from datetime import datetime + +# Add the root directory of the project to the Python path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from sql.sqlite_db.db_queries import ( + validate_customer_id_unique, + validate_order_date_format, + get_orders_with_negative_quantity, + get_orders_with_missing_customer_name, + get_orders_with_duplicate_product_id, + get_orders_with_null_product_name, + get_invalid_email_customers, + get_orders_with_invalid_date_range, + get_invalid_product_references +) + +# Fixture to set up and tear down the SQLite connection +@pytest.fixture(scope="module") +def db_connection(): + # Ensure the database file exists + db_path = os.path.join(os.path.dirname(__file__), "../sql/sqlite_db/etl.db") + assert os.path.exists(db_path), f"Database file not found at {db_path}" + + conn = sqlite3.connect(db_path) + yield conn + conn.close() + +# Test case 1: Validate customer id unique +def test_customer_id_unique(db_connection): + query = validate_customer_id_unique() + df = pd.read_sql(query, db_connection) + + # If df is not empty, print the rows that have duplicates + if not df.empty: + print("\nDuplicate Customer_IDs found:") + print(df) + + # Assert that there are no duplicate orders for the same Customer_ID and Order_Date + assert df.empty, "Duplicate orders exist:\n" + df.to_string(index=False) + +def is_valid_date(date_str): + """Check 
if a date string is valid (dd/mm/yyyy).""" + try: + # Try parsing the date + datetime.strptime(date_str, '%d/%m/%Y') + return True + except ValueError: + return False + +# Test case 2: Validate date format dd/mm/yyyy format +def test_order_date_format(db_connection): + # Run the SQL query to fetch orders with invalid date formats + query = validate_order_date_format() # Your validation SQL query + df = pd.read_sql(query, db_connection) # Fetch the result into a DataFrame + + # Strip any unwanted characters like newlines + df['Order_Date'] = df['Order_Date'].str.replace(r'\n', '').str.strip() + + # Validate if the date is in the correct format and valid + invalid_dates = df[~df['Order_Date'].apply(is_valid_date)] + + # Print out any rows with invalid date formats + if not invalid_dates.empty: + print("Orders with invalid date format:", invalid_dates) + + # Assert that there are no invalid dates remaining + assert invalid_dates.empty, f"There are orders with invalid date formats: {invalid_dates}" + +# Test case 3: Validate Missing Customer Name +def test_missing_customer_name(db_connection): + query = get_orders_with_missing_customer_name() + df = pd.read_sql(query, db_connection) + + missing_customer_name = df['Customer_Name'].isnull().sum() # Count NaN/None values + print(f"Number of missing Customer_Name values: {missing_customer_name}") + + # Assert that there are no missing customer names (fail if there are any) + assert missing_customer_name == 0, f"There are orders with missing Customer_Name: {missing_customer_name}" + +# Test case 4: Validate Negative Quantity Orders +def test_negative_quantity(db_connection): + query = get_orders_with_negative_quantity() + df = pd.read_sql(query, db_connection) + + # Log for debugging + print("DataFrame loaded from the database:") + print(df) + print(f"Negative quantities found: {df[df['Quantity'] < 0]}") + + # Assert that there are NO negative quantities + negative_quantity_count = (df['Quantity'] < 0).sum() # Count 
negative quantities + assert negative_quantity_count == 0, f"Orders with negative quantity found: {negative_quantity_count}" + +# Test case 5: Verify order date range should be within month December only +def test_order_date_range(db_connection): + """ + Validate that all Order_Date values are within the range '2024-12-01' to '2024-12-31'. + Invalid dates should also be flagged separately. + """ + # Query all rows from the Orders table + cursor = db_connection.cursor() + cursor.execute("SELECT Order_ID, Order_Date FROM Orders") + rows = cursor.fetchall() + + invalid_dates = [] + out_of_range_dates = [] + + # Process each row + for row in rows: + order_id = row[0] + order_date = row[1] + + # Validate the date format + try: + # Parse the date assuming the format is 'DD/MM/YYYY' + parsed_date = datetime.strptime(order_date, '%d/%m/%Y') + print(f"Parsed Date: {parsed_date}") # Debugging output + + # Check if the date is out of the valid range (December 2024) + if not (datetime(2024, 12, 1) <= parsed_date <= datetime(2024, 12, 31)): + out_of_range_dates.append((order_id, order_date)) + except ValueError: + # If the date is invalid, add it to the invalid dates list + invalid_dates.append((order_id, order_date)) + + # Log invalid dates + if invalid_dates: + print("\nOrders with invalid date formats:") + for order_id, invalid_date in invalid_dates: + print(f"Order_ID: {order_id}, Invalid Date: {invalid_date}") + + # Log out-of-range dates + if out_of_range_dates: + print("\nOrders with out-of-range dates:") + for order_id, out_of_range_date in out_of_range_dates: + print(f"Order_ID: {order_id}, Out-of-Range Date: {out_of_range_date}") + + # Collect all errors and fail at the end + errors = [] + # Collect all errors and fail at the end + if invalid_dates: + errors.append(f"Invalid date formats: {invalid_dates}") + if out_of_range_dates: + errors.append(f"Out-of-range dates: {out_of_range_dates}") + + # Combine errors into a single line for better test summary display + 
error_message = " | ".join(errors) + assert not errors, error_message + +# Test case 6: Test invalid email format +def test_invalid_email_format(db_connection): + """ + Test case to validate that all email addresses in the Orders table are in a valid format. + """ + query = get_invalid_email_customers() + df = pd.read_sql(query, db_connection) + + # Log for debugging + print("\nRows with invalid email format:") + print(df) + + # Assert that there are no rows with invalid email formats + assert df.empty, f"Invalid email addresses found:\n{df.to_string(index=False)}" + +# Test case 7: Ensure Unique Product_ID (no duplicates allowed) +def test_unique_product_id_in_order(db_connection): + query = get_orders_with_duplicate_product_id() + df = pd.read_sql(query, db_connection) + + assert df.empty, "There are duplicate Product_IDs in the Orders table" + +# Test case 8: Ensure Product_Name Cannot Be NULL +def test_product_name_not_null(db_connection): + query = get_orders_with_null_product_name() + df = pd.read_sql(query, db_connection) + + assert df.empty, "There are Products with NULL Product_Name" + +# Test case 9: Ensure Product_ID in Orders References a Valid Product_ID in Products +def test_referential_integrity(db_connection): + """ + Test case to validate referential integrity between Orders and Products tables. + Expected Behavior + If all Product_IDs in Orders have matching entries in Products, the query should return no rows. + If any Product_ID in Orders does not have a match in Products, the query should return those Order_IDs and their invalid Product_IDs. + """ + query = get_invalid_product_references() + df = pd.read_sql(query, db_connection) + + # Log for debugging + print("\nRows with invalid Product_ID references:") + print(df.to_string(index=False) if not df.empty else "No issues found.") + + # Assert that there are no rows with invalid Product_ID references + assert df.empty, f"Referential integrity issues found:\n{df.to_string(index=False)}"