diff --git a/.github/workflows/ci_cd_pipeline.yml b/.github/workflows/ci_cd_pipeline.yml index e602449..c2e6304 100644 --- a/.github/workflows/ci_cd_pipeline.yml +++ b/.github/workflows/ci_cd_pipeline.yml @@ -78,7 +78,7 @@ jobs: - name: Run tests run: | source venv/bin/activate - pytest tests/test_etl.py + pytest tests/test_data_unittest.py continue-on-error: true - name: Clean up Docker containers diff --git a/Dockerfile b/Dockerfile index 361dedd..859f76c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,4 +19,4 @@ COPY . . ENV DB_PATH="/opt/airflow/sqlite_db/etl.db" # Command to run when the container starts -CMD ["pytest", "tests/test_etl.py"] +CMD ["pytest", "tests/test_data_unittest.py"] diff --git a/README.md b/README.md index 8ca583e..c1e23eb 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Ensure Order_Date is in the correct format (YYYY-MM-DD).
Handle missing or invalid values for Quantity (e.g., replace negative values with zero).
**Load**: Data is loaded into the Orders table in SQLite -## Test Plan +## Test Plan for Data Quality Testing (unit test) | Test Case ID | Test Case Description | Steps to Execute | Expected Result | Risk Level | Test Data | |--------------|------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------|----------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| @@ -36,9 +36,9 @@ Handle missing or invalid values for Quantity (e.g., replace negative values wit ## To run a specific test case: **Run by exact function name:** ```sh -pytest -s tests/test_etl.py::test_invalid_product_id +pytest -s tests/test_data_unittest.py::test_invalid_product_id ``` -This will run only the test_invalid_product_id test case in tests/test_etl.py. +This will run only the test_invalid_product_id test case in tests/test_data_unittest.py. ## Running the Project Locally @@ -75,10 +75,22 @@ python tests/load_data.py **6.Run the test** ```sh -pytest tests/test_etl.py +pytest tests/test_data_unittest.py ``` # Example fail result after run test ![date result](https://github.com/Thanasornsawan/Practice_ETL_QA_analyst/blob/main/photos/date_range.png?raw=true) -![map result](https://github.com/Thanasornsawan/Practice_ETL_QA_analyst/blob/main/photos/id_mapping.png?raw=true) \ No newline at end of file +![map result](https://github.com/Thanasornsawan/Practice_ETL_QA_analyst/blob/main/photos/id_mapping.png?raw=true) + +## Data Completeness Testing: + +**Objective:** Ensure that all expected data is loaded into the target system without any loss. +**Test Case:** Compare the record counts between the source and target tables to verify completeness. 
"""Data completeness test: source Excel sheets vs. loaded SQLite tables.

Objective: ensure that all expected data is loaded into the target system
without any loss, by comparing record counts between source and target.

Run the test::

    pytest tests/test_load_correct.py
"""
import os
import sqlite3
from contextlib import closing

import pandas as pd


def test_row_count():
    """Verify that source and target row counts match for Orders and Products.

    The source is the ``Orders`` and ``Products`` sheets of
    ``orders_test_data.xlsx`` (located one directory above this file), with
    rows that are empty in every column dropped. The target is the ``Orders``
    and ``Products`` tables of the SQLite database at ``sql/sqlite_db/etl.db``.

    Raises:
        FileNotFoundError: If the SQLite database file does not exist.
        AssertionError: If a source sheet and its target table differ in
            row count.
    """
    # Path to SQLite database (resolved against the current working
    # directory, i.e. the directory pytest is started from).
    DB_PATH = 'sql/sqlite_db/etl.db'
    print(f"Database path: {DB_PATH}")

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would surface later as a confusing "no such table"
    # error and leave a stray file behind. Fail early and clearly instead.
    if not os.path.isfile(DB_PATH):
        raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

    # Path to Excel file, resolved relative to this test module.
    EXCEL_FILE_PATH = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../orders_test_data.xlsx')
    )
    print(f"Excel file path: {EXCEL_FILE_PATH}")

    # Read source data from both sheets and drop rows that are entirely
    # empty, so blank spreadsheet rows are not counted as records.
    source_orders_df = pd.read_excel(
        EXCEL_FILE_PATH, sheet_name="Orders"
    ).dropna(how='all')
    source_products_df = pd.read_excel(
        EXCEL_FILE_PATH, sheet_name="Products"
    ).dropna(how='all')
    source_orders_rows = len(source_orders_df)
    source_products_rows = len(source_products_df)

    print(f"Source Orders Rows: {source_orders_rows}")
    print(f"Source Products Rows: {source_products_rows}")

    # closing() guarantees the connection is released even when a query or
    # an assertion below fails; a bare "with connection:" only manages the
    # transaction and does not close the connection.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # Only the counts are needed, so let SQLite count the rows instead
        # of loading whole tables into memory.
        target_orders_rows = conn.execute(
            "SELECT COUNT(*) FROM Orders"
        ).fetchone()[0]
        target_products_rows = conn.execute(
            "SELECT COUNT(*) FROM Products"
        ).fetchone()[0]

    print(f"Target Orders Rows: {target_orders_rows}")
    print(f"Target Products Rows: {target_products_rows}")

    # Validate row count for Orders
    assert source_orders_rows == target_orders_rows, (
        f"Row count mismatch for Orders: "
        f"Source ({source_orders_rows}) vs Target ({target_orders_rows})"
    )

    # Validate row count for Products
    assert source_products_rows == target_products_rows, (
        f"Row count mismatch for Products: "
        f"Source ({source_products_rows}) vs Target ({target_products_rows})"
    )

    print("Row count validation passed for both Orders and Products.")