diff --git a/.github/workflows/ci_cd_pipeline.yml b/.github/workflows/ci_cd_pipeline.yml
new file mode 100644
index 0000000..5e547ba
--- /dev/null
+++ b/.github/workflows/ci_cd_pipeline.yml
@@ -0,0 +1,54 @@
+name: ETL CI/CD Pipeline
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout the code
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.9"
+
+ - name: Install Docker and Docker Compose
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y docker.io
+ sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+ sudo chmod +x /usr/local/bin/docker-compose
+
+ - name: Build and start Docker containers with docker-compose
+ run: |
+ docker-compose -f docker-compose.yml up -d
+ sleep 10 # Wait for the DB to start properly (adjust if needed)
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+
+ - name: Run database setup
+ run: python sql/sqlite_db/setup_db.py
+
+ - name: Load data into database
+ run: python tests/load_data.py
+
+ - name: Run tests
+ run: |
+ pytest tests/test_etl.py
+ continue-on-error: false # do not mask test failures; CI must go red when pytest fails
+
+ - name: Clean up Docker containers
+ run: |
+ docker-compose -f docker-compose.yml down
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cd215b4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+etl.db
+.venv
+.pytest_cache
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..361dedd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+# Dockerfile
+
+# Use a base Python image
+FROM python:3.9-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the requirements file into the container
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the project files into the container
+COPY . .
+
+# Set environment variables (if necessary)
+ENV DB_PATH="/opt/airflow/sqlite_db/etl.db"
+
+# Command to run when the container starts
+CMD ["pytest", "tests/test_etl.py"]
diff --git a/README.md b/README.md
index 0083a36..68ce4f5 100644
--- a/README.md
+++ b/README.md
@@ -21,10 +21,59 @@ Handle missing or invalid values for Quantity (e.g., replace negative values wit
## Test Plan
-| **Test Case ID** | **Test Case Description** | **Steps to Execute** | **Expected Result** | **Business Rule Compliance** | **Risk Level** | **Test Data** |
-|------------------|-----------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------|------------------------------------------------------------------|----------------------------------|-----------------------------------------------------------|
-| TC_01 | **Validate Customer_ID Uniqueness** | - Insert two orders with the same Customer_ID.
- Check if the system raises an error or rejects the second order. | **Failure**: The system should reject the second order with the same Customer_ID. | Duplicate Customer_ID violates uniqueness in the orders table. | **Critical** – Affects data integrity. | Customer_ID: 1234 (used for two orders)
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 |
-| TC_02 | **Validate Correct Date Format** | - Insert an order with an invalid date format (e.g., `12/01/2024` for `Order_Date`).
- Attempt to save the order. | **Failure**: The system should reject the order due to incorrect date format. | The `Order_Date` must follow a standardized format. | **High** – Incorrect data can cause parsing issues and errors in reporting. | Customer_ID: 1234
Order_Date: "12/01/2024" (invalid format)
Product_ID: 567
Quantity: 2 |
-| TC_03 | **Validate Missing Customer_Name** | - Insert an order with a missing `Customer_Name` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to missing customer name. | The `Customer_Name` field is mandatory for all orders. | **High** – Missing customer information affects order processing and analysis. | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 (Customer_Name: NULL) |
-| TC_04 | **Validate Negative Quantity** | - Insert an order with a negative `Quantity` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to invalid quantity. | `Quantity` must always be a positive number. | **High** – Negative quantity violates business logic and can affect financial calculations. | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: -5 |
-| TC_05 | **Validate Missing Order Date** | - Insert an order with a missing `Order_Date` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to missing order date. | `Order_Date` cannot be missing. | **Critical** – Missing order dates make the data unusable for time-based analysis. | Customer_ID: 1234
Customer_Name: "John Doe"
Product_ID: 567
Quantity: 2 (Order_Date: NULL) |
\ No newline at end of file
+| Test Case ID | Test Case Description | Steps to Execute | Expected Result | Risk Level | Test Data |
+|--------------|------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------|----------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
+| TC_001 | **Validate Customer ID Uniqueness** | - Execute `validate_customer_id_unique` query.
- Fetch the results into a DataFrame.
- Check for any duplicate `Customer_ID`s. | **Failure**: The DataFrame should be empty, indicating no duplicates. | **Critical** – Affects data integrity | Customer_ID: 1234 (used for two orders)
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 |
+| TC_002 | **Validate Correct Date Format** | - Execute `validate_order_date_format` query.
- Fetch the results into a DataFrame.
- Validate if the `Order_Date` is in the correct format (`dd/mm/yyyy`). | **Failure**: The DataFrame should have no invalid date formats. | **High** – Affects date parsing and reporting | Customer_ID: 1234
Order_Date: "12/01/2024" (invalid format)
Product_ID: 567
Quantity: 2 |
+| TC_003 | **Validate Missing Customer Name** | - Execute `get_orders_with_missing_customer_name` query.
- Fetch the results into a DataFrame.
- Check for any missing `Customer_Name` values. | **Failure**: There should be no missing customer names. | **High** – Affects order processing | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 (Customer_Name: NULL) |
+| TC_004 | **Validate Negative Quantity Orders** | - Execute `get_orders_with_negative_quantity` query.
- Fetch the results into a DataFrame.
- Check for negative `Quantity` values. | **Failure**: The DataFrame should have no rows with negative quantities. | **High** – Affects business logic and financial calculations | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: -5 |
+| TC_005 | **Validate Order Date Range (December 2024 only)** | - Execute the query to fetch all `Order_ID` and `Order_Date` from the `Orders` table.
- Check each order's date format and ensure it's within the range `2024-12-01` to `2024-12-31`.
- Identify invalid or out-of-range dates. | **Failure**: Orders with `Order_Date` outside the range `2024-12-01` to `2024-12-31` should be flagged.
**Failure**: Orders with invalid date formats should be flagged. | **High** – Invalid or out-of-range dates can affect reporting and processing. | Customer_ID: 1234
Order_Date: "01/12/2024"
Product_ID: 567
Quantity: 10 (Valid date)
Customer_ID: 5678
Order_Date: "01/11/2024" (Out of range)
Customer_ID: 91011
Order_Date: "InvalidDate" (Invalid format) |
+| TC_006 | **Validate Invalid Email Format** | - Execute `get_invalid_email_customers` query.
- Fetch the results into a DataFrame.
- Check for invalid email formats. | **Failure**: The DataFrame should have no rows with invalid emails. | **High** – Affects customer communication | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2
Customer_Email: "invalid_email" |
+| TC_007 | **Ensure Unique Product_ID in Order** | - Execute `get_orders_with_duplicate_product_id` query.
- Fetch the results into a DataFrame.
- Check for duplicate `Product_ID`s in orders. | **Failure**: The DataFrame should be empty, indicating no duplicates. | **Critical** – Affects data integrity | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567 (duplicate)
Quantity: 2 |
+| TC_008 | **Ensure Product_Name Cannot Be NULL** | - Execute `get_orders_with_null_product_name` query.
- Fetch the results into a DataFrame.
- Check for any `NULL` values in `Product_Name`. | **Failure**: The DataFrame should have no rows with NULL `Product_Name`. | **High** – Affects order completeness | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2
Product_Name: NULL |
+| TC_009 | **Validate Referential Integrity Between Orders and Products** | - Execute `get_invalid_product_references` query.
- Fetch the results into a DataFrame.
- Check for any `Product_ID` references that do not exist in Products. | **Failure**: The DataFrame should have no rows indicating invalid `Product_ID` references. | **Critical** – Affects data integrity | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 999 (non-existing)
Quantity: 2 |
+
+## To run a specific test case:
+**Run by exact function name:**
+```sh
+pytest -s tests/test_etl.py::test_referential_integrity
+```
+This will run only the test_referential_integrity test case in tests/test_etl.py.
+
+## Running the Project Locally
+
+**1.Create and Activate a Virtual Environment**
+```sh
+python3 -m venv venv
+```
+
+Activate the virtual environment:
+```sh
+source venv/bin/activate
+```
+
+**2.Install Project Dependencies**
+```sh
+pip install -r requirements.txt
+```
+
+**3.Docker step**
+```sh
+docker-compose down
+docker-compose up -d
+```
+
+**4.Set Up the Database**
+```sh
+python sql/sqlite_db/setup_db.py
+```
+
+**5.Load Data into the Database**
+```sh
+python tests/load_data.py
+```
+
+**6.Run the test**
+```sh
+pytest tests/test_etl.py
+```
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..3e6ab56
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,16 @@
+version: '3.8'
+
+services:
+ sqlite_db:
+ build: ./sql # Path where your Dockerfile is located
+ container_name: sqlite_db
+ volumes:
+ - ./sql/sqlite_db:/opt/sqlite_db # Map the local folder to the container's folder
+ ports:
+ - "8081:8080" # Adjust if needed
+ networks:
+ - sqlite_network
+
+networks:
+ sqlite_network:
+ driver: bridge
diff --git a/orders_test_data.xlsx b/orders_test_data.xlsx
new file mode 100644
index 0000000..c158658
Binary files /dev/null and b/orders_test_data.xlsx differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ef47ed9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+# For setting up the environment
+apache-airflow==2.5.0 # If using Airflow for orchestration
+pandas==1.5.3 # For handling data manipulation
+pytest==7.2.2 # For running tests
+openpyxl==3.0.10 # For reading and writing Excel files (e.g., orders_test_data.xlsx)
\ No newline at end of file
diff --git a/sql/Dockerfile b/sql/Dockerfile
new file mode 100644
index 0000000..f87ca23
--- /dev/null
+++ b/sql/Dockerfile
@@ -0,0 +1,7 @@
+FROM nouchka/sqlite3:latest
+
+# Set working directory to where the database will reside
+WORKDIR /opt/sqlite_db
+
+# Initialize or create the SQLite database
+RUN sqlite3 /opt/sqlite_db/etl.db "CREATE TABLE IF NOT EXISTS Orders (Order_ID INTEGER PRIMARY KEY AUTOINCREMENT, Product_Name TEXT, Quantity INTEGER);"
diff --git a/sql/sqlite_db/db_queries.py b/sql/sqlite_db/db_queries.py
new file mode 100644
index 0000000..f6b4372
--- /dev/null
+++ b/sql/sqlite_db/db_queries.py
@@ -0,0 +1,102 @@
+# Query to Validate Customer_ID Uniqueness
+def validate_customer_id_unique():
+ return """
+ SELECT Customer_ID, Order_Date, COUNT(*) AS Order_Count
+ FROM Orders
+ GROUP BY Customer_ID, Order_Date
+ HAVING COUNT(*) > 1
+ """
+
+# Query to Validate Correct Date Format (expects ISO YYYY-MM-DD; NOTE(review): the loader stores dates as DD/MM/YYYY — confirm which format is canonical before relying on this check)
+def validate_order_date_format():
+ return """
+ SELECT Order_ID, Order_Date
+ FROM Orders
+ WHERE Order_Date IS NULL
+ OR NOT (Order_Date GLOB '????-??-??'
+ AND LENGTH(Order_Date) = 10
+ AND CAST(substr(Order_Date, 1, 4) AS INTEGER) > 0
+ AND substr(Order_Date, 6, 2) BETWEEN '01' AND '12'
+ AND CASE
+ WHEN substr(Order_Date, 6, 2) IN ('01', '03', '05', '07', '08', '10', '12') THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '31'
+ WHEN substr(Order_Date, 6, 2) IN ('04', '06', '09', '11') THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '30'
+ WHEN substr(Order_Date, 6, 2) = '02' THEN (
+ CASE
+ WHEN (CAST(substr(Order_Date, 1, 4) AS INTEGER) % 4 = 0
+ AND CAST(substr(Order_Date, 1, 4) AS INTEGER) % 100 != 0)
+ OR CAST(substr(Order_Date, 1, 4) AS INTEGER) % 400 = 0 THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '29'
+ ELSE substr(Order_Date, 9, 2) BETWEEN '01' AND '28'
+ END
+ )
+ ELSE 0
+ END = 1
+ );
+ """
+
+# Query to find orders with negative quantities
+def get_orders_with_negative_quantity():
+ return """
+ SELECT Order_ID, Customer_ID, Product_ID, Quantity
+ FROM Orders
+ WHERE Quantity < 0
+ """
+
+# Query to find orders with missing Customer_Name
+def get_orders_with_missing_customer_name():
+ return """
+ SELECT Order_ID, Customer_ID, Customer_Name, Product_ID, Quantity
+ FROM Orders
+ WHERE Customer_Name IS NULL
+ """
+
+# Query to ensure unique Product_ID (no duplicates allowed in Orders)
+def get_orders_with_duplicate_product_id():
+ return """
+ SELECT Product_ID, COUNT(*)
+ FROM Orders
+ GROUP BY Product_ID
+ HAVING COUNT(*) > 1
+ """
+
+# Query to ensure Product_Name cannot be NULL in Products
+def get_orders_with_null_product_name():
+ return """
+ SELECT *
+ FROM Products
+ WHERE Product_Name IS NULL
+ """
+
+# Query to get email customer in Orders
+def get_invalid_email_customers():
+ """
+ Query to find customers with invalid email format.
+ Returns rows where the email does not match the expected pattern.
+ """
+ query = """
+ SELECT *
+ FROM Orders
+ WHERE Email IS NULL OR Email NOT LIKE '%_@__%.__%';
+ """
+ return query
+
+def get_orders_with_invalid_date_range():
+ """
+ Query to find orders where the Order_Date is outside the range '2024-01-01' to '2024-12-31'.
+ """
+ query = """
+ SELECT *
+ FROM Orders
+ WHERE Order_Date < '2024-01-01' OR Order_Date > '2024-12-31';
+ """
+ return query
+
+def get_invalid_product_references():
+ """
+ Returns the SQL query to check for invalid Product_ID references in the Orders table.
+ """
+ return """
+ SELECT o.Order_ID, o.Product_ID
+ FROM Orders o
+ LEFT JOIN Products p ON o.Product_ID = p.Product_ID
+ WHERE p.Product_ID IS NULL;
+ """
\ No newline at end of file
diff --git a/sql/sqlite_db/setup_db.py b/sql/sqlite_db/setup_db.py
new file mode 100644
index 0000000..6224754
--- /dev/null
+++ b/sql/sqlite_db/setup_db.py
@@ -0,0 +1,39 @@
+import sqlite3
+
+# Path to SQLite database
+DB_PATH = 'sql/sqlite_db/etl.db'
+
+# Establish a connection
+conn = sqlite3.connect(DB_PATH)
+cursor = conn.cursor()
+
+# Drop tables if they exist to ensure schema updates
+cursor.execute('DROP TABLE IF EXISTS Orders;')
+cursor.execute('DROP TABLE IF EXISTS Products;')
+
+# Create the Orders table with the updated schema (including Email column)
+cursor.execute('''
+ CREATE TABLE Orders (
+ Order_ID INTEGER PRIMARY KEY,
+ Customer_ID INTEGER,
+ Customer_Name TEXT,
+ Order_Date TEXT,
+ Product_ID INTEGER,
+ Quantity INTEGER,
+ Email TEXT
+ );
+''')
+
+# Create the Products table
+cursor.execute('''
+ CREATE TABLE Products (
+ Product_ID INTEGER PRIMARY KEY,
+ Product_Name TEXT
+ );
+''')
+
+# Commit changes and close the connection
+conn.commit()
+conn.close()
+
+print("Database and tables set up successfully.")
diff --git a/tests/load_data.py b/tests/load_data.py
new file mode 100644
index 0000000..ddb214b
--- /dev/null
+++ b/tests/load_data.py
@@ -0,0 +1,68 @@
+import sqlite3
+import os
+from openpyxl import load_workbook
+
+# Path to the SQLite database
+DB_PATH = 'sql/sqlite_db/etl.db'
+# Path to the Excel file (dynamically resolve the absolute path)
+EXCEL_FILE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '../orders_test_data.xlsx'))
+
+def load_data_to_db():
+ # Load the workbook and the 'Products' and 'Orders' sheets
+ wb = load_workbook(EXCEL_FILE_PATH)
+
+ # Access the 'Products' and 'Orders' sheets
+ products_sheet = wb['Products']
+ orders_sheet = wb['Orders']
+
+ # Establish a database connection
+ conn = sqlite3.connect(DB_PATH)
+ cursor = conn.cursor()
+
+ # Insert data into Products table
+ for row in products_sheet.iter_rows(min_row=2, values_only=True):
+ cursor.execute('''
+ INSERT OR IGNORE INTO Products (Product_ID, Product_Name)
+ VALUES (?, ?)
+ ''', (row[0], row[1]))
+
+ # Insert data into Orders table
+ for row in orders_sheet.iter_rows(min_row=2, values_only=True):
+ # Check if the row is empty (all fields are empty or None)
+ if all(cell is None or cell == '' for cell in row):
+ continue # Skip the row if it's empty
+
+ customer_id = row[0]
+ customer_name = row[1]
+ order_date = row[2] # Order_Date is assumed to be in the third column (index 2)
+ product_id = row[3]
+ quantity = row[4]
+ email = row[5]
+
+ # Ensure that 'Order_Date' stays as a string, not a date object
+ if isinstance(order_date, str):
+ # If the order_date is in string format (like '12/01/2024'), keep it as is
+ order_date = order_date.strip() # Remove leading/trailing whitespace and newlines
+ elif hasattr(order_date, 'strftime'):  # duck-type check: `datetime` is not imported in this module
+ # If the order_date is a datetime object, convert it to string
+ order_date = order_date.strftime('%d/%m/%Y') if order_date else None
+ else:
+ order_date = None # Set to None if the date format is invalid
+
+ # Skip inserting rows where required data (such as order_date or customer_id) is invalid
+ if not customer_id or not order_date:
+ continue # Skip this row if customer_id or order_date is missing or invalid
+
+ cursor.execute('''
+ INSERT INTO Orders (Customer_ID, Customer_Name, Order_Date, Product_ID, Quantity, Email)
+ VALUES (?, ?, ?, ?, ?, ?)
+ ''', (customer_id, customer_name, order_date, product_id, quantity, email))
+
+ # Commit the changes and close the connection
+ conn.commit()
+ conn.close()
+
+ print("Data loaded successfully from Excel to database.")
+
+if __name__ == '__main__':
+ load_data_to_db()
diff --git a/tests/test_etl.py b/tests/test_etl.py
new file mode 100644
index 0000000..1f235ea
--- /dev/null
+++ b/tests/test_etl.py
@@ -0,0 +1,201 @@
+import pandas as pd
+import sqlite3
+import pytest
+import sys
+import os
+from datetime import datetime
+
+# Add the root directory of the project to the Python path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from sql.sqlite_db.db_queries import (
+ validate_customer_id_unique,
+ validate_order_date_format,
+ get_orders_with_negative_quantity,
+ get_orders_with_missing_customer_name,
+ get_orders_with_duplicate_product_id,
+ get_orders_with_null_product_name,
+ get_invalid_email_customers,
+ get_orders_with_invalid_date_range,
+ get_invalid_product_references
+)
+
+# Fixture to set up and tear down the SQLite connection
+@pytest.fixture(scope="module")
+def db_connection():
+ # Ensure the database file exists
+ db_path = os.path.join(os.path.dirname(__file__), "../sql/sqlite_db/etl.db")
+ assert os.path.exists(db_path), f"Database file not found at {db_path}"
+
+ conn = sqlite3.connect(db_path)
+ yield conn
+ conn.close()
+
+# Test case 1: Validate customer id unique
+def test_customer_id_unique(db_connection):
+ query = validate_customer_id_unique()
+ df = pd.read_sql(query, db_connection)
+
+ # If df is not empty, print the rows that have duplicates
+ if not df.empty:
+ print("\nDuplicate Customer_IDs found:")
+ print(df)
+
+ # Assert that there are no duplicate orders for the same Customer_ID and Order_Date
+ assert df.empty, "Duplicate orders exist:\n" + df.to_string(index=False)
+
+def is_valid_date(date_str):
+ """Check if a date string is valid (dd/mm/yyyy)."""
+ try:
+ # Try parsing the date
+ datetime.strptime(date_str, '%d/%m/%Y')
+ return True
+ except (ValueError, TypeError):  # TypeError covers None/NaN from NULL Order_Date rows
+ return False
+
+# Test case 2: Validate date format dd/mm/yyyy format
+def test_order_date_format(db_connection):
+ # Run the SQL query to fetch orders with invalid date formats
+ query = validate_order_date_format() # Your validation SQL query
+ df = pd.read_sql(query, db_connection) # Fetch the result into a DataFrame
+
+ # Strip any unwanted characters like newlines
+ df['Order_Date'] = df['Order_Date'].str.replace('\n', '', regex=False).str.strip()
+
+ # Validate if the date is in the correct format and valid
+ invalid_dates = df[~df['Order_Date'].apply(is_valid_date)]
+
+ # Print out any rows with invalid date formats
+ if not invalid_dates.empty:
+ print("Orders with invalid date format:", invalid_dates)
+
+ # Assert that there are no invalid dates remaining
+ assert invalid_dates.empty, f"There are orders with invalid date formats: {invalid_dates}"
+
+# Test case 3: Validate Missing Customer Name
+def test_missing_customer_name(db_connection):
+ query = get_orders_with_missing_customer_name()
+ df = pd.read_sql(query, db_connection)
+
+ missing_customer_name = df['Customer_Name'].isnull().sum() # Count NaN/None values
+ print(f"Number of missing Customer_Name values: {missing_customer_name}")
+
+ # Assert that there are no missing customer names (fail if there are any)
+ assert missing_customer_name == 0, f"There are orders with missing Customer_Name: {missing_customer_name}"
+
+# Test case 4: Validate Negative Quantity Orders
+def test_negative_quantity(db_connection):
+ query = get_orders_with_negative_quantity()
+ df = pd.read_sql(query, db_connection)
+
+ # Log for debugging
+ print("DataFrame loaded from the database:")
+ print(df)
+ print(f"Negative quantities found: {df[df['Quantity'] < 0]}")
+
+ # Assert that there are NO negative quantities
+ negative_quantity_count = (df['Quantity'] < 0).sum() # Count negative quantities
+ assert negative_quantity_count == 0, f"Orders with negative quantity found: {negative_quantity_count}"
+
+# Test case 5: Verify order date range should be within month December only
+def test_order_date_range(db_connection):
+ """
+ Validate that all Order_Date values are within the range '2024-12-01' to '2024-12-31'.
+ Invalid dates should also be flagged separately.
+ """
+ # Query all rows from the Orders table
+ cursor = db_connection.cursor()
+ cursor.execute("SELECT Order_ID, Order_Date FROM Orders")
+ rows = cursor.fetchall()
+
+ invalid_dates = []
+ out_of_range_dates = []
+
+ # Process each row
+ for row in rows:
+ order_id = row[0]
+ order_date = row[1]
+
+ # Validate the date format
+ try:
+ # Parse the date assuming the format is 'DD/MM/YYYY'
+ parsed_date = datetime.strptime(order_date, '%d/%m/%Y')
+ print(f"Parsed Date: {parsed_date}") # Debugging output
+
+ # Check if the date is out of the valid range (December 2024)
+ if not (datetime(2024, 12, 1) <= parsed_date <= datetime(2024, 12, 31)):
+ out_of_range_dates.append((order_id, order_date))
+ except (ValueError, TypeError):
+ # If the date is invalid, add it to the invalid dates list
+ invalid_dates.append((order_id, order_date))
+
+ # Log invalid dates
+ if invalid_dates:
+ print("\nOrders with invalid date formats:")
+ for order_id, invalid_date in invalid_dates:
+ print(f"Order_ID: {order_id}, Invalid Date: {invalid_date}")
+
+ # Log out-of-range dates
+ if out_of_range_dates:
+ print("\nOrders with out-of-range dates:")
+ for order_id, out_of_range_date in out_of_range_dates:
+ print(f"Order_ID: {order_id}, Out-of-Range Date: {out_of_range_date}")
+
+ # Collect all errors and fail at the end
+ errors = []
+ # Collect all errors and fail at the end
+ if invalid_dates:
+ errors.append(f"Invalid date formats: {invalid_dates}")
+ if out_of_range_dates:
+ errors.append(f"Out-of-range dates: {out_of_range_dates}")
+
+ # Combine errors into a single line for better test summary display
+ error_message = " | ".join(errors)
+ assert not errors, error_message
+
+# Test case 6: Test invalid email format
+def test_invalid_email_format(db_connection):
+ """
+ Test case to validate that all email addresses in the Orders table are in a valid format.
+ """
+ query = get_invalid_email_customers()
+ df = pd.read_sql(query, db_connection)
+
+ # Log for debugging
+ print("\nRows with invalid email format:")
+ print(df)
+
+ # Assert that there are no rows with invalid email formats
+ assert df.empty, f"Invalid email addresses found:\n{df.to_string(index=False)}"
+
+# Test case 7: Ensure Unique Product_ID (no duplicates allowed)
+def test_unique_product_id_in_order(db_connection):
+ query = get_orders_with_duplicate_product_id()
+ df = pd.read_sql(query, db_connection)
+
+ assert df.empty, "There are duplicate Product_IDs in the Orders table"
+
+# Test case 8: Ensure Product_Name Cannot Be NULL
+def test_product_name_not_null(db_connection):
+ query = get_orders_with_null_product_name()
+ df = pd.read_sql(query, db_connection)
+
+ assert df.empty, "There are Products with NULL Product_Name"
+
+# Test case 9: Ensure Product_ID in Orders References a Valid Product_ID in Products
+def test_referential_integrity(db_connection):
+ """
+ Test case to validate referential integrity between Orders and Products tables.
+ Expected Behavior
+ If all Product_IDs in Orders have matching entries in Products, the query should return no rows.
+ If any Product_ID in Orders does not have a match in Products, the query should return those Order_IDs and their invalid Product_IDs.
+ """
+ query = get_invalid_product_references()
+ df = pd.read_sql(query, db_connection)
+
+ # Log for debugging
+ print("\nRows with invalid Product_ID references:")
+ print(df.to_string(index=False) if not df.empty else "No issues found.")
+
+ # Assert that there are no rows with invalid Product_ID references
+ assert df.empty, f"Referential integrity issues found:\n{df.to_string(index=False)}"