diff --git a/.github/workflows/ci_cd_pipeline.yml b/.github/workflows/ci_cd_pipeline.yml
new file mode 100644
index 0000000..5e547ba
--- /dev/null
+++ b/.github/workflows/ci_cd_pipeline.yml
@@ -0,0 +1,54 @@
name: ETL CI/CD Pipeline

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout the code
        # v2 runners are deprecated on GitHub-hosted runners; v4 is the supported line.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not coerce the version to a float.
          python-version: '3.9'

      - name: Install Docker and Docker Compose
        run: |
          sudo apt-get update
          sudo apt-get install -y docker.io
          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
          sudo chmod +x /usr/local/bin/docker-compose

      - name: Build and start Docker containers with docker-compose
        run: |
          docker-compose -f docker-compose.yml up -d
          sleep 10 # Wait for the DB to start properly (adjust if needed)

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run database setup
        run: python sql/sqlite_db/setup_db.py

      - name: Load data into database
        # BUG FIX: the script was invoked as `tests/load_data.py` (no interpreter,
        # no exec bit) and would fail with "Permission denied"; run it via python,
        # matching the README instructions.
        run: python tests/load_data.py

      - name: Run tests
        # NOTE(review): removed `continue-on-error: true` — with it, failing tests
        # never failed the pipeline, defeating the purpose of CI.
        run: |
          pytest tests/test_etl.py

      - name: Clean up Docker containers
        # Always tear down, even if the test step failed.
        if: always()
        run: |
          docker-compose -f docker-compose.yml down
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cd215b4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
__pycache__
etl.db
.venv
.pytest_cache
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..361dedd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
# Dockerfile

# Use a base Python image
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt .
+ +# Install the dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the project files into the container +COPY . . + +# Set environment variables (if necessary) +ENV DB_PATH="/opt/airflow/sqlite_db/etl.db" + +# Command to run when the container starts +CMD ["pytest", "tests/test_etl.py"] diff --git a/README.md b/README.md index 0083a36..68ce4f5 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,59 @@ Handle missing or invalid values for Quantity (e.g., replace negative values wit ## Test Plan -| **Test Case ID** | **Test Case Description** | **Steps to Execute** | **Expected Result** | **Business Rule Compliance** | **Risk Level** | **Test Data** | -|------------------|-----------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------|------------------------------------------------------------------|----------------------------------|-----------------------------------------------------------| -| TC_01 | **Validate Customer_ID Uniqueness** | - Insert two orders with the same Customer_ID.
- Check if the system raises an error or rejects the second order. | **Failure**: The system should reject the second order with the same Customer_ID. | Duplicate Customer_ID violates uniqueness in the orders table. | **Critical** – Affects data integrity. | Customer_ID: 1234 (used for two orders)
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 | -| TC_02 | **Validate Correct Date Format** | - Insert an order with an invalid date format (e.g., `12/01/2024` for `Order_Date`).
- Attempt to save the order. | **Failure**: The system should reject the order due to incorrect date format. | The `Order_Date` must follow a standardized format. | **High** – Incorrect data can cause parsing issues and errors in reporting. | Customer_ID: 1234
Order_Date: "12/01/2024" (invalid format)
Product_ID: 567
Quantity: 2 | -| TC_03 | **Validate Missing Customer_Name** | - Insert an order with a missing `Customer_Name` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to missing customer name. | The `Customer_Name` field is mandatory for all orders. | **High** – Missing customer information affects order processing and analysis. | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 (Customer_Name: NULL) | -| TC_04 | **Validate Negative Quantity** | - Insert an order with a negative `Quantity` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to invalid quantity. | `Quantity` must always be a positive number. | **High** – Negative quantity violates business logic and can affect financial calculations. | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: -5 | -| TC_05 | **Validate Missing Order Date** | - Insert an order with a missing `Order_Date` value.
- Attempt to save the order. | **Failure**: The system should reject the order due to missing order date. | `Order_Date` cannot be missing. | **Critical** – Missing order dates make the data unusable for time-based analysis. | Customer_ID: 1234
Customer_Name: "John Doe"
Product_ID: 567
Quantity: 2 (Order_Date: NULL) | \ No newline at end of file +| Test Case ID | Test Case Description | Steps to Execute | Expected Result | Risk Level | Test Data | +|--------------|------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------|----------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| TC_001 | **Validate Customer ID Uniqueness** | - Execute `validate_customer_id_unique` query.
- Fetch the results into a DataFrame.
- Check for any duplicate `Customer_ID`s. | **Failure**: The DataFrame should be empty, indicating no duplicates. | **Critical** – Affects data integrity | Customer_ID: 1234 (used for two orders)
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 | +| TC_002 | **Validate Correct Date Format** | - Execute `validate_order_date_format` query.
- Fetch the results into a DataFrame.
- Validate if the `Order_Date` is in the correct format (`dd/mm/yyyy`). | **Failure**: The DataFrame should have no invalid date formats. | **High** – Affects date parsing and reporting | Customer_ID: 1234
Order_Date: "12/01/2024" (invalid format)
Product_ID: 567
Quantity: 2 | +| TC_003 | **Validate Missing Customer Name** | - Execute `get_orders_with_missing_customer_name` query.
- Fetch the results into a DataFrame.
- Check for any missing `Customer_Name` values. | **Failure**: There should be no missing customer names. | **High** – Affects order processing | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2 (Customer_Name: NULL) | +| TC_004 | **Validate Negative Quantity Orders** | - Execute `get_orders_with_negative_quantity` query.
- Fetch the results into a DataFrame.
- Check for negative `Quantity` values. | **Failure**: The DataFrame should have no rows with negative quantities. | **High** – Affects business logic and financial calculations | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: -5 | +| TC_005 | **Validate Order Date Range (December 2024 only)** | - Execute the query to fetch all `Order_ID` and `Order_Date` from the `Orders` table.
- Check each order's date format and ensure it's within the range `2024-12-01` to `2024-12-31`.
- Identify invalid or out-of-range dates. | **Failure**: Orders with `Order_Date` outside the range `2024-12-01` to `2024-12-31` should be flagged.
**Failure**: Orders with invalid date formats should be flagged. | **High** – Invalid or out-of-range dates can affect reporting and processing. | Customer_ID: 1234
Order_Date: "01/12/2024"
Product_ID: 567
Quantity: 10 (Valid date)
Customer_ID: 5678
Order_Date: "01/11/2024" (Out of range)
Customer_ID: 91011
Order_Date: "InvalidDate" (Invalid format) |
| TC_006 | **Validate Invalid Email Format** | - Execute `get_invalid_email_customers` query.<br>
- Fetch the results into a DataFrame.
- Check for invalid email formats. | **Failure**: The DataFrame should have no rows with invalid emails. | **High** – Affects customer communication | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2
Customer_Email: "invalid_email" | +| TC_007 | **Ensure Unique Product_ID in Order** | - Execute `get_orders_with_duplicate_product_id` query.
- Fetch the results into a DataFrame.
- Check for duplicate `Product_ID`s in orders. | **Failure**: The DataFrame should be empty, indicating no duplicates. | **Critical** – Affects data integrity | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567 (duplicate)
Quantity: 2 | +| TC_008 | **Ensure Product_Name Cannot Be NULL** | - Execute `get_orders_with_null_product_name` query.
- Fetch the results into a DataFrame.
- Check for any `NULL` values in `Product_Name`. | **Failure**: The DataFrame should have no rows with NULL `Product_Name`. | **High** – Affects order completeness | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 567
Quantity: 2
Product_Name: NULL | +| TC_009 | **Validate Referential Integrity Between Orders and Products** | - Execute `get_invalid_product_references` query.
- Fetch the results into a DataFrame.
- Check for any `Product_ID` references that do not exist in Products. | **Failure**: The DataFrame should have no rows indicating invalid `Product_ID` references. | **Critical** – Affects data integrity | Customer_ID: 1234
Order_Date: "2024-12-01"
Product_ID: 999 (non-existing)
Quantity: 2 | + +## To run a specific test case: +**Run by exact function name:** +```sh +pytest -s tests/test_etl.py::test_invalid_product_id +``` +This will run only the test_invalid_product_id test case in tests/test_etl.py. + +## Running the Project Locally + +**1.Create and Activate a Virtual Environment** +```sh +python3 -m venv venv +``` + +Activate the virtual environment: +```sh +source venv/bin/activate +``` + +**2.Install Project Dependencies** +```sh +pip install -r requirements.txt +``` + +**3.Docker step** +```sh +docker-compose down +docker-compose up -d +``` + +**4.Set Up the Database** +```sh +python sql/sqlite_db/setup_db.py +``` + +**5.Load Data into the Database** +```sh +python tests/load_data.py +``` + +**6.Run the test** +```sh +pytest tests/test_etl.py +``` \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..3e6ab56 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,16 @@ +version: '3.8' + +services: + sqlite_db: + build: ./sql # Path where your Dockerfile is located + container_name: sqlite_db + volumes: + - ./sql/sqlite_db:/opt/sqlite_db # Map the local folder to the container's folder + ports: + - "8081:8080" # Adjust if needed + networks: + - sqlite_network + +networks: + sqlite_network: + driver: bridge diff --git a/orders_test_data.xlsx b/orders_test_data.xlsx new file mode 100644 index 0000000..c158658 Binary files /dev/null and b/orders_test_data.xlsx differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ef47ed9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +# For setting up the environment +apache-airflow==2.5.0 # If using Airflow for orchestration +pandas==1.5.3 # For handling data manipulation +pytest==7.2.2 # For running tests +openpyxl==3.0.10 # For reading and writing Excel files (e.g., orders_test_data.xlsx) \ No newline at end of file diff --git a/sql/Dockerfile b/sql/Dockerfile new file mode 100644 index 
0000000..f87ca23 --- /dev/null +++ b/sql/Dockerfile @@ -0,0 +1,7 @@ +FROM nouchka/sqlite3:latest + +# Set working directory to where the database will reside +WORKDIR /opt/sqlite_db + +# Initialize or create the SQLite database +RUN sqlite3 /opt/sqlite_db/etl.db "CREATE TABLE IF NOT EXISTS Orders (Order_ID INTEGER PRIMARY KEY AUTOINCREMENT, Product_Name TEXT, Quantity INTEGER);" diff --git a/sql/sqlite_db/db_queries.py b/sql/sqlite_db/db_queries.py new file mode 100644 index 0000000..f6b4372 --- /dev/null +++ b/sql/sqlite_db/db_queries.py @@ -0,0 +1,102 @@ +# Query to Validate Customer_ID Uniqueness +def validate_customer_id_unique(): + return """ + SELECT Customer_ID, Order_Date, COUNT(*) AS Order_Count + FROM Orders + GROUP BY Customer_ID, Order_Date + HAVING COUNT(*) > 1 + """ + +# Query to Validate Correct Date Format +def validate_order_date_format(): + return """ + SELECT Order_ID, Order_Date + FROM Orders + WHERE Order_Date IS NULL + OR NOT (Order_Date GLOB '????-??-??' + AND LENGTH(Order_Date) = 10 + AND CAST(substr(Order_Date, 1, 4) AS INTEGER) > 0 + AND substr(Order_Date, 6, 2) BETWEEN '01' AND '12' + AND CASE + WHEN substr(Order_Date, 6, 2) IN ('01', '03', '05', '07', '08', '10', '12') THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '31' + WHEN substr(Order_Date, 6, 2) IN ('04', '06', '09', '11') THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '30' + WHEN substr(Order_Date, 6, 2) = '02' THEN ( + CASE + WHEN (CAST(substr(Order_Date, 1, 4) AS INTEGER) % 4 = 0 + AND CAST(substr(Order_Date, 1, 4) AS INTEGER) % 100 != 0) + OR CAST(substr(Order_Date, 1, 4) AS INTEGER) % 400 = 0 THEN substr(Order_Date, 9, 2) BETWEEN '01' AND '29' + ELSE substr(Order_Date, 9, 2) BETWEEN '01' AND '28' + END + ) + ELSE 0 + END = 1 + ); + """ + +# Query to find orders with negative quantities +def get_orders_with_negative_quantity(): + return """ + SELECT Order_ID, Customer_ID, Product_ID, Quantity + FROM Orders + WHERE Quantity < 0 + """ + +# Query to find orders with missing 
Customer_Name +def get_orders_with_missing_customer_name(): + return """ + SELECT Order_ID, Customer_ID, Customer_Name, Product_ID, Quantity + FROM Orders + WHERE Customer_Name IS NULL + """ + +# Query to ensure unique Product_ID (no duplicates allowed in Orders) +def get_orders_with_duplicate_product_id(): + return """ + SELECT Product_ID, COUNT(*) + FROM Orders + GROUP BY Product_ID + HAVING COUNT(*) > 1 + """ + +# Query to ensure Product_Name cannot be NULL in Products +def get_orders_with_null_product_name(): + return """ + SELECT * + FROM Products + WHERE Product_Name IS NULL + """ + +# Query to get email customer in Orders +def get_invalid_email_customers(): + """ + Query to find customers with invalid email format. + Returns rows where the email does not match the expected pattern. + """ + query = """ + SELECT * + FROM Orders + WHERE Email NOT LIKE '%_@__%.__%'; + """ + return query + +def get_orders_with_invalid_date_range(): + """ + Query to find orders where the Order_Date is outside the range '2024-01-01' to '2024-12-31'. + """ + query = """ + SELECT * + FROM Orders + WHERE Order_Date < '2024-01-01' OR Order_Date > '2024-12-31'; + """ + return query + +def get_invalid_product_references(): + """ + Returns the SQL query to check for invalid Product_ID references in the Orders table. 
+ """ + return """ + SELECT o.Order_ID, o.Product_ID + FROM Orders o + LEFT JOIN Products p ON o.Product_ID = p.Product_ID + WHERE p.Product_ID IS NULL; + """ \ No newline at end of file diff --git a/sql/sqlite_db/setup_db.py b/sql/sqlite_db/setup_db.py new file mode 100644 index 0000000..6224754 --- /dev/null +++ b/sql/sqlite_db/setup_db.py @@ -0,0 +1,39 @@ +import sqlite3 + +# Path to SQLite database +DB_PATH = 'sql/sqlite_db/etl.db' + +# Establish a connection +conn = sqlite3.connect(DB_PATH) +cursor = conn.cursor() + +# Drop tables if they exist to ensure schema updates +cursor.execute('DROP TABLE IF EXISTS Orders;') +cursor.execute('DROP TABLE IF EXISTS Products;') + +# Create the Orders table with the updated schema (including Email column) +cursor.execute(''' + CREATE TABLE Orders ( + Order_ID INTEGER PRIMARY KEY, + Customer_ID INTEGER, + Customer_Name TEXT, + Order_Date TEXT, + Product_ID INTEGER, + Quantity INTEGER, + Email TEXT + ); +''') + +# Create the Products table +cursor.execute(''' + CREATE TABLE Products ( + Product_ID INTEGER PRIMARY KEY, + Product_Name TEXT + ); +''') + +# Commit changes and close the connection +conn.commit() +conn.close() + +print("Database and tables set up successfully.") diff --git a/tests/load_data.py b/tests/load_data.py new file mode 100644 index 0000000..ddb214b --- /dev/null +++ b/tests/load_data.py @@ -0,0 +1,68 @@ +import sqlite3 +import os +from openpyxl import load_workbook + +# Path to the SQLite database +DB_PATH = 'sql/sqlite_db/etl.db' +# Path to the Excel file (dynamically resolve the absolute path) +EXCEL_FILE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '../orders_test_data.xlsx')) + +def load_data_to_db(): + # Load the workbook and the 'Products' and 'Orders' sheets + wb = load_workbook(EXCEL_FILE_PATH) + + # Access the 'Products' and 'Orders' sheets + products_sheet = wb['Products'] + orders_sheet = wb['Orders'] + + # Establish a database connection + conn = sqlite3.connect(DB_PATH) + 
cursor = conn.cursor() + + # Insert data into Products table + for row in products_sheet.iter_rows(min_row=2, values_only=True): + cursor.execute(''' + INSERT OR IGNORE INTO Products (Product_ID, Product_Name) + VALUES (?, ?) + ''', (row[0], row[1])) + + # Insert data into Orders table + for row in orders_sheet.iter_rows(min_row=2, values_only=True): + # Check if the row is empty (all fields are empty or None) + if all(cell is None or cell == '' for cell in row): + continue # Skip the row if it's empty + + customer_id = row[0] + customer_name = row[1] + order_date = row[2] # Order_Date is assumed to be in the third column (index 2) + product_id = row[3] + quantity = row[4] + email = row[5] + + # Ensure that 'Order_Date' stays as a string, not a date object + if isinstance(order_date, str): + # If the order_date is in string format (like '12/01/2024'), keep it as is + order_date = order_date.strip() # Remove leading/trailing whitespace and newlines + elif isinstance(order_date, datetime): + # If the order_date is a datetime object, convert it to string + order_date = order_date.strftime('%d/%m/%Y') if order_date else None + else: + order_date = None # Set to None if the date format is invalid + + # Skip inserting rows where required data (such as order_date or customer_id) is invalid + if not customer_id or not order_date: + continue # Skip this row if customer_id or order_date is missing or invalid + + cursor.execute(''' + INSERT INTO Orders (Customer_ID, Customer_Name, Order_Date, Product_ID, Quantity, Email) + VALUES (?, ?, ?, ?, ?, ?) 
+ ''', (customer_id, customer_name, order_date, product_id, quantity, email)) + + # Commit the changes and close the connection + conn.commit() + conn.close() + + print("Data loaded successfully from Excel to database.") + +if __name__ == '__main__': + load_data_to_db() diff --git a/tests/test_etl.py b/tests/test_etl.py new file mode 100644 index 0000000..1f235ea --- /dev/null +++ b/tests/test_etl.py @@ -0,0 +1,201 @@ +import pandas as pd +import sqlite3 +import pytest +import sys +import os +from datetime import datetime + +# Add the root directory of the project to the Python path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from sql.sqlite_db.db_queries import ( + validate_customer_id_unique, + validate_order_date_format, + get_orders_with_negative_quantity, + get_orders_with_missing_customer_name, + get_orders_with_duplicate_product_id, + get_orders_with_null_product_name, + get_invalid_email_customers, + get_orders_with_invalid_date_range, + get_invalid_product_references +) + +# Fixture to set up and tear down the SQLite connection +@pytest.fixture(scope="module") +def db_connection(): + # Ensure the database file exists + db_path = os.path.join(os.path.dirname(__file__), "../sql/sqlite_db/etl.db") + assert os.path.exists(db_path), f"Database file not found at {db_path}" + + conn = sqlite3.connect(db_path) + yield conn + conn.close() + +# Test case 1: Validate customer id unique +def test_customer_id_unique(db_connection): + query = validate_customer_id_unique() + df = pd.read_sql(query, db_connection) + + # If df is not empty, print the rows that have duplicates + if not df.empty: + print("\nDuplicate Customer_IDs found:") + print(df) + + # Assert that there are no duplicate orders for the same Customer_ID and Order_Date + assert df.empty, "Duplicate orders exist:\n" + df.to_string(index=False) + +def is_valid_date(date_str): + """Check if a date string is valid (dd/mm/yyyy).""" + try: + # Try parsing the date + 
datetime.strptime(date_str, '%d/%m/%Y') + return True + except ValueError: + return False + +# Test case 2: Validate date format dd/mm/yyyy format +def test_order_date_format(db_connection): + # Run the SQL query to fetch orders with invalid date formats + query = validate_order_date_format() # Your validation SQL query + df = pd.read_sql(query, db_connection) # Fetch the result into a DataFrame + + # Strip any unwanted characters like newlines + df['Order_Date'] = df['Order_Date'].str.replace(r'\n', '').str.strip() + + # Validate if the date is in the correct format and valid + invalid_dates = df[~df['Order_Date'].apply(is_valid_date)] + + # Print out any rows with invalid date formats + if not invalid_dates.empty: + print("Orders with invalid date format:", invalid_dates) + + # Assert that there are no invalid dates remaining + assert invalid_dates.empty, f"There are orders with invalid date formats: {invalid_dates}" + +# Test case 3: Validate Missing Customer Name +def test_missing_customer_name(db_connection): + query = get_orders_with_missing_customer_name() + df = pd.read_sql(query, db_connection) + + missing_customer_name = df['Customer_Name'].isnull().sum() # Count NaN/None values + print(f"Number of missing Customer_Name values: {missing_customer_name}") + + # Assert that there are no missing customer names (fail if there are any) + assert missing_customer_name == 0, f"There are orders with missing Customer_Name: {missing_customer_name}" + +# Test case 4: Validate Negative Quantity Orders +def test_negative_quantity(db_connection): + query = get_orders_with_negative_quantity() + df = pd.read_sql(query, db_connection) + + # Log for debugging + print("DataFrame loaded from the database:") + print(df) + print(f"Negative quantities found: {df[df['Quantity'] < 0]}") + + # Assert that there are NO negative quantities + negative_quantity_count = (df['Quantity'] < 0).sum() # Count negative quantities + assert negative_quantity_count == 0, f"Orders with negative 
quantity found: {negative_quantity_count}" + +# Test case 5: Verify order date range should be within month December only +def test_order_date_range(db_connection): + """ + Validate that all Order_Date values are within the range '2024-12-01' to '2024-12-31'. + Invalid dates should also be flagged separately. + """ + # Query all rows from the Orders table + cursor = db_connection.cursor() + cursor.execute("SELECT Order_ID, Order_Date FROM Orders") + rows = cursor.fetchall() + + invalid_dates = [] + out_of_range_dates = [] + + # Process each row + for row in rows: + order_id = row[0] + order_date = row[1] + + # Validate the date format + try: + # Parse the date assuming the format is 'DD/MM/YYYY' + parsed_date = datetime.strptime(order_date, '%d/%m/%Y') + print(f"Parsed Date: {parsed_date}") # Debugging output + + # Check if the date is out of the valid range (December 2024) + if not (datetime(2024, 12, 1) <= parsed_date <= datetime(2024, 12, 31)): + out_of_range_dates.append((order_id, order_date)) + except ValueError: + # If the date is invalid, add it to the invalid dates list + invalid_dates.append((order_id, order_date)) + + # Log invalid dates + if invalid_dates: + print("\nOrders with invalid date formats:") + for order_id, invalid_date in invalid_dates: + print(f"Order_ID: {order_id}, Invalid Date: {invalid_date}") + + # Log out-of-range dates + if out_of_range_dates: + print("\nOrders with out-of-range dates:") + for order_id, out_of_range_date in out_of_range_dates: + print(f"Order_ID: {order_id}, Out-of-Range Date: {out_of_range_date}") + + # Collect all errors and fail at the end + errors = [] + # Collect all errors and fail at the end + if invalid_dates: + errors.append(f"Invalid date formats: {invalid_dates}") + if out_of_range_dates: + errors.append(f"Out-of-range dates: {out_of_range_dates}") + + # Combine errors into a single line for better test summary display + error_message = " | ".join(errors) + assert not errors, error_message + +# Test case 
6: Test invalid email format +def test_invalid_email_format(db_connection): + """ + Test case to validate that all email addresses in the Orders table are in a valid format. + """ + query = get_invalid_email_customers() + df = pd.read_sql(query, db_connection) + + # Log for debugging + print("\nRows with invalid email format:") + print(df) + + # Assert that there are no rows with invalid email formats + assert df.empty, f"Invalid email addresses found:\n{df.to_string(index=False)}" + +# Test case 7: Ensure Unique Product_ID (no duplicates allowed) +def test_unique_product_id_in_order(db_connection): + query = get_orders_with_duplicate_product_id() + df = pd.read_sql(query, db_connection) + + assert df.empty, "There are duplicate Product_IDs in the Orders table" + +# Test case 8: Ensure Product_Name Cannot Be NULL +def test_product_name_not_null(db_connection): + query = get_orders_with_null_product_name() + df = pd.read_sql(query, db_connection) + + assert df.empty, "There are Products with NULL Product_Name" + +# Test case 9: Ensure Product_ID in Orders References a Valid Product_ID in Products +def test_referential_integrity(db_connection): + """ + Test case to validate referential integrity between Orders and Products tables. + Expected Behavior + If all Product_IDs in Orders have matching entries in Products, the query should return no rows. + If any Product_ID in Orders does not have a match in Products, the query should return those Order_IDs and their invalid Product_IDs. + """ + query = get_invalid_product_references() + df = pd.read_sql(query, db_connection) + + # Log for debugging + print("\nRows with invalid Product_ID references:") + print(df.to_string(index=False) if not df.empty else "No issues found.") + + # Assert that there are no rows with invalid Product_ID references + assert df.empty, f"Referential integrity issues found:\n{df.to_string(index=False)}"