Source code for tests.test_data_processor

#!/usr/bin/env python3
# tests/test_data_processor.py

import pytest
import logging
import numpy as np
import pandas as pd
import warnings
from timeseries_compute.data_processor import (
    MissingDataHandler,
    MissingDataHandlerFactory,
    test_stationarity,
    fill_data,
)
from unittest.mock import patch, MagicMock
from statsmodels.tsa.stattools import adfuller


# Add fixture for the test_stationarity function in the module

[docs]
@pytest.fixture
def df():
    """Fixture to provide a DataFrame for testing stationarity."""
    # Create a test DataFrame with some values
    return pd.DataFrame(
        {"A": [1.0, 2.0, 3.0, 4.0, 5.0], "B": [5.0, 6.0, 7.0, 8.0, 9.0]}
    )




[docs]
@pytest.fixture
def sample_data():
    """Fixture to provide sample data for testing."""
    data = {
        "A": [1.0, 2.0, None, 4.0, 5.0],
        "B": [None, 2.0, 3.0, None, 5.0],
        "C": [1.0, None, None, 4.0, 5.0],
    }
    return pd.DataFrame(data)




[docs]
def test_create_handler_drop():
    """Test the create_handler method of MissingDataHandlerFactory with 'drop' strategy."""
    handler_func = MissingDataHandlerFactory.create_handler("drop")
    assert callable(handler_func)
    assert handler_func.__name__ == "drop_na"




[docs]
def test_create_handler_forward_fill():
    """Test the create_handler method of MissingDataHandlerFactory with 'forward_fill' strategy."""
    handler_func = MissingDataHandlerFactory.create_handler("forward_fill")
    assert callable(handler_func)
    assert handler_func.__name__ == "forward_fill"




[docs]
def test_create_handler_invalid_strategy():
    """Test the create_handler method of MissingDataHandlerFactory with an invalid strategy."""
    with pytest.raises(ValueError):
        MissingDataHandlerFactory.create_handler("invalid_strategy")




[docs]
@pytest.fixture
def sample_df():
    """Create a sample DataFrame with NaN values for testing."""
    return pd.DataFrame(
        {
            "A": [1.0, 2.0, np.nan, 4.0, 5.0],
            "B": [np.nan, 2.0, 3.0, np.nan, 5.0],
            "C": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    )




[docs]
def test_drop_na():
    """Test drop_na method removes rows with NaN values."""
    # Create test data
    df = pd.DataFrame({"A": [1.0, 2.0, np.nan, 4.0], "B": [5.0, np.nan, 7.0, 8.0]})

    # Create handler instance
    handler = MissingDataHandler()

    # Expected result
    expected = pd.DataFrame({"A": [1.0, 4.0], "B": [5.0, 8.0]}, index=[0, 3])

    # Test method
    result = handler.drop_na(df)

    # Check equality
    pd.testing.assert_frame_equal(result, expected)




[docs]
def test_forward_fill(sample_df):
    """Test forward_fill method correctly propagates values forward."""
    # Create handler instance
    handler = MissingDataHandler()

    # Suppress the FutureWarning for the test
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        result = handler.forward_fill(sample_df)

    expected = pd.DataFrame(
        {
            "A": [1.0, 2.0, 2.0, 4.0, 5.0],
            "B": [np.nan, 2.0, 3.0, 3.0, 5.0],
            "C": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    )

    pd.testing.assert_frame_equal(result, expected)




[docs]
def test_fill_data(sample_df):
    """Test fill_data with various filling methods."""
    # Test forward fill (default)
    # Suppress the FutureWarning for the test
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        result_forward = fill_data(sample_df, strategy="forward_fill")

    expected_forward = pd.DataFrame(
        {
            "A": [1.0, 2.0, 2.0, 4.0, 5.0],
            "B": [np.nan, 2.0, 3.0, 3.0, 5.0],
            "C": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    )

    pd.testing.assert_frame_equal(result_forward, expected_forward)

    # Test drop
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        result_drop = fill_data(sample_df, strategy="drop")

    # The dataset has NaNs in rows 0, 2, and 3, so only rows 1 and 4 remain
    expected_drop = pd.DataFrame(
        {"A": [2.0, 5.0], "B": [2.0, 5.0], "C": [2.0, 5.0]}, index=[1, 4]
    )

    pd.testing.assert_frame_equal(result_drop, expected_drop)

    # Test invalid method
    with pytest.raises(ValueError):
        fill_data(sample_df, strategy="invalid_strategy")




[docs]
@patch("timeseries_compute.data_processor.adfuller")
def test_ts_stationarity(mock_adfuller):
    """Test test_stationarity function properly uses adfuller test."""
    # Mock adfuller return value (test statistic, p-value, lags, nobs, critical values, icbest)
    mock_adfuller.return_value = (
        -3.5,
        0.01,
        1,
        100,
        {"1%": -3.5, "5%": -2.9, "10%": -2.6},
        1,
    )

    # Create test dataframe (not series)
    df = pd.DataFrame({"test_col": [1.0, 2.0, 3.0, 4.0, 5.0]})

    # Call function
    result = test_stationarity(df)

    # Check mock was called with right args
    mock_adfuller.assert_called_once()

    # Check expected structure based on implementation
    assert isinstance(result, dict)
    assert "test_col" in result
    assert "ADF Statistic" in result["test_col"]
    assert "p-value" in result["test_col"]
    assert result["test_col"]["ADF Statistic"] == -3.5
    assert result["test_col"]["p-value"] == 0.01




[docs]
def test_stationarity_integration():
    """Integration test for test_stationarity with real adfuller function."""
    # Create stationary series
    np.random.seed(42)

    # Use more random data to avoid divide by zero in statsmodels
    stationary_series = np.random.randn(100) * 10 + 5  # scaled and shifted

    # Create non-stationary series (random walk)
    random_walk = np.cumsum(np.random.randn(100) * 2)  # scaled random walk

    # Create a DataFrame for testing (since test_stationarity requires DataFrame)
    df = pd.DataFrame({"stationary": stationary_series, "non_stationary": random_walk})

    # Suppress RuntimeWarning from statsmodels
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        # Call function with real adfuller
        adf_results = test_stationarity(df)

    # Check structure without returning
    assert isinstance(adf_results, dict)
    assert "stationary" in adf_results
    assert "non_stationary" in adf_results
    assert isinstance(adf_results["stationary"]["ADF Statistic"], float)
    assert isinstance(adf_results["stationary"]["p-value"], float)
    assert isinstance(adf_results["non_stationary"]["ADF Statistic"], float)
    assert isinstance(adf_results["non_stationary"]["p-value"], float)