#!/usr/bin/env python3
# timeseries_compute/data_generator.py
"""
Time Series Data Generation Module.
This module provides functionality for generating synthetic price series data with
controlled statistical properties. It's designed as the first step in a typical
time series analysis pipeline, creating test data with known characteristics.
Key Components:
- PriceSeriesGenerator: Class for generating correlated price series
- generate_price_series: Convenience function with simplified interface
- set_random_seed: Function to ensure reproducible results
Typical Usage Flow:
1. Create a PriceSeriesGenerator instance with desired date range
2. Generate price series with specific initial values and correlations
3. Proceed with the generated data to data_processor.py for preparation
The generated price series follow a random walk with drift, with options
to control cross-series correlations.
"""
import logging as l
# script specific imports
import numpy as np # for sqrt function
import pandas as pd
import random
from tabulate import tabulate # pretty print dfs
from typing import Dict, Tuple, Optional, List # type hints for better code readability
# Default seed for the standard-library ``random`` module. It is applied once,
# at import time, so repeated runs start from the same pseudo-random state.
DEFAULT_RANDOM_SEED: int = 2025
random.seed(a=DEFAULT_RANDOM_SEED)
class PriceSeriesGenerator:
    """
    Generator of price series over a business-day calendar.

    Attributes:
        start_date (str): First day of the requested range, YYYY-MM-DD.
        end_date (str): Last day of the requested range, YYYY-MM-DD.
        dates (pd.DatetimeIndex): Business days (``freq="B"``) between
            start_date and end_date, as produced by ``pd.date_range``.

    Methods:
        __init__(start_date: str, end_date: str):
            Stores the range and builds the business-day index.
        generate_correlated_prices(anchor_prices: dict, correlation_matrix: Optional[Dict[Tuple[str, str], float]] = None) -> Dict[str, list]:
            Generates correlated prices for the given tickers and initial prices.
            NOTE(review): the definition of this method is not visible next
            to the constructor -- confirm it is still provided on the class.
    """

    def __init__(self, start_date: str, end_date: str):
        """
        Set up the generator for the given date range.

        Args:
            start_date (str): start, YYYY-MM-DD
            end_date (str): end, YYYY-MM-DD
        """
        # Announce construction in the log (banner text is part of the output).
        l.info("\n\n\t> PriceSeriesGenerator <\n")

        # "B" is the pandas business-day frequency: Saturdays/Sundays are skipped.
        business_days = pd.date_range(start=start_date, end=end_date, freq="B")

        self.start_date, self.end_date = start_date, end_date
        self.dates = business_days
# Convenience helper: reseed the standard-library random module (a thin wrapper around random.seed).
def set_random_seed(seed: int = DEFAULT_RANDOM_SEED) -> None:
    """
    Reseed the standard-library ``random`` module and log the value used.

    Only ``random.seed`` is called here; no other random number generator
    is touched by this function.

    Args:
        seed (int): Seed value; defaults to DEFAULT_RANDOM_SEED.
    """
    l.info(f"Setting random seed to {seed}")
    random.seed(a=seed)
def generate_price_series(
    start_date: str = "2023-01-01",
    end_date: str = "2023-12-31",
    anchor_prices: Optional[Dict[str, float]] = None,
    random_seed: Optional[int] = None,
    correlations: Optional[Dict[Tuple[str, str], float]] = None,
) -> Tuple[Dict[str, list], pd.DataFrame]:
    """
    Build a price series for the requested range via PriceSeriesGenerator.

    The result is returned in two forms, a dictionary and a DataFrame, so
    callers can use whichever is more convenient.

    Args:
        start_date (str, optional): Start of the range. Defaults to "2023-01-01".
        end_date (str, optional): End of the range. Defaults to "2023-12-31".
        anchor_prices (Dict[str, float], optional): Tickers mapped to their
            initial prices. Defaults to {"GME": 100.0, "BYND": 200.0} if None.
        random_seed (int, optional): When given, passed to set_random_seed
            before any prices are generated.
        correlations (Dict[Tuple[str, str], float], optional): Correlations
            between ticker pairs; forwarded as ``correlation_matrix`` to
            ``generate_correlated_prices``.

    Returns:
        Tuple[Dict[str, list], pd.DataFrame]: The dictionary returned by
        ``generate_correlated_prices`` and a DataFrame built from it,
        indexed by the generator's ``dates``.
    """
    starting_prices = (
        {"GME": 100.0, "BYND": 200.0} if anchor_prices is None else anchor_prices
    )

    # Reseed first so the seed message is logged before the generation message.
    if random_seed is not None:
        set_random_seed(random_seed)

    l.info("Generating price series data")
    series_gen = PriceSeriesGenerator(start_date=start_date, end_date=end_date)
    prices_by_ticker = series_gen.generate_correlated_prices(
        anchor_prices=starting_prices,
        correlation_matrix=correlations,
    )

    # Reuse the generator's own business-day index rather than rebuilding it.
    price_frame = pd.DataFrame(prices_by_ticker, index=series_gen.dates)
    return prices_by_ticker, price_frame