inference data storage

This commit is contained in:
Dobromir Popov
2025-07-24 15:31:57 +03:00
parent fa07265a16
commit d17af5ca4b
2 changed files with 83 additions and 1 deletions

View File

@ -181,7 +181,8 @@
## Model Inference Data Validation and Storage
- [x] 5. Implement comprehensive inference data validation system
- Create InferenceDataValidator class for input validation
- Validate complete OHLCV dataframes for all required timeframes
- Check input data dimensions against model requirements

View File

@ -28,6 +28,7 @@ import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from pathlib import Path
from .config import get_config
from .data_provider import DataProvider
@ -1311,6 +1312,86 @@ class TradingOrchestrator:
else: else:
return obj return obj
def load_inference_history_from_disk(self, symbol: str, days_back: int = 7) -> List[Dict]:
    """Load persisted inference records for *symbol* for training replay.

    Records live as JSON files under ``training_data/inference_history``,
    named ``{symbol}_{YYYYmmdd}_{HHMMSS}.json``. The timestamp is parsed from
    the filename so stale files can be skipped without opening them.

    Args:
        symbol: Trading symbol used as the filename prefix of the glob.
        days_back: Only files whose filename timestamp falls within this many
            days of now are loaded. Defaults to 7.

    Returns:
        List of inference-record dicts, sorted by their ``'timestamp'`` field.
        Empty list when the directory does not exist or on a fatal error.
    """
    try:
        inference_dir = Path("training_data/inference_history")
        if not inference_dir.exists():
            return []

        # Files older than this (per their filename timestamp) are skipped.
        cutoff_date = datetime.now() - timedelta(days=days_back)
        inference_records = []

        for filepath in inference_dir.glob(f"{symbol}_*.json"):
            try:
                # Filename format: {symbol}_{date}_{time}.json — rebuild the
                # timestamp from the last two underscore-separated parts.
                filename_parts = filepath.stem.split('_')
                if len(filename_parts) >= 3:
                    timestamp_str = f"{filename_parts[-2]}_{filename_parts[-1]}"
                    file_timestamp = datetime.strptime(timestamp_str, '%Y%m%d_%H%M%S')

                    if file_timestamp >= cutoff_date:
                        with open(filepath, 'r') as f:
                            record = json.load(f)
                            inference_records.append(record)
            except Exception as e:
                # One unreadable/misnamed file must not abort the whole load.
                logger.warning(f"Error loading inference file {filepath}: {e}")
                continue

        # Sort chronologically. Use .get with a default so a single malformed
        # record (missing 'timestamp') cannot raise KeyError here and — via the
        # outer except — silently discard every record that WAS loaded.
        inference_records.sort(key=lambda x: x.get('timestamp', ''))

        logger.info(f"Loaded {len(inference_records)} inference records for {symbol} from disk")
        return inference_records

    except Exception as e:
        logger.error(f"Error loading inference history from disk: {e}")
        return []
def get_model_training_data(self, model_name: str, symbol: str = None) -> List[Dict]:
    """Collect every inference record produced by *model_name* for training.

    Merges in-memory records (``self.inference_history``) with records
    persisted on disk, de-duplicates by (timestamp, symbol) — the same record
    can appear in both sources — and returns them in chronological order.

    Args:
        model_name: Name of the model whose inference records are wanted.
        symbol: Restrict the scan to one symbol; when None, every symbol in
            ``self.symbols`` is scanned.

    Returns:
        Chronologically sorted list of unique inference-record dicts;
        empty list on error.
    """
    try:
        symbols_to_check = [symbol] if symbol else self.symbols
        training_data = []

        # In-memory records first (cheapest to access). .get avoids a
        # separate membership test and handles unknown symbols.
        for sym in symbols_to_check:
            for record in self.inference_history.get(sym, []):
                # .get: a single malformed record (missing 'model_name')
                # must not raise and — via the outer except — wipe the
                # entire result set.
                if record.get('model_name') == model_name:
                    training_data.append(record)

        # Also load from disk for more comprehensive training history.
        for sym in symbols_to_check:
            for record in self.load_inference_history_from_disk(sym):
                if record.get('model_name') == model_name:
                    training_data.append(record)

        # De-duplicate on (timestamp, symbol): memory and disk overlap.
        seen_timestamps = set()
        unique_data = []
        for record in training_data:
            timestamp_key = f"{record.get('timestamp')}_{record.get('symbol')}"
            if timestamp_key not in seen_timestamps:
                seen_timestamps.add(timestamp_key)
                unique_data.append(record)

        # .get default keeps one record without 'timestamp' from aborting
        # the sort (and therefore the whole call).
        unique_data.sort(key=lambda x: x.get('timestamp', ''))

        logger.info(f"Retrieved {len(unique_data)} training records for {model_name}")
        return unique_data

    except Exception as e:
        logger.error(f"Error getting model training data: {e}")
        return []
async def _trigger_model_training(self, symbol: str): async def _trigger_model_training(self, symbol: str):
"""Trigger training for models based on previous inference data""" """Trigger training for models based on previous inference data"""
try: try: