WIP OHLCV in storage. Migrate to DuckDB

Dobromir Popov
2025-10-24 15:57:33 +03:00
parent d4ed894a92
commit 809c32e7a8
7 changed files with 1417 additions and 92 deletions

View File

@@ -2,10 +2,12 @@
Annotation Manager - Manages trade annotations and test case generation

Handles storage, retrieval, and test case generation from manual trade annotations.
Stores annotations in both JSON (legacy) and DuckDB (with full market data).
"""
import json
import uuid
+import sys
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Any
@@ -13,8 +15,20 @@ from dataclasses import dataclass, asdict
import logging
import pytz

+# Add parent directory to path for imports
+parent_dir = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(parent_dir))

logger = logging.getLogger(__name__)

+# Import DuckDB storage
+try:
+    from core.duckdb_storage import DuckDBStorage
+    DUCKDB_AVAILABLE = True
+except ImportError:
+    DUCKDB_AVAILABLE = False
+    logger.warning("DuckDB storage not available for annotations")

@dataclass
class TradeAnnotation:
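
Note: the guarded import above assumes a core.duckdb_storage module that ships a DuckDBStorage class. That module is not part of this diff, so the following is only a sketch of the interface the call sites in this commit rely on - the constructor argument and method body are assumptions, not the actual implementation:

    # Sketch of the assumed interface; only the method name and keyword
    # arguments are taken from the call sites in this diff.
    from typing import Any, Dict, List, Optional
    import duckdb
    import pandas as pd

    class DuckDBStorage:
        def __init__(self, db_path: str = 'annotations.duckdb'):  # default path is a guess
            self.conn = duckdb.connect(db_path)

        def store_annotation(self, annotation_id: str,
                             annotation_data: Dict[str, Any],
                             market_snapshots: Dict[str, pd.DataFrame],
                             model_predictions: Optional[List[Dict]] = None) -> None:
            """Persist annotation metadata plus one OHLCV snapshot per timeframe."""
            ...  # actual persistence lives in core/duckdb_storage.py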
@@ -51,6 +65,15 @@ class AnnotationManager:
        self.annotations_db = self._load_annotations()

+        # Initialize DuckDB storage for complete annotation data
+        self.duckdb_storage: Optional[DuckDBStorage] = None
+        if DUCKDB_AVAILABLE:
+            try:
+                self.duckdb_storage = DuckDBStorage()
+                logger.info("DuckDB storage initialized for annotations")
+            except Exception as e:
+                logger.warning(f"Could not initialize DuckDB storage: {e}")

        logger.info(f"AnnotationManager initialized with storage: {self.storage_path}")

    def _load_annotations(self) -> Dict[str, List[Dict]]:
@@ -122,17 +145,39 @@ class AnnotationManager:
        logger.info(f"  Exit state: {len(exit_market_state or {})} timeframes")

        return annotation

-    def save_annotation(self, annotation: TradeAnnotation):
-        """Save annotation to storage"""
+    def save_annotation(self, annotation: TradeAnnotation,
+                        market_snapshots: Dict = None,
+                        model_predictions: List[Dict] = None):
+        """
+        Save annotation to storage (JSON + DuckDB)
+
+        Args:
+            annotation: TradeAnnotation object
+            market_snapshots: Dict of {timeframe: DataFrame} with OHLCV data
+            model_predictions: List of model predictions at annotation time
+        """
        # Convert to dict
        ann_dict = asdict(annotation)

-        # Add to database
+        # Add to JSON database (legacy)
        self.annotations_db["annotations"].append(ann_dict)

-        # Save to file
+        # Save to JSON file
        self._save_annotations()

+        # Save to DuckDB with complete market data
+        if self.duckdb_storage and market_snapshots:
+            try:
+                self.duckdb_storage.store_annotation(
+                    annotation_id=annotation.annotation_id,
+                    annotation_data=ann_dict,
+                    market_snapshots=market_snapshots,
+                    model_predictions=model_predictions
+                )
+                logger.info(f"Saved annotation {annotation.annotation_id} to DuckDB with {len(market_snapshots)} timeframes")
+            except Exception as e:
+                logger.error(f"Could not save annotation to DuckDB: {e}")

        logger.info(f"Saved annotation: {annotation.annotation_id}")

    def get_annotations(self, symbol: str = None,
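
For callers, the new signature means an annotation can still be saved without snapshots (JSON-only), or with a dict of DataFrames. A usage sketch - `manager`, `ann`, and the DataFrames are stand-ins, and the prediction dict shape is hypothetical:

    # Hypothetical caller; df_1m/df_1s are OHLCV DataFrames from any loader.
    snapshots = {'1m': df_1m, '1s': df_1s}
    manager.save_annotation(
        annotation=ann,
        market_snapshots=snapshots,
        model_predictions=[
            {'model': 'cnn_v1', 'direction': 'long', 'confidence': 0.72},  # shape is illustrative
        ],
    )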

View File

@@ -36,7 +36,10 @@ class HistoricalDataLoader:
        self.memory_cache = {}
        self.cache_ttl = timedelta(minutes=5)

-        logger.info("HistoricalDataLoader initialized with existing DataProvider")
+        # Startup mode - allow stale cache for faster loading
+        self.startup_mode = True
+
+        logger.info("HistoricalDataLoader initialized with existing DataProvider (startup mode: ON)")

    def get_data(self, symbol: str, timeframe: str,
                 start_time: Optional[datetime] = None,
@@ -130,12 +133,22 @@ class HistoricalDataLoader:
                return df

        # Fallback: fetch from DataProvider's historical data method
-        logger.info(f"Fetching fresh data for {symbol} {timeframe}")
-        df = self.data_provider.get_historical_data(
-            symbol=symbol,
-            timeframe=timeframe,
-            limit=limit
-        )
+        # During startup, allow stale cache to avoid slow API calls
+        if self.startup_mode:
+            logger.info(f"Loading data for {symbol} {timeframe} (startup mode: allow stale cache)")
+            df = self.data_provider.get_historical_data(
+                symbol=symbol,
+                timeframe=timeframe,
+                limit=limit,
+                allow_stale_cache=True
+            )
+        else:
+            logger.info(f"Fetching fresh data for {symbol} {timeframe}")
+            df = self.data_provider.get_historical_data(
+                symbol=symbol,
+                timeframe=timeframe,
+                limit=limit
+            )

        if df is not None and not df.empty:
            # Filter by time range if specified
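
The allow_stale_cache=True kwarg is simply forwarded to DataProvider.get_historical_data; the provider side is not in this diff. A plausible reading of the flag, assuming the provider keeps a TTL-stamped cache (all names below are assumptions, not the repo's actual code):

    # Hypothetical provider-side handling of allow_stale_cache.
    def get_historical_data(self, symbol, timeframe, limit=500,
                            refresh=False, allow_stale_cache=False):
        entry = self._cache.get((symbol, timeframe))  # assumed cache layout
        if entry is not None and not refresh:
            age = datetime.now() - entry.fetched_at
            # Startup mode: accept entries past the normal TTL instead of a slow API call.
            if age <= self.cache_ttl or allow_stale_cache:
                return entry.df.tail(limit)
        return self._fetch_from_exchange(symbol, timeframe, limit)  # assumed helper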
@@ -219,6 +232,11 @@ class HistoricalDataLoader:
        self.memory_cache.clear()
        logger.info("Memory cache cleared")

+    def disable_startup_mode(self):
+        """Disable startup mode to fetch fresh data"""
+        self.startup_mode = False
+        logger.info("Startup mode disabled - will fetch fresh data on next request")

    def get_data_boundaries(self, symbol: str, timeframe: str) -> Tuple[Optional[datetime], Optional[datetime]]:
        """
        Get the earliest and latest available data timestamps
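
Taken together with the constructor change above, the intended lifecycle looks like this (a sketch; constructor arguments are elided):

    loader = HistoricalDataLoader(data_provider)        # startup_mode starts as True
    df = loader.get_data('ETH/USDT', '1m', limit=100)   # may be served from stale cache
    loader.disable_startup_mode()
    df = loader.get_data('ETH/USDT', '1m', limit=100)   # now fetches fresh data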

View File

@@ -163,6 +163,10 @@ class AnnotationDashboard:
        # Setup routes
        self._setup_routes()

+        # Start background data refresh after startup
+        if self.data_loader:
+            self._start_background_data_refresh()

        logger.info("Annotation Dashboard initialized")

    def _enable_unified_storage_async(self):
@@ -201,6 +205,58 @@ class AnnotationDashboard:
        storage_thread = threading.Thread(target=enable_storage, daemon=True)
        storage_thread.start()

+    def _start_background_data_refresh(self):
+        """Start background task to refresh recent data after startup"""
+        def refresh_recent_data():
+            try:
+                import time
+                # Wait for app to fully start
+                time.sleep(5)
+
+                logger.info("🔄 Starting background data refresh (fetching only recent missing data)")
+
+                # Disable startup mode to fetch fresh data
+                self.data_loader.disable_startup_mode()
+
+                # Fetch only last 5 minutes of 1m data and 300 seconds of 1s data
+                symbols = self.config.get('symbols', ['ETH/USDT', 'BTC/USDT'])
+                for symbol in symbols:
+                    try:
+                        # Fetch last 5 candles of 1m data (5 minutes)
+                        logger.info(f"Refreshing recent 1m data for {symbol}")
+                        self.data_provider.get_historical_data(
+                            symbol=symbol,
+                            timeframe='1m',
+                            limit=5,
+                            refresh=True
+                        )
+
+                        # Fetch last 300 candles of 1s data (5 minutes)
+                        logger.info(f"Refreshing recent 1s data for {symbol}")
+                        self.data_provider.get_historical_data(
+                            symbol=symbol,
+                            timeframe='1s',
+                            limit=300,
+                            refresh=True
+                        )
+
+                        logger.info(f"✅ Refreshed recent data for {symbol}")
+                    except Exception as e:
+                        logger.warning(f"Could not refresh recent data for {symbol}: {e}")
+
+                logger.info("✅ Background data refresh completed")
+            except Exception as e:
+                logger.error(f"Error in background data refresh: {e}")
+
+        # Start in background thread
+        import threading
+        refresh_thread = threading.Thread(target=refresh_recent_data, daemon=True)
+        refresh_thread.start()
+        logger.info("📊 Background data refresh scheduled")

    def _get_pivot_markers_for_timeframe(self, symbol: str, timeframe: str, df: pd.DataFrame) -> dict:
        """
        Get pivot markers for a specific timeframe using WilliamsMarketStructure directly
@@ -526,8 +582,38 @@ class AnnotationDashboard:
                exit_market_state=exit_market_state
            )

-            # Save annotation
-            self.annotation_manager.save_annotation(annotation)
+            # Collect market snapshots for DuckDB storage
+            market_snapshots = {}
+            if self.data_loader:
+                try:
+                    # Get OHLCV data for all timeframes around the annotation time
+                    entry_time = datetime.fromisoformat(data['entry']['timestamp'].replace('Z', '+00:00'))
+                    exit_time = datetime.fromisoformat(data['exit']['timestamp'].replace('Z', '+00:00'))
+
+                    # Get data from 5 minutes before entry to 5 minutes after exit
+                    start_time = entry_time - timedelta(minutes=5)
+                    end_time = exit_time + timedelta(minutes=5)
+
+                    for timeframe in ['1s', '1m', '1h', '1d']:
+                        df = self.data_loader.get_data(
+                            symbol=data['symbol'],
+                            timeframe=timeframe,
+                            start_time=start_time,
+                            end_time=end_time,
+                            limit=1500
+                        )
+                        if df is not None and not df.empty:
+                            market_snapshots[timeframe] = df
+
+                    logger.info(f"Collected {len(market_snapshots)} timeframes for annotation storage")
+                except Exception as e:
+                    logger.error(f"Error collecting market snapshots: {e}")
+
+            # Save annotation with market snapshots
+            self.annotation_manager.save_annotation(
+                annotation=annotation,
+                market_snapshots=market_snapshots
+            )

            # Automatically generate test case with ±5min data
            try:
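
Downstream, store_annotation has to land these per-timeframe DataFrames in DuckDB. One natural approach is DuckDB's native DataFrame registration; this is a sketch under an assumed table layout, not the schema core/duckdb_storage.py actually creates:

    import duckdb

    conn = duckdb.connect('annotations.duckdb')  # path is an assumption
    conn.execute("""
        CREATE TABLE IF NOT EXISTS annotation_ohlcv (
            annotation_id VARCHAR,
            timeframe     VARCHAR,
            timestamp     TIMESTAMP,
            open DOUBLE, high DOUBLE, low DOUBLE, close DOUBLE, volume DOUBLE
        )
    """)

    def store_snapshots(annotation_id: str, market_snapshots: dict) -> None:
        for timeframe, df in market_snapshots.items():
            snap = df.reset_index()  # assumes a DatetimeIndex named 'timestamp'
            conn.register('snap', snap)
            conn.execute(
                "INSERT INTO annotation_ohlcv "
                "SELECT ?, ?, timestamp, open, high, low, close, volume FROM snap",
                [annotation_id, timeframe],
            )
            conn.unregister('snap')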