gogo2/verify_no_synthetic_data.py

#!/usr/bin/env python3
"""
NO SYNTHETIC DATA VERIFICATION SCRIPT

This script scans the entire codebase to ensure NO synthetic, mock,
dummy, or generated data implementations remain.

Run this script to verify 100% real market data compliance.
"""

import os
import re
import sys
from pathlib import Path
from typing import List, Dict, Tuple

# Patterns that indicate synthetic data
FORBIDDEN_PATTERNS = [
    r'np\.random\.',
    r'random\.uniform',
    r'random\.choice',
    r'random\.normal',
    r'generate.*data',
    r'create.*fake',
    r'dummy.*data',
    r'mock.*data',
    r'simulate.*',
    r'synthetic.*data',
    r'fake.*data',
    r'test.*data.*=',
    r'simulated.*',
    r'generated.*data'
]

# Allowed exceptions (legitimate uses)
ALLOWED_EXCEPTIONS = [
    'np.random.choice',  # In training for batch sampling
    'random exploration',  # RL exploration
    'random seed',  # For reproducibility
    'random.choice.*action',  # RL action selection
    'random sample',  # Data sampling (not generation)
    'model.train.*random',  # Training mode randomness
    'test.*real.*data',  # Testing with real data
    'random.*shuffle',  # Data shuffling
    'random.*split'  # Data splitting
]

# File extensions to check
EXTENSIONS = ['.py', '.md', '.txt', '.json', '.yaml', '.yml']

def is_allowed_exception(line: str, pattern: str) -> bool:
    """Check if a pattern match is an allowed exception"""
    line_lower = line.lower()
    line_stripped = line.strip()

    # Skip comments and documentation
    if line_stripped.startswith('#') or line_stripped.startswith('*') or line_stripped.startswith('//'):
        return True

    # Skip markdown documentation
    if any(keyword in line_lower for keyword in ['code:', '```', 'line ', '📁', '❌', '✅']):
        return True

    # Skip policy documentation (mentions of forbidden things in policy docs)
    if any(keyword in line_lower for keyword in ['policy', 'forbidden', 'not allowed', 'never use', 'zero synthetic']):
        return True

    # Skip error messages and logging about synthetic data
    if any(keyword in line_lower for keyword in ['logger.', 'print(', 'error(', 'warning(']):
        return True

    # Skip variable names and string literals mentioning synthetic data
    if any(keyword in line_lower for keyword in ['_synthetic_', 'allow_synthetic', 'no synthetic']):
        return True

    # Skip function/method definitions that handle real data
    if any(keyword in line_lower for keyword in ['def ', 'class ', 'from real', 'real market']):
        return True

    # Check for legitimate RL exploration (with context)
    if any(keyword in line_lower for keyword in ['exploration', 'epsilon', 'action selection', 'random exploration']):
        return True

    # Check for legitimate training randomness
    if any(keyword in line_lower for keyword in ['batch.*sample', 'shuffle', 'split', 'randint.*start']):
        return True

    # Check for reproducibility
    if 'seed' in line_lower:
        return True

    # Check for legitimate data operations (not generation)
    if any(keyword in line_lower for keyword in ['test_data =', 'latest_data =', 'test_dataset =']):
        return True

    # Skip verification script itself
    if 'verify_no_synthetic_data.py' in str(line):
        return True

    # Check other allowed patterns
    for exception in ALLOWED_EXCEPTIONS:
        if re.search(exception, line_lower):
            return True

    return False

def scan_file(file_path: Path) -> List[Tuple[int, str, str]]:
    """Scan a file for forbidden patterns"""
    violations = []

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()

        for line_num, line in enumerate(lines, 1):
            for pattern in FORBIDDEN_PATTERNS:
                if re.search(pattern, line, re.IGNORECASE):
                    # Check if it's an allowed exception
                    if not is_allowed_exception(line, pattern):
                        violations.append((line_num, pattern, line.strip()))

    except Exception as e:
        print(f"⚠️  Error scanning {file_path}: {e}")

    return violations

def scan_codebase(root_path: Path) -> Dict[str, List[Tuple[int, str, str]]]:
    """Scan entire codebase for synthetic data violations"""
    violations = {}

    # Skip certain directories
    skip_dirs = {'.git', '__pycache__', 'node_modules', '.vscode', 'cache', 'logs', 'runs'}

    for root, dirs, files in os.walk(root_path):
        # Skip unwanted directories
        dirs[:] = [d for d in dirs if d not in skip_dirs]

        for file in files:
            file_path = Path(root) / file

            # Check only relevant file types
            if file_path.suffix in EXTENSIONS:
                file_violations = scan_file(file_path)
                if file_violations:
                    relative_path = file_path.relative_to(root_path)
                    violations[str(relative_path)] = file_violations

    return violations

def main():
    """Main verification function"""
    print("🔍 SCANNING CODEBASE FOR SYNTHETIC DATA VIOLATIONS...")
    print("=" * 80)

    # Get project root
    project_root = Path(__file__).parent

    # Scan codebase
    violations = scan_codebase(project_root)

    if not violations:
        print("✅ SUCCESS: NO SYNTHETIC DATA FOUND!")
        print("🎯 100% REAL MARKET DATA COMPLIANCE VERIFIED")
        print("🚫 Zero synthetic, mock, dummy, or generated data")
        print("=" * 80)
        return 0

    # Report violations
    print(f"❌ FOUND {len(violations)} FILES WITH POTENTIAL SYNTHETIC DATA:")
    print("=" * 80)

    total_violations = 0
    for file_path, file_violations in violations.items():
        print(f"\n📁 {file_path}:")
        for line_num, pattern, line in file_violations:
            total_violations += 1
            print(f"   Line {line_num}: {pattern}")
            print(f"   Code: {line[:100]}...")

    print("=" * 80)
    print(f"❌ TOTAL VIOLATIONS: {total_violations}")
    print("🚨 CRITICAL: Synthetic data detected - must be removed!")
    print("🎯 Only 100% real market data is allowed")

    return 1

if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)