gogo2/verify_no_synthetic_data.py
2025-05-25 00:28:52 +03:00

189 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
NO SYNTHETIC DATA VERIFICATION SCRIPT
This script scans the entire codebase to ensure NO synthetic, mock,
dummy, or generated data implementations remain.
Run this script to verify 100% real market data compliance.
"""
import os
import re
import sys
from pathlib import Path
from typing import List, Dict, Tuple
# Patterns that indicate synthetic data
FORBIDDEN_PATTERNS = [
    # Direct calls into random-number generators.
    r"np\.random\.",
    r"random\.uniform",
    r"random\.choice",
    r"random\.normal",
    # Names and phrases that suggest fabricated data. scan_file applies
    # every entry to each line with re.search(..., re.IGNORECASE), so the
    # entries are regular expressions, not plain substrings.
    r"generate.*data",
    r"create.*fake",
    r"dummy.*data",
    r"mock.*data",
    r"simulate.*",
    r"synthetic.*data",
    r"fake.*data",
    r"test.*data.*=",
    r"simulated.*",
    r"generated.*data",
]
# Allowed exceptions (legitimate uses)
ALLOWED_EXCEPTIONS = [
    'np.random.choice',  # In training for batch sampling
    'random exploration',  # RL exploration
    'random seed',  # For reproducibility
    'random.choice.*action',  # RL action selection
    'random sample',  # Data sampling (not generation)
    'model.train.*random',  # Training mode randomness
    'test.*real.*data',  # Testing with real data
    'random.*shuffle',  # Data shuffling
    'random.*split',  # Data splitting
]
# File extensions to check
EXTENSIONS = ['.py', '.md', '.txt', '.json', '.yaml', '.yml']


def is_allowed_exception(line: str, pattern: str) -> bool:
    """Return True when a line that hit a forbidden pattern should be ignored.

    The decision is a keyword heuristic on the line text alone: comment-like
    lines, lines carrying one of the marker substrings below, and lines
    matching one of the ALLOWED_EXCEPTIONS regular expressions are exempt.

    Args:
        line: The raw source line that matched a forbidden pattern.
        pattern: The forbidden pattern that matched. It is accepted for
            interface compatibility but is not consulted.

    Returns:
        True if the line is exempt, False if it should be reported.
    """
    line_lower = line.lower()
    line_stripped = line.strip()

    # Skip comments and documentation
    if line_stripped.startswith(('#', '*', '//')):
        return True

    # Plain substrings looked up in the lower-cased line. Every entry must be
    # non-empty: an empty string is a substring of every line and would make
    # this function exempt everything, so no violation could ever be reported.
    substring_markers = (
        # Markdown documentation / report formatting
        'code:', '```', 'line ', '📁',
        # Policy documentation (mentions of forbidden things in policy docs)
        'policy', 'forbidden', 'not allowed', 'never use', 'zero synthetic',
        # Error messages and logging about synthetic data
        'logger.', 'print(', 'error(', 'warning(',
        # Variable names and string literals mentioning synthetic data
        '_synthetic_', 'allow_synthetic', 'no synthetic',
        # Function/method definitions and real-data wording
        'def ', 'class ', 'from real', 'real market',
        # Legitimate RL exploration
        'exploration', 'epsilon', 'action selection', 'random exploration',
        # Legitimate training randomness
        'shuffle', 'split',
        # Reproducibility
        'seed',
        # Legitimate data operations (not generation)
        'test_data =', 'latest_data =', 'test_dataset =',
    )
    if any(marker in line_lower for marker in substring_markers):
        return True

    # These two markers contain '.*' and only make sense as regular
    # expressions; a substring test would never find them in real code.
    regex_markers = ('batch.*sample', 'randint.*start')
    if any(re.search(marker, line_lower) for marker in regex_markers):
        return True

    # Lines that mention the verification script by file name
    if 'verify_no_synthetic_data.py' in line:
        return True

    # Check other allowed patterns (regular expressions, lower-cased line)
    return any(re.search(exception, line_lower) for exception in ALLOWED_EXCEPTIONS)
def scan_file(file_path: Path) -> List[Tuple[int, str, str]]:
    """Collect forbidden-pattern hits from a single file.

    The file is read as UTF-8 with undecodable bytes dropped. Every line is
    tested against every entry of FORBIDDEN_PATTERNS (case-insensitively), and
    a hit is kept unless is_allowed_exception() exempts the line.

    Args:
        file_path: File to read.

    Returns:
        A list of (1-based line number, matching pattern, stripped line text)
        tuples. Any exception raised while reading or matching is printed and
        whatever was collected up to that point is returned.
    """
    hits: List[Tuple[int, str, str]] = []
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.readlines()
        for number, text in enumerate(content, start=1):
            for forbidden in FORBIDDEN_PATTERNS:
                if not re.search(forbidden, text, re.IGNORECASE):
                    continue
                # A match only counts when the line is not an allowed exception.
                if is_allowed_exception(text, forbidden):
                    continue
                hits.append((number, forbidden, text.strip()))
    except Exception as e:
        print(f"⚠️ Error scanning {file_path}: {e}")
    return hits
def scan_codebase(root_path: Path) -> Dict[str, List[Tuple[int, str, str]]]:
    """Scan every relevant file under *root_path* for synthetic-data hits.

    Directories named in ``skip_dirs`` are pruned from the walk, only files
    whose suffix is listed in EXTENSIONS are read, and this script's own
    source file is left out.

    Args:
        root_path: Directory to walk recursively.

    Returns:
        A mapping from each offending file's path (relative to *root_path*,
        as a string) to the list of violations scan_file() returned for it.
        Files without violations do not appear in the mapping.
    """
    violations: Dict[str, List[Tuple[int, str, str]]] = {}
    # Skip certain directories
    skip_dirs = {'.git', '__pycache__', 'node_modules', '.vscode', 'cache', 'logs', 'runs'}
    # This script's pattern table necessarily contains the very strings it
    # searches for, so scanning its own source would always yield hits.
    own_path = Path(__file__).resolve()

    for root, dirs, files in os.walk(root_path):
        # Assigning into dirs[:] prunes the walk in place, so os.walk never
        # descends into the skipped directories.
        dirs[:] = [d for d in dirs if d not in skip_dirs]
        for file in files:
            file_path = Path(root) / file
            # Check only relevant file types
            if file_path.suffix not in EXTENSIONS:
                continue
            if file_path.resolve() == own_path:
                continue
            file_violations = scan_file(file_path)
            if file_violations:
                relative_path = file_path.relative_to(root_path)
                violations[str(relative_path)] = file_violations
    return violations
def main():
    """Scan the directory containing this script and print a report.

    Returns:
        0 when scan_codebase() reports no violations, 1 otherwise. The value
        is meant to be used as the process exit status.
    """
    divider = "=" * 80
    print("🔍 SCANNING CODEBASE FOR SYNTHETIC DATA VIOLATIONS...")
    print(divider)

    # The project root is taken to be the directory this script lives in.
    violations = scan_codebase(Path(__file__).parent)

    if violations:
        # Failure path: list every hit grouped by file, then the grand total.
        print(f"❌ FOUND {len(violations)} FILES WITH POTENTIAL SYNTHETIC DATA:")
        print(divider)
        for file_path, file_violations in violations.items():
            print(f"\n📁 {file_path}:")
            for line_num, pattern, line in file_violations:
                print(f" Line {line_num}: {pattern}")
                # Only the first 100 characters of the offending line are shown.
                print(f" Code: {line[:100]}...")
        total_violations = sum(len(found) for found in violations.values())
        print(divider)
        print(f"❌ TOTAL VIOLATIONS: {total_violations}")
        print("🚨 CRITICAL: Synthetic data detected - must be removed!")
        print("🎯 Only 100% real market data is allowed")
        return 1

    print("✅ SUCCESS: NO SYNTHETIC DATA FOUND!")
    print("🎯 100% REAL MARKET DATA COMPLIANCE VERIFIED")
    print("🚫 Zero synthetic, mock, dummy, or generated data")
    print(divider)
    return 0
if __name__ == "__main__":
    # Hand main()'s return value (0 = clean, 1 = violations found) to the
    # shell as the process exit status.
    sys.exit(main())