"""
Feature Extraction Module
Extracts behavioral patterns and features from user interactions
"""
import json
import time
import hashlib
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from flask import Request
import logging

logger = logging.getLogger(__name__)

class FeatureExtractor:
    """Extracts features from user interactions for ML training"""
    
    def __init__(self):
        """Create an extractor whose per-instance state starts out empty."""
        # Per-client-IP timing records; filled in and read by
        # _extract_timing_features.
        self.timing_data = dict()
        # Empty cache dictionary; nothing in this class reads or writes it yet.
        self.feature_cache = dict()
        
    def extract_request_features(self, request: Request, client_ip: str, session_data: Dict = None) -> Dict[str, Any]:
        """Build the combined feature dictionary for a single request.

        Runs every feature group extractor in a fixed order (basic, timing,
        headers, user agent, behavioral, network) and merges their results
        into one dictionary; if two groups emit the same key, the later
        group wins.

        Args:
            request: The incoming request object.
            client_ip: Address of the client, as a string.
            session_data: Optional dictionary of client-side session
                measurements; may be None.

        Returns:
            The merged feature dictionary, or an empty dictionary when any
            extractor raises (the failure is logged, not propagated).
        """
        try:
            # Evaluated top to bottom, so the timing bookkeeping for
            # client_ip is updated before the header-based groups run.
            feature_groups = (
                self._extract_basic_features(request, client_ip),
                self._extract_timing_features(client_ip, session_data),
                self._extract_header_features(request),
                self._extract_user_agent_features(request),
                self._extract_behavioral_features(session_data),
                self._extract_network_features(request, client_ip),
            )

            merged = {}
            for group in feature_groups:
                merged.update(group)
        except Exception as e:
            # Best effort: a broken request must not take the caller down.
            logger.error(f"Feature extraction failed: {e}")
            return {}

        return merged
    
    def _extract_basic_features(self, request: Request, client_ip: str) -> Dict[str, Any]:
        """Collect the request-level basics.

        Args:
            request: The incoming request object; its method, referrer and
                content_length attributes are read.
            client_ip: Address of the client, as a string.

        Returns:
            A dictionary with the HTTP method, whether a referrer is set,
            the content length (0 when missing), the first eight hex digits
            of the MD5 digest of client_ip, and the current wall-clock time.
        """
        basics = {}
        basics['request_method'] = request.method
        basics['has_referrer'] = True if request.referrer else False

        body_size = request.content_length
        basics['content_length'] = body_size if body_size else 0

        # Only a short prefix of the digest is kept as the IP token.
        ip_digest = hashlib.md5(client_ip.encode()).hexdigest()
        basics['ip_hash'] = ip_digest[:8]

        basics['timestamp'] = time.time()
        return basics
    
    def _extract_timing_features(self, client_ip: str, session_data: Dict = None) -> Dict[str, Any]:
        """Update the timing record for client_ip and derive timing features.

        Side effect: self.timing_data[client_ip] is created on first sight
        and updated on every call (request counter, last-request time and
        the list of request times from the last ten minutes).

        Args:
            client_ip: Key under which the timing record is stored.
            session_data: Optional dictionary of session measurements.

        Returns:
            A dictionary with elapsed-time, request-count and mean-interval
            features, plus session_duration. The page_load_time,
            interaction_delay and form_fill_time keys are present only
            when session_data is non-empty.
        """
        now = time.time()

        try:
            record = self.timing_data[client_ip]
        except KeyError:
            # First request from this address: start a fresh record.
            record = self.timing_data[client_ip] = {
                'first_seen': now,
                'last_request': now,
                'request_count': 0,
                'request_times': []
            }

        record['request_count'] += 1
        record['request_times'].append(now)

        # Sliding window: drop everything older than 600 seconds.
        window_start = now - 600
        recent = [stamp for stamp in record['request_times'] if stamp > window_start]
        record['request_times'] = recent

        # Both deltas use the values stored before this request is recorded
        # as the latest one.
        since_first = now - record['first_seen']
        since_last = now - record['last_request']

        # Mean gap between consecutive requests inside the window; there is
        # at least one gap exactly when the window holds two or more requests.
        gaps = [later - earlier for earlier, later in zip(recent, recent[1:])]
        mean_gap = np.mean(gaps) if gaps else 0

        record['last_request'] = now

        if session_data:
            session_duration = session_data.get('session_duration', 0)
        else:
            session_duration = 0

        features = {
            'time_since_first_request': since_first,
            'time_since_last_request': since_last,
            'total_requests': record['request_count'],
            'recent_requests': len(recent),
            'avg_request_interval': mean_gap,
            'session_duration': session_duration
        }

        if not session_data:
            return features

        # Client-reported timings, each defaulting to 0 when absent.
        for key in ('page_load_time', 'interaction_delay', 'form_fill_time'):
            features[key] = session_data.get(key, 0)

        return features
    
    def _extract_header_features(self, request: Request) -> Dict[str, Any]:
        """Summarise which HTTP headers a request carries.

        Header names are matched case-insensitively. HTTP defines field
        names as case-insensitive, and WSGI stacks such as Werkzeug hand
        names back title-cased (for example 'Dnt'), so an exact-case lookup
        for 'DNT' on a plain dictionary copy would never match.

        Args:
            request: The incoming request object; only its headers
                attribute is read.

        Returns:
            A dictionary with the header count, presence flags for a fixed
            set of common headers, the fraction of those common headers
            that are present, and User-Agent length/presence. When an
            Accept-Language header is present, language_count,
            has_quality_values and primary_language are added as well.
        """
        # Lower-case every name once so all lookups below are
        # case-insensitive regardless of how the server spelled them.
        headers = {name.lower(): value for name, value in dict(request.headers).items()}

        common_headers = ['Accept', 'Accept-Language', 'Accept-Encoding',
                          'Connection', 'Cache-Control', 'DNT']

        def present(header_name: str) -> bool:
            """Return True when the request carries header_name (any case)."""
            return header_name.lower() in headers

        present_common = sum(1 for header_name in common_headers if present(header_name))

        features = {
            'header_count': len(headers),
            'has_accept_language': present('Accept-Language'),
            'has_accept_encoding': present('Accept-Encoding'),
            'has_connection': present('Connection'),
            'has_cache_control': present('Cache-Control'),
            'has_dnt': present('DNT'),
            'common_headers_ratio': present_common / len(common_headers)
        }

        # Accept-Language analysis: number of comma-separated entries,
        # whether any q= weight is given, and the first listed language tag.
        if present('Accept-Language'):
            lang_header = headers['accept-language']
            languages = lang_header.split(',')
            features.update({
                'language_count': len(languages),
                'has_quality_values': 'q=' in lang_header,
                'primary_language': languages[0].split(';')[0].strip()
            })

        # User-Agent presence and length
        user_agent = headers.get('user-agent', '')
        features.update({
            'user_agent_length': len(user_agent),
            'has_user_agent': bool(user_agent)
        })

        return features
    
    def _extract_user_agent_features(self, request: Request) -> Dict[str, Any]:
        """Score the User-Agent header against simple keyword lists.

        The header is lower-cased before matching, so all checks are
        case-insensitive; a missing header is treated as an empty string.

        Args:
            request: The incoming request object; only its headers
                attribute is read.

        Returns:
            A dictionary with the user-agent length, bot-keyword flag and
            count, a known-browser flag, the character entropy, the number
            of distinct whitespace-separated tokens, and a coarse
            browser_type ('chrome', 'firefox', 'safari' or 'other').
        """
        ua = request.headers.get('User-Agent', '').lower()

        bot_markers = ('bot', 'crawler', 'spider', 'scraper', 'automated',
                       'curl', 'wget', 'python', 'java')
        browser_markers = ('chrome', 'firefox', 'safari', 'edge', 'opera')

        matched_bot_markers = [marker for marker in bot_markers if marker in ua]

        looks_like_browser = False
        for marker in browser_markers:
            if marker in ua:
                looks_like_browser = True
                break

        distinct_tokens = len(set(ua.split())) if ua else 0

        features = {
            'ua_length': len(ua),
            'has_bot_keywords': len(matched_bot_markers) > 0,
            'bot_keyword_count': len(matched_bot_markers),
            'has_legitimate_browser': looks_like_browser,
            'ua_entropy': self._calculate_entropy(ua),
            'ua_complexity': distinct_tokens
        }

        # First match wins, in this order: a user agent that mentions both
        # "chrome" and "safari" is classified as chrome.
        features['browser_type'] = next(
            (family for family in ('chrome', 'firefox', 'safari') if family in ua),
            'other',
        )

        return features
    
    def _extract_behavioral_features(self, session_data: Dict = None) -> Dict[str, Any]:
        """Turn client-side interaction counters into behavioral features.

        Args:
            session_data: Optional dictionary of session measurements.
                Missing keys default to 0.

        Returns:
            Without session data (None or empty): the four interaction
            counters set to 0 and has_human_behavior set to False.
            Otherwise: the four counters, the screen/colour/timezone
            values, the counter total, a has_human_behavior flag (total
            above 5) and interaction_diversity, the number of counters
            that are greater than zero.
        """
        counter_keys = ('mouse_movements', 'keyboard_events', 'scroll_events', 'click_events')

        if not session_data:
            # No client telemetry at all: report zero activity.
            idle = {key: 0 for key in counter_keys}
            idle['has_human_behavior'] = False
            return idle

        device_keys = ('screen_width', 'screen_height', 'color_depth', 'timezone_offset')

        features = {key: session_data.get(key, 0) for key in counter_keys + device_keys}

        mouse, keyboard, scroll, clicks = (features[key] for key in counter_keys)
        interaction_total = mouse + keyboard + scroll + clicks

        # Count how many different kinds of interaction actually occurred.
        active_kinds = 0
        for amount in (mouse, keyboard, scroll, clicks):
            if amount > 0:
                active_kinds += 1

        features['total_interactions'] = interaction_total
        features['has_human_behavior'] = interaction_total > 5
        features['interaction_diversity'] = active_kinds

        return features
    
    def _extract_network_features(self, request: Request, client_ip: str) -> Dict[str, Any]:
        """Derive address and proxy-related features.

        Args:
            request: The incoming request object; only its headers
                attribute is read.
            client_ip: Address of the client, as a string.

        Returns:
            A dictionary with the private-address flag, the IP version
            (6 when the address contains a colon, otherwise 4) and presence
            flags for X-Forwarded-For and X-Real-IP. When X-Forwarded-For
            is non-empty, proxy_chain_length and has_proxy_chain are added.
        """
        request_headers = request.headers

        if ':' in client_ip:
            version = 6
        else:
            version = 4

        features = {
            'is_private_ip': self._is_private_ip(client_ip),
            'ip_version': version,
            'has_x_forwarded_for': 'X-Forwarded-For' in request_headers,
            'has_x_real_ip': 'X-Real-IP' in request_headers
        }

        forwarded_for = request_headers.get('X-Forwarded-For', '')
        if not forwarded_for:
            return features

        # The header is a comma-separated list, so the number of entries is
        # one more than the number of commas.
        hop_count = forwarded_for.count(',') + 1
        features['proxy_chain_length'] = hop_count
        features['has_proxy_chain'] = hop_count > 1

        return features
    
    def _calculate_entropy(self, text: str) -> float:
        """Return the Shannon entropy, in bits per character, of text.

        Args:
            text: The string to analyse.

        Returns:
            0 for empty input; otherwise the sum over distinct characters
            of -p * log2(p), where p is the character's relative frequency.
        """
        if not text:
            return 0

        # Tally occurrences of each distinct character.
        tally = {}
        for symbol in text:
            if symbol in tally:
                tally[symbol] += 1
            else:
                tally[symbol] = 1

        total_chars = len(text)
        bits = 0
        for occurrences in tally.values():
            share = occurrences / total_chars
            bits = bits - share * np.log2(share)

        return bits
    
    def _is_private_ip(self, ip: str) -> bool:
        """Check whether ip is a private or loopback address.

        The address is parsed with the standard ipaddress module rather
        than by string prefix, so upper-case IPv6 text is recognised,
        out-of-range IPv4 octets are rejected, and IPv6 addresses that
        merely start with the characters "::1" or "fc" are not
        misclassified. IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) are
        judged by the IPv4 address they embed.

        Args:
            ip: Textual IPv4 or IPv6 address; surrounding whitespace is
                ignored.

        Returns:
            True when the address lies in 10.0.0.0/8, 172.16.0.0/12,
            192.168.0.0/16, 127.0.0.0/8, ::1 or fc00::/7. False for any
            other address and for text that is not a valid IP address.
        """
        # Imported locally so this method is self-contained; the module is
        # part of the standard library.
        import ipaddress

        try:
            address = ipaddress.ip_address(ip.strip())
        except ValueError:
            return False

        # Unwrap ::ffff:a.b.c.d so the embedded IPv4 address is what gets
        # classified.
        embedded_v4 = getattr(address, 'ipv4_mapped', None)
        if embedded_v4 is not None:
            address = embedded_v4

        if address.version == 4:
            candidate_ranges = ('10.0.0.0/8', '172.16.0.0/12',
                                '192.168.0.0/16', '127.0.0.0/8')
        else:
            candidate_ranges = ('::1/128', 'fc00::/7')

        return any(address in ipaddress.ip_network(block) for block in candidate_ranges)
    
    def get_feature_vector(self, features: Dict[str, Any]) -> np.ndarray:
        """Convert a feature dictionary into a fixed-order numeric vector.

        Args:
            features: Feature dictionary, for example the result of
                extract_request_features. Keys not listed in the fixed
                feature order are ignored; listed keys that are missing
                contribute 0.

        Returns:
            A one-dimensional float array with one entry per name in the
            fixed feature order. Booleans become 0/1, None becomes 0, and
            strings are mapped to a bucket in the range 0-9999.
        """
        # Position in this tuple is the position in the output vector, so
        # the order must stay stable for any consumer of the vectors.
        feature_order = (
            'request_method', 'has_referrer', 'content_length', 'timestamp',
            'time_since_first_request', 'time_since_last_request', 'total_requests',
            'recent_requests', 'avg_request_interval', 'session_duration',
            'page_load_time', 'interaction_delay', 'form_fill_time',
            'header_count', 'has_accept_language', 'has_accept_encoding',
            'has_connection', 'has_cache_control', 'has_dnt', 'common_headers_ratio',
            'language_count', 'has_quality_values', 'user_agent_length',
            'has_user_agent', 'ua_length', 'has_bot_keywords', 'bot_keyword_count',
            'has_legitimate_browser', 'ua_entropy', 'ua_complexity',
            'mouse_movements', 'keyboard_events', 'scroll_events', 'click_events',
            'screen_width', 'screen_height', 'color_depth', 'timezone_offset',
            'total_interactions', 'has_human_behavior', 'interaction_diversity',
            'is_private_ip', 'ip_version', 'has_x_forwarded_for', 'has_x_real_ip',
            'proxy_chain_length', 'has_proxy_chain',
        )

        numeric_features = []
        for feature_name in feature_order:
            value = features.get(feature_name, 0)

            if isinstance(value, bool):
                value = int(value)
            elif isinstance(value, str):
                # The built-in hash() of a string is salted per interpreter
                # process, so it would give the same text a different value
                # on every run. A cryptographic digest is stable across
                # processes and machines.
                digest = hashlib.sha256(value.encode('utf-8', errors='replace')).digest()
                value = int.from_bytes(digest[:8], 'big') % 10000
            elif value is None:
                value = 0

            numeric_features.append(float(value))

        return np.array(numeric_features)