"""
PVUSA Photovoltaic Simulator
================================================================================
Professional Energy Analytics Platform

Based on PVUSA_SIMULATOR_DEF.ipynb with:
- Arrhenius equation for thermal acceleration (IEC 61215, NREL)
- Hallberg-Peck model for humidity-induced degradation
- HSU soiling model with rain-based cleaning
- Train/Test split validation (2 years training, rest testing)

References:
- Jordan & Kurtz (2013), "Photovoltaic Degradation Rates - An Analytical Review", Prog. Photovolt: Res. Appl.
- Kempe, M. (2006), "Modeling of rates of moisture ingress into photovoltaic modules", Solar Energy Materials and Solar Cells
- Coello & Boyle (2019), "Simple Model for Predicting Time Series Soiling of Photovoltaic Panels", IEEE J. Photovoltaics
- NREL PVDegradationTools: https://github.com/NREL/PVDegradationTools
"""

from urllib.parse import urlparse
import os
import pandas as pd
import numpy as np
import math
import warnings
import io
import base64
import requests

from dateutil.relativedelta import relativedelta
from dataclasses import dataclass
from typing import Dict
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.special import erf
from scipy import stats

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from pyscript import window

# Access JavaScript bridge for standalone web
web_bridge = window.webBridge

warnings.filterwarnings('ignore')


# OS-compatible path builder
def build_path(directory: str, file: str):
    """Build an absolute path to `file` inside `directory`."""
    return os.path.abspath(os.path.join(directory, file))

# ============================================================================
# GLOBAL STATE - MATCHES NOTEBOOK EXACTLY
# ============================================================================

simulation_state = {
    'config': None,
    'weather_df': None,
    'power_df': None,
    'merged_df': None,
    'coefficients': None,
    'predictions_df': None,
    'daily_validation': None,
    'metrics': None,
    'tdr_enabled': True,
    
    # Train/Test split configuration
    'train_years': 2,  # Default: 2 years for training
    'train_df': None,
    'test_df': None,
    'train_split_date': None,
    'train_daily_validation': None,
    'test_daily_validation': None,
    'train_metrics': None,
    'test_metrics': None,
    
    # Week-ahead forecast state
    'forecast_df': None,
    'forecast_predictions_df': None,
    
    'module_params': {
        # THERMAL DEGRADATION (Arrhenius Model) - CONSERVATIVE
        'activation_energy_eV': 0.65,           # Reduced from 0.95
        'T_ref_K': 298.15,
        'first_year_degradation': 0.02,         # 2% year-1 (standard LID)
        'annual_degradation_rate': 0.005,       # 0.5%/year (typical warranty)
        'thermal_weight': 0.15,                 # Reduced from 0.4
        
        # HUMIDITY DEGRADATION (Hallberg-Peck)
        'humidity_activation_energy_eV': 0.6,   # Reduced from 0.8
        'humidity_exponent': 2.0,               # Reduced from 3.0
        'RH_ref': 0.60,
        'humidity_weight': 0.10,                # Reduced from 0.25
        'humidity_degradation_rate': 0.002,     # Reduced from 0.005
        
        # SOILING (HSU Model)
        'tilt_angle': 20.0,
        'PM10_ugm3': 35.0,
        'PM2.5_ugm3': 21.0,
        'v_PM10': 0.0050,
        'v_PM2.5': 0.0006,
        'cleaning_threshold': 5.0,
        'initial_soiling_buildup': 0.02,        
    }
}

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def clean_float(val):
    """Ensure value is JSON-serializable."""
    if val is None:
        return 0.0
    try:
        f = float(val)
        if math.isnan(f) or math.isinf(f):
            return 0.0
        return round(f, 6)
    except (TypeError, ValueError):
        return 0.0
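
# Illustrative behaviour of clean_float (examples only, not executed on import):
#   clean_float(None)          -> 0.0
#   clean_float(float('nan'))  -> 0.0
#   clean_float('3.14159265')  -> 3.141593   (rounded to 6 decimal places)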


def load_plant_config(data: Dict) -> Dict:
    """Load and parse plant configuration from JSON."""
    building = data['building']
    location = building['location']
    
    arrays = []
    for roof in building['roofs']:
        roof_height = roof['roof_environmental_parameters']['height_meters']
        
        for inst in roof['installations']:
            for field in inst['fields']:
                for array in field['arrays']:
                    losses = array.get('system_losses', {})
                    loss_factors = [
                        (1 - losses.get('soiling_loss', 0)),
                        (1 - losses.get('mismatch_loss', 0)),
                        (1 - losses.get('wiring_dc_loss', 0)),
                        (1 - losses.get('wiring_ac_loss', 0)),
                        (1 - losses.get('inverter_loss', 0)),
                        (1 - losses.get('availability_loss', 0)),
                        (1 - losses.get('iam_loss', 0))
                    ]
                    total_loss_factor = float(np.prod(loss_factors))
                    
                    string_params = {}
                    if array['strings']:
                        first_string = array['strings'][0]
                        string_params = {
                            'noct_c': first_string.get('noct_c', 45.0),
                            'temp_coef_pmax_per_c': first_string.get('temp_coef_pmax_per_c', -0.004),
                            'degradation_rate_annual': first_string.get('degradation_rate_annual', 0.005)
                        }
                        age_years = 0.25  # assumed module age at start of data (~3 months)
                        degradation_factor = 1.0 - (string_params['degradation_rate_annual'] * age_years)
                    else:
                        degradation_factor = 1.0
                    
                    arrays.append({
                        'id': array['id'],
                        'name': array['name'],
                        'tilt': array['orientation']['tilt_degrees'],
                        'azimuth': array['orientation']['azimuth_degrees'],
                        'pvusa_coefficients': array['pvusa_coefficients'],
                        'system_losses': losses,
                        'loss_factor': total_loss_factor,
                        'loss_percentage': (1 - total_loss_factor) * 100,
                        'degradation_factor': degradation_factor,
                        'string_params': string_params,
                        'num_strings': len(array['strings']),
                        'num_modules': sum(len(s['modules']) for s in array['strings']),
                        'total_power_w': sum(
                            m['module_specifications']['power_stc_w'] 
                            for s in array['strings'] 
                            for m in s['modules']
                        ),
                        'roof_height': roof_height
                    })
    
    config = {
        'building_id': building['id'],
        'building_name': building['name'],
        'location': location,
        'prediction_period': building['prediction_period'],
        'arrays': arrays,
        'roof_height': building['roofs'][0]['roof_environmental_parameters']['height_meters']
    }
    
    return config


# ============================================================================
# WEATHER DATA FETCHING (Open-Meteo API) - MATCHES NOTEBOOK
# ============================================================================

def fetch_weather_from_api(config: Dict) -> pd.DataFrame:
    """Fetch weather data from Open-Meteo Archive API."""
    location = config['location']
    period = config['prediction_period']
    array = config['arrays'][0]
    
    latitude = location['latitude']
    longitude = location['longitude']
    timezone = location['timezone']
    start_date = period['start_date']
    end_date = period['end_date']
    
    tilt = array['tilt']
    azimuth = array['azimuth']
    roof_height = array['roof_height']
    
    string_params = array.get('string_params', {})
    T_NOCT = string_params.get('noct_c', 45.0)
    beta_stc = string_params.get('temp_coef_pmax_per_c', -0.004)
    
    base_url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'timezone': timezone,
        'start_date': start_date,
        'end_date': end_date,
        'hourly': 'global_tilted_irradiance,shortwave_radiation,temperature_2m,relative_humidity_2m,wind_speed_10m,precipitation',
        'tilt': tilt,
        'azimuth': azimuth
    }
    
    response = requests.get(base_url, params=params, timeout=60)
    response.raise_for_status()

    data = response.json()
    if 'error' in data and data['error']:
        raise ValueError(f"API error: {data.get('reason')}")
    
    hourly = data['hourly']
    df = pd.DataFrame({
        'timestamp': pd.to_datetime(hourly['time'], utc=True),
        'gti': hourly['global_tilted_irradiance'],
        'ghi': hourly['shortwave_radiation'],
        'temperature_2m': hourly['temperature_2m'],
        'rh': hourly['relative_humidity_2m'],
        'wind_speed_10m': hourly['wind_speed_10m'],
        'precipitation_mm': hourly['precipitation']
    })
    
    # Wind speed correction
    height_target = roof_height
    height_ref = 10.0
    roughness_length = 0.3
    
    wind_factor = np.log(height_target / roughness_length) / np.log(height_ref / roughness_length)
    df['wind_speed_corrected'] = df['wind_speed_10m'] * wind_factor
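    # Neutral-stability logarithmic wind profile:
    #   v(h) = v_ref * ln(h / z0) / ln(h_ref / z0)
    # e.g. for a hypothetical 8 m roof with z0 = 0.3 m:
    #   factor = ln(8/0.3) / ln(10/0.3) ~= 3.28 / 3.51 ~= 0.94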
    
    # Cell temperature (Skoplaki Model)
    I_NOCT = 800.0
    T_a_NOCT = 20.0
    T_STC = 25.0
    v_NOCT = 1.0
    tau_alpha = 0.9
    eta_stc = 0.20
    
    h_w_NOCT = 5.7 + 2.8 * v_NOCT
    h_w = 5.7 + 2.8 * df['wind_speed_corrected']
    
    irr_ratio = df['gti'] / I_NOCT
    temp_rise = (T_NOCT - T_a_NOCT) * (h_w_NOCT / h_w)
    efficiency_factor = (eta_stc / tau_alpha) * (1.0 - beta_stc * T_STC)
    
    df['t_cell'] = df['temperature_2m'] + irr_ratio * temp_rise * (1.0 - efficiency_factor)
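    # As implemented (a Skoplaki-style steady-state form):
    #   T_cell = T_air + (G / G_NOCT) * (T_NOCT - T_a,NOCT) * (h_NOCT / h_w)
    #                  * (1 - (eta_STC / tau_alpha) * (1 - beta_STC * T_STC))
    # Worked example at NOCT-like conditions (G = 800 W/m2, corrected wind
    # 1 m/s, defaults above): temp rise ~= 25 * (1 - 0.244) ~= 18.9 C above
    # ambient, slightly below the nominal +25 C because the efficiency term
    # removes the energy converted to electricity.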
    
    df['hour_utc'] = df['timestamp'].dt.floor('h')
    
    web_bridge.writeTmpFile("lleida_weather_fetched.csv", df.to_csv())
    return df


def fetch_forecast_weather(config: Dict, forecast_days: int = 15) -> pd.DataFrame:
    """
    Fetch weather forecast from Open-Meteo Forecast API for week-ahead prediction.
    
    Uses the standard Open-Meteo forecast API (not archive) for future predictions.
    """
    location = config['location']
    array = config['arrays'][0]
    
    latitude = location['latitude']
    longitude = location['longitude']
    timezone = location['timezone']
    
    tilt = array['tilt']
    azimuth = array['azimuth']
    roof_height = array['roof_height']
    
    string_params = array.get('string_params', {})
    T_NOCT = string_params.get('noct_c', 45.0)
    beta_stc = string_params.get('temp_coef_pmax_per_c', -0.004)
    
    # Use Open-Meteo Forecast API
    base_url = "https://api.open-meteo.com/v1/forecast"
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'timezone': timezone,
        'forecast_days': forecast_days,
        'hourly': 'global_tilted_irradiance,shortwave_radiation,temperature_2m,relative_humidity_2m,wind_speed_10m,precipitation',
        'tilt': tilt,
        'azimuth': azimuth
    }
    
    response = requests.get(base_url, params=params, timeout=60)
    response.raise_for_status()
    
    data = response.json()
    if 'error' in data and data['error']:
        raise ValueError(f"API error: {data.get('reason')}")
    
    hourly = data['hourly']
    df = pd.DataFrame({
        'timestamp': pd.to_datetime(hourly['time'], utc=True),
        'gti': hourly['global_tilted_irradiance'],
        'ghi': hourly['shortwave_radiation'],
        'temperature_2m': hourly['temperature_2m'],
        'rh': hourly['relative_humidity_2m'],
        'wind_speed_10m': hourly['wind_speed_10m'],
        'precipitation_mm': hourly['precipitation']
    })
    
    # Wind speed correction (same as historical)
    height_target = roof_height
    height_ref = 10.0
    roughness_length = 0.3
    
    wind_factor = np.log(height_target / roughness_length) / np.log(height_ref / roughness_length)
    df['wind_speed_corrected'] = df['wind_speed_10m'] * wind_factor
    
    # Cell temperature (Skoplaki Model)
    I_NOCT = 800.0
    T_a_NOCT = 20.0
    T_STC = 25.0
    v_NOCT = 1.0
    tau_alpha = 0.9
    eta_stc = 0.20
    
    h_w_NOCT = 5.7 + 2.8 * v_NOCT
    h_w = 5.7 + 2.8 * df['wind_speed_corrected']
    
    irr_ratio = df['gti'].fillna(0) / I_NOCT
    temp_rise = (T_NOCT - T_a_NOCT) * (h_w_NOCT / h_w)
    efficiency_factor = (eta_stc / tau_alpha) * (1.0 - beta_stc * T_STC)
    
    df['t_cell'] = df['temperature_2m'] + irr_ratio * temp_rise * (1.0 - efficiency_factor)
    df['hour_utc'] = df['timestamp'].dt.floor('h')
    
    # Fill NaN values (nighttime GTI is often NaN)
    df['gti'] = df['gti'].fillna(0)
    df['ghi'] = df['ghi'].fillna(0)
    df['t_cell'] = df['t_cell'].fillna(df['temperature_2m'])
    
    return df


def load_and_align_data(power_df: pd.DataFrame, config: Dict):
    """Fetch weather from API and load power data, then merge."""
    weather_df = fetch_weather_from_api(config)
    
    power_df['timestamp'] = pd.to_datetime(power_df['timestamp'], utc=True)
    power_df['hour_utc'] = power_df['timestamp'].dt.floor('h')
    
    power_hourly = power_df.groupby('hour_utc').agg({
        'pvPower': 'mean'
    }).reset_index()
    power_hourly = power_hourly.rename(columns={'pvPower': 'real_power_kw'})
    power_hourly['real_power_w'] = power_hourly['real_power_kw'] * 1000.0
    
    merged = pd.merge(
        weather_df,
        power_hourly[['hour_utc', 'real_power_kw', 'real_power_w']],
        on='hour_utc',
        how='inner'
    )
    
    merged = merged[merged['gti'] > 10].copy()
    merged = merged[merged['real_power_w'] > 0].copy()
    merged = merged.sort_values('hour_utc').reset_index(drop=True)

    return weather_df, power_df, merged


# ============================================================================
# TRANSITORY DETERIORATION RATE (TDR) - FULL MODEL FROM NOTEBOOK
# ============================================================================

def calculate_tdr_factors(df: pd.DataFrame, module_params: Dict) -> pd.DataFrame:
    """
    Calculate Transitory Deterioration Rate using the Arrhenius-based 
    cumulative degradation model with humidity and soiling effects.
    
    Scientific basis:
    - Arrhenius equation for thermal acceleration (IEC 61215, NREL)
    - Hallberg-Peck model for humidity-induced degradation (NIST/NREL)
    - HSU soiling model with rain-based cleaning (Coello & Boyle 2019)
    - First-year soiling buildup (initial contamination period)
    - Linear annual degradation from manufacturer warranty (REC: 0.5%/year)
    - First-year LID from warranty (REC: 2%)
    """
    df = df.copy()
    
    # Module parameters
    Ea = module_params.get('activation_energy_eV', 0.65)
    k_B = 8.617e-5  # Boltzmann constant in eV/K
    T_ref = module_params.get('T_ref_K', 298.15)
    
    # Degradation rates from warranty
    first_year_degradation = module_params.get('first_year_degradation', 0.02)
    annual_degradation_rate = module_params.get('annual_degradation_rate', 0.005)
    thermal_weight = module_params.get('thermal_weight', 0.15)
    
    # Convert cell temperature to Kelvin
    T_cell_K = df['t_cell'] + 273.15
    
    # Calculate Arrhenius Acceleration Factor for each timestep
    df['acceleration_factor'] = np.exp(
        (Ea / k_B) * (1/T_ref - 1/T_cell_K)
    )
    
    # Clip extreme values (physical limits)
    df['acceleration_factor'] = df['acceleration_factor'].clip(lower=0.5, upper=5.0)
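    # Worked example with the defaults above: at T_cell = 50 C,
    #   AF = exp((0.65 / 8.617e-5) * (1/298.15 - 1/323.15)) ~= exp(1.96) ~= 7.1,
    # which the physical-limit clip caps at 5.0.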
    
    # Calculate cumulative thermal dose (weighted hours)
    df['thermal_dose'] = df['acceleration_factor'].cumsum()
    
    # Normalize thermal dose
    total_hours = len(df)
    hours_per_year = 8760.0
    
    # Calendar-based year fraction (assumes one row per hour; if the frame was
    # pre-filtered, e.g. to daylight hours only, this under-counts elapsed time)
    calendar_year_fraction = np.arange(total_hours) / hours_per_year
    
    # Thermal-accelerated year fraction
    thermal_year_fraction = df['thermal_dose'].values / hours_per_year
    
    # Blend calendar time with thermal-accelerated time
    blended_year_fraction = (
        (1 - thermal_weight) * calendar_year_fraction + 
        thermal_weight * thermal_year_fraction
    )
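    # Worked example: with thermal_weight = 0.15 and a mean acceleration
    # factor of 1.5, effective time advances at 0.85 + 0.15*1.5 = 1.075x
    # calendar time, i.e. ~7.5% faster.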
    
    # Apply degradation model:
    # Year 1: linear ramp to first_year_degradation (LID)
    # Years 2+: linear at annual_degradation_rate
    thermal_degradation = np.where(
        blended_year_fraction < 1.0,
        first_year_degradation * blended_year_fraction,
        first_year_degradation + annual_degradation_rate * (blended_year_fraction - 1.0)
    )
    
    # Thermal TDR factor = 1 - cumulative_degradation
    thermal_tdr_factor = 1.0 - thermal_degradation
    thermal_tdr_factor = np.clip(thermal_tdr_factor, 0.80, 1.0)
    
    # Store intermediate values for analysis
    df['cumulative_degradation_pct'] = thermal_degradation * 100
    df['effective_year'] = blended_year_fraction
    
    # ========================================================================
    # HUMIDITY-INDUCED DEGRADATION (Hallberg-Peck Model)
    # ========================================================================
    
    # Humidity parameters
    Ea_humidity = module_params.get('humidity_activation_energy_eV', 0.6)
    RH_exponent = module_params.get('humidity_exponent', 2.0)
    RH_ref = module_params.get('RH_ref', 0.60)
    humidity_weight = module_params.get('humidity_weight', 0.10)
    base_humidity_degradation_rate = module_params.get('humidity_degradation_rate', 0.002)
    
    # Get relative humidity from dataframe (0-100 scale)
    if 'rh' in df.columns:
        RH_fraction = df['rh'].values / 100.0
    else:
        RH_fraction = 0.70  # Default for Lleida
    
    # Calculate humidity acceleration factor
    humidity_acceleration = (
        (RH_fraction / RH_ref) ** RH_exponent * 
        np.exp((Ea_humidity / k_B) * (1/T_ref - 1/T_cell_K.values))
    )
    humidity_acceleration = np.clip(humidity_acceleration, 0.5, 10.0)
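    # Hallberg-Peck acceleration relative to reference conditions:
    #   AF_RH = (RH / RH_ref)^n * exp((Ea_h / k_B) * (1/T_ref - 1/T_cell))
    # Worked example at RH = 80%, T_cell = 40 C with the defaults above:
    #   (0.8/0.6)^2 * exp(1.12) ~= 1.78 * 3.06 ~= 5.4 (within the clip range).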
    
    # Cumulative humidity dose
    humidity_dose = np.cumsum(humidity_acceleration)
    humidity_year_fraction = humidity_dose / hours_per_year
    
    # Humidity degradation (linear with accelerated time)
    humidity_degradation = base_humidity_degradation_rate * humidity_year_fraction
    humidity_degradation = np.clip(humidity_degradation, 0.0, 0.20)
    
    # ========================================================================
    # SOILING LOSSES (HSU Model - Coello & Boyle 2019)
    # ========================================================================
    
    # Soiling parameters
    tilt_angle_deg = module_params.get('tilt_angle', 20.0)
    PM10_concentration = module_params.get('PM10_ugm3', 35.0)
    PM25_concentration = module_params.get('PM2.5_ugm3', 21.0)
    v_PM10 = module_params.get('v_PM10', 0.0050)
    v_PM25 = module_params.get('v_PM2.5', 0.0006)
    cleaning_threshold_mm = module_params.get('cleaning_threshold', 5.0)
    
    # Convert tilt to radians
    tilt_rad = np.radians(tilt_angle_deg)
    
    # Get precipitation data
    if 'precipitation_mm' in df.columns:
        rainfall = df['precipitation_mm'].values
    else:
        rainfall = np.zeros(len(df))
    
    # Calculate hourly mass accumulation (g/m2/h)
    delta_t_hours = 1.0
    hourly_accumulation = (
        (v_PM10 * PM10_concentration + v_PM25 * PM25_concentration) * 
        delta_t_hours * 3600 * np.cos(tilt_rad) * 1e-6
    )
    
    # Calculate cumulative mass with rain-based cleaning
    omega = np.zeros(len(df))
    for i in range(len(df)):
        if i == 0:
            omega[i] = hourly_accumulation
        else:
            if rainfall[i] >= cleaning_threshold_mm:
                omega[i] = hourly_accumulation  # Reset after rain cleaning
            else:
                omega[i] = omega[i-1] + hourly_accumulation
    
    omega = np.clip(omega, 0.0, 10.0)
    
    # Calculate soiling ratio using HSU model
    soiling_ratio = 1.0 - 0.3437 * erf(0.17 * omega**0.8473)
    soiling_ratio = np.clip(soiling_ratio, 0.6563, 1.0)
    soiling_loss_pct = (1.0 - soiling_ratio) * 100
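    # HSU soiling ratio (coefficients as in pvlib's HSU implementation,
    # where 0.3437 = 34.37/100):
    #   SR = 1 - 0.3437 * erf(0.17 * omega^0.8473)
    # Worked example: omega = 2 g/m2 -> SR ~= 1 - 0.3437 * erf(0.31) ~= 0.885,
    # i.e. roughly an 11.5% soiling loss before any rain cleaning.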
    
    # ========================================================================
    # FIRST-YEAR SOILING BUILDUP CORRECTION
    # ========================================================================
    
    initial_soiling_buildup = module_params.get('initial_soiling_buildup', 0.02)
    
    if initial_soiling_buildup > 0:
        # Calculate first-year soiling penalty
        first_year_soiling_penalty = np.where(
            blended_year_fraction < 1.0,
            initial_soiling_buildup * blended_year_fraction,
            initial_soiling_buildup
        )
        
        # Apply penalty to soiling ratio
        soiling_ratio = soiling_ratio * (1.0 - first_year_soiling_penalty)
        soiling_ratio = np.clip(soiling_ratio, 0.6563, 1.0)
        
        # Update soiling loss percentage
        soiling_loss_pct = (1.0 - soiling_ratio) * 100
        
    # ========================================================================
    # COMBINED TDR FACTOR (ALL THREE MECHANISMS)
    # ========================================================================
    
    humidity_factor = 1.0 - (humidity_degradation * humidity_weight)
    
    # Combined TDR factor (multiplicative combination)
    combined_tdr = thermal_tdr_factor * humidity_factor * soiling_ratio
    combined_tdr = np.clip(combined_tdr, 0.50, 1.0)
    
    # Store as DataFrame column
    df['tdr_factor'] = combined_tdr
    
    # Update cumulative degradation to reflect combined losses
    df['cumulative_degradation_pct'] = (1.0 - df['tdr_factor']) * 100
    
    return df


def calculate_forecast_tdr(df: pd.DataFrame, module_params: Dict, 
                           historical_df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Calculate TDR for forecast data, continuing from historical degradation.
    
    For forecasts, we use the final TDR value from historical data
    as a baseline, plus instantaneous soiling effects from forecast weather.
    """
    df = df.copy()
    
    # Get baseline TDR from end of historical period
    if historical_df is not None and 'tdr_factor' in historical_df.columns:
        baseline_tdr = historical_df['tdr_factor'].iloc[-1]
        effective_years = historical_df['effective_year'].iloc[-1]
        # Also get the min TDR from historical to set a reasonable floor
        min_historical_tdr = historical_df['tdr_factor'].min()
    else:
        baseline_tdr = 0.97  # Default assumption: ~3% degradation
        effective_years = 2.0
        min_historical_tdr = 0.85
    
    # For short-term forecasts, assume TDR is approximately constant
    # (degradation over 15 days is negligible compared to years of operation)
    # Apply only minor soiling variations based on forecast precipitation
    
    tilt_angle_deg = module_params.get('tilt_angle', 20.0)
    PM10_concentration = module_params.get('PM10_ugm3', 35.0)
    PM25_concentration = module_params.get('PM2.5_ugm3', 21.0)
    v_PM10 = module_params.get('v_PM10', 0.0050)
    v_PM25 = module_params.get('v_PM2.5', 0.0006)
    cleaning_threshold_mm = module_params.get('cleaning_threshold', 5.0)
    
    tilt_rad = np.radians(tilt_angle_deg)
    
    # Get precipitation from forecast
    if 'precipitation_mm' in df.columns:
        rainfall = df['precipitation_mm'].fillna(0).values
    else:
        rainfall = np.zeros(len(df))
    
    # Calculate soiling for forecast period (small variations around baseline)
    hourly_accumulation = (
        (v_PM10 * PM10_concentration + v_PM25 * PM25_concentration) * 
        1.0 * 3600 * np.cos(tilt_rad) * 1e-6
    )
    
    omega = np.zeros(len(df))
    omega[0] = 0.3  # Small existing soiling at start
    
    for i in range(1, len(df)):
        if rainfall[i] >= cleaning_threshold_mm:
            omega[i] = hourly_accumulation  # Rain cleans panels
        else:
            omega[i] = omega[i-1] + hourly_accumulation
    
    omega = np.clip(omega, 0.0, 5.0)  # Cap soiling accumulation
    
    # HSU soiling ratio - small variation around 1.0
    soiling_variation = 1.0 - 0.3437 * erf(0.17 * omega**0.8473)
    soiling_variation = np.clip(soiling_variation, 0.97, 1.0)  # Max 3% soiling effect
    
    # Combined TDR for forecast = baseline * small soiling variation
    # This keeps TDR close to the historical baseline with minor weather-based variations
    df['tdr_factor'] = baseline_tdr * soiling_variation
    
    # Clip to reasonable bounds based on historical data (not arbitrary 0.85)
    df['tdr_factor'] = df['tdr_factor'].clip(lower=min_historical_tdr * 0.99, upper=1.0)
    
    # Fill in other columns for consistency
    df['acceleration_factor'] = 1.0
    df['effective_year'] = effective_years
    df['cumulative_degradation_pct'] = (1.0 - df['tdr_factor']) * 100
    
    return df


# ============================================================================
# PVUSA CALIBRATION - MATCHES NOTEBOOK
# ============================================================================

def hampel_mask(x, window=7, n_sigmas=3.0):
    """Hampel filter to detect outliers in time series."""
    x = np.asarray(x, float)
    n = len(x)
    k = window // 2
    mask = np.ones(n, dtype=bool)
    for i in range(n):
        lo = max(0, i - k)
        hi = min(n, i + k + 1)
        w = x[lo:hi]
        med = np.nanmedian(w)
        mad = np.nanmedian(np.abs(w - med))
        sigma = 1.4826 * mad if mad > 0 else 0.0
        if sigma > 0 and np.abs(x[i] - med) > n_sigmas * sigma:
            mask[i] = False
    return mask
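
# Minimal usage sketch (illustrative values): a single spike among varied
# neighbours is flagged False (outlier), everything else stays True.
#   >>> hampel_mask(np.array([10., 11., 9., 100., 10., 12., 11.]))
#   array([ True,  True,  True, False,  True,  True,  True])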


def calibrate_pvusa_huber(df: pd.DataFrame) -> tuple:
    """
    Calibrate PVUSA coefficients using Huber Regression.
    
    UNCHANGED FROM ORIGINAL - No TDR corrections applied here.
    The coefficients are calibrated on raw measured data.
    """
    d = df.copy()
    
    d = d[d['gti'] > 50].copy()
    d = d[d['real_power_w'] >= 200.0].copy()
    
    if 'hour_utc' in d.columns:
        hrs = d['hour_utc'].dt.hour
        d = d[(hrs >= 12) & (hrs <= 14)].copy()
    elif 'timestamp' in d.columns:
        ts = pd.to_datetime(d['timestamp'])
        hrs = ts.dt.hour
        d = d[(hrs >= 12) & (hrs <= 14)].copy()
    
    if len(d) < 50:
        raise ValueError(f"Not enough data after time filtering: {len(d)} samples")
    
    G_temp = d['gti'].values
    P_temp = d['real_power_w'].values
    z = P_temp / G_temp
    
    q1, q3 = np.nanpercentile(z, [25, 75])
    iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    mask_iqr = (z >= lo) & (z <= hi)
    
    d_sorted = d.sort_values('hour_utc' if 'hour_utc' in d.columns else 'timestamp').reset_index()
    mask_hamp_series = hampel_mask(d_sorted['real_power_w'].values, window=7, n_sigmas=3.0)
    mask_hamp = pd.Series(mask_hamp_series, index=d_sorted['index']).reindex(d.index).fillna(True).astype(bool)
    
    d = d[mask_iqr & mask_hamp.values].copy()
    
    if len(d) > 50:
        upper_cap = d['real_power_w'].quantile(0.999)
        if np.isfinite(upper_cap):
            d = d[d['real_power_w'] <= upper_cap].copy()
    
    d = d.dropna(subset=['gti', 't_cell', 'wind_speed_corrected', 'real_power_w'])
    d = d.sort_values('hour_utc' if 'hour_utc' in d.columns else 'timestamp').reset_index(drop=True)
    
    if len(d) < 100:
        raise ValueError(f"Not enough data after filtering: {len(d)} samples (need >= 100)")
    
    G = d['gti'].values
    T = d['t_cell'].values
    V = d['wind_speed_corrected'].values
    P = d['real_power_w'].values
    
    T_delta = T - 25.0
    
    A = np.column_stack([G, G**2, G * T_delta, G * V])
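    # Fitted model form (intercept-free):
    #   P = c1*G + c2*G^2 + c3*G*(T - 25) + c4*G*V
    #     = G * (c1 + c2*G + c3*(T - 25) + c4*V)
    # i.e. the PVUSA regression with cell temperature referenced to 25 C (STC).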
    
    huber = HuberRegressor(epsilon=1.35, alpha=1e-4, fit_intercept=False, max_iter=2000)
    huber.fit(A, P)
    
    coefficients = {
        'c1': float(huber.coef_[0]),
        'c2': float(huber.coef_[1]),
        'c3': float(huber.coef_[2]),
        'c4': float(huber.coef_[3])
    }
    
    P_pred = A @ huber.coef_
    P_pred = np.clip(P_pred, 0, None)
    rmse = np.sqrt(mean_squared_error(P, P_pred))
    
    return coefficients, rmse, P_pred


def apply_pvusa_model(df: pd.DataFrame, coefficients: dict, 
                      loss_factor: float = 1.0, 
                      degradation_factor: float = 1.0,
                      apply_tdr: bool = False) -> pd.DataFrame:
    """
    Apply PVUSA model to predict power.
    
    PVUSA formula: P = G * (c1 + c2*G + c3*T + c4*V)
    
    If apply_tdr=True, multiply by time-varying TDR factor.
    """
    G = df['gti'].values
    T = df['t_cell'].values
    V = df['wind_speed_corrected'].values
    
    c1 = coefficients['c1']
    c2 = coefficients['c2']
    c3 = coefficients['c3']
    c4 = coefficients['c4']
    
    # PVUSA: P = G * (c1 + c2*G + c3*(T - 25) + c4*V)
    # The temperature term uses T - 25 to match calibrate_pvusa_huber, which
    # fits the coefficients against T_delta = T - 25.0 (STC reference).
    P_ideal_w = G * (c1 + c2 * G + c3 * (T - 25.0) + c4 * V)
    
    # Apply system losses and degradation
    P_pred_w = P_ideal_w * loss_factor * degradation_factor
    
    # Apply TDR factor if enabled and available
    if apply_tdr and 'tdr_factor' in df.columns:
        P_pred_w = P_pred_w * df['tdr_factor'].values
    
    P_pred_w = np.clip(P_pred_w, 0, None)
    
    df = df.copy()
    df['predicted_power_w'] = P_pred_w
    df['predicted_power_kw'] = P_pred_w / 1000.0
    df['predicted_power_ideal_w'] = np.clip(P_ideal_w, 0, None)
    
    return df


# ============================================================================
# VALIDATION FUNCTIONS - MATCHES NOTEBOOK
# ============================================================================

def compute_daily_energy(df: pd.DataFrame, power_col: str, time_col: str = 'hour_utc') -> pd.DataFrame:
    """Aggregate hourly power to daily energy using trapezoidal integration."""
    d = df[[time_col, power_col]].dropna().copy()
    d = d.sort_values(time_col)
    d['date'] = d[time_col].dt.date
    
    daily_rows = []
    for date, group in d.groupby('date'):
        if len(group) < 2:
            continue
        
        group = group.sort_values(time_col)
        
        t_ns = group[time_col].values.astype('datetime64[ns]').astype('int64')
        t_hours = (t_ns - t_ns[0]) / 3.6e12
        
        y_kw = group[power_col].values
        if 'w' in power_col.lower() and 'kw' not in power_col.lower():
            y_kw = y_kw / 1000.0
        
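        # Trapezoidal rule over the irregular hour stamps: e.g. 2 kW and 4 kW
        # samples one hour apart integrate to (2 + 4) / 2 * 1 h = 3 kWh.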
        energy_kwh = float(np.trapezoid(y_kw, t_hours))
        daily_rows.append({'date': pd.to_datetime(date), 'energy_kwh': energy_kwh})
    
    return pd.DataFrame(daily_rows)


def compute_validation_metrics(sim_daily: pd.DataFrame, real_daily: pd.DataFrame) -> tuple:
    """Compute validation metrics between simulated and real daily energy."""
    merged = pd.merge(
        sim_daily.rename(columns={'energy_kwh': 'sim_kwh'}),
        real_daily.rename(columns={'energy_kwh': 'real_kwh'}),
        on='date',
        how='inner'
    )
    
    if len(merged) < 2:
        return merged, {}
    
    y_true = merged['real_kwh'].values
    y_pred = merged['sim_kwh'].values
    
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.where(y_true != 0, y_true, 1))) * 100
    
    # Cumulative error
    cumsum_real = np.cumsum(y_true)
    cumsum_pred = np.cumsum(y_pred)
    cumulative_error_pct = 100 * (cumsum_real[-1] - cumsum_pred[-1]) / cumsum_real[-1]
    
    p50 = np.median(y_true)
    p95 = np.percentile(y_true, 95)
    
    metrics = {
        'R2': round(r2, 4),
        'RMSE_kWh': round(rmse, 2),
        'MAE_kWh': round(mae, 2),
        'MAPE_percent': round(mape, 2),
        'NRMSE_median': round(rmse / p50, 4) if p50 > 0 else 0,
        'NRMSE_P95': round(rmse / p95, 4) if p95 > 0 else 0,
        'Cumulative_Error_percent': round(cumulative_error_pct, 2),
        'n_days': len(merged)
    }
    
    return merged, metrics


# ============================================================================
# PLOTTING FUNCTIONS - WHITE BACKGROUND FOR PROFESSIONAL LOOK
# ============================================================================

def generate_diagnostic_plot(df: pd.DataFrame, coefficients: Dict) -> str:
    """Generate calibration diagnostic plots."""
    G = df['gti'].values
    T = df['t_cell'].values
    V = df['wind_speed_corrected'].values
    P_real = df['real_power_w'].values
    
    c1, c2, c3, c4 = coefficients['c1'], coefficients['c2'], coefficients['c3'], coefficients['c4']
    
    # Same STC temperature reference (T - 25) as used during calibration
    P_pred = G * (c1 + c2*G + c3*(T - 25.0) + c4*V)
    P_pred = np.clip(P_pred, 0, None)
    
    residuals = P_real - P_pred
    z = P_real / np.where(G > 0, G, 1)
    
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.patch.set_facecolor('white')
    
    for ax in axes.flatten():
        ax.set_facecolor('white')
        ax.tick_params(colors='black')
        ax.xaxis.label.set_color('black')
        ax.yaxis.label.set_color('black')
        ax.title.set_color('black')
        for spine in ax.spines.values():
            spine.set_color('black')
    
    ax = axes[0, 0]
    ax.scatter(G, P_real, alpha=0.3, s=8, label='Measured', color='#2E86AB')
    ax.scatter(G, P_pred, alpha=0.3, s=8, label='Predicted', color='#F6AD55')
    ax.set_xlabel('GTI (W/m2)')
    ax.set_ylabel('Power (W)')
    ax.set_title('Power vs Irradiance')
    ax.legend(facecolor='white', edgecolor='black')
    ax.grid(True, alpha=0.6, color='lightgrey')
    
    ax = axes[0, 1]
    ax.scatter(G, residuals, alpha=0.3, s=8, color='#fc8181')
    ax.axhline(0, color='black', linestyle='--', linewidth=1)
    ax.set_xlabel('GTI (W/m2)')
    ax.set_ylabel('Residual (W)')
    ax.set_title('Residuals vs Irradiance')
    ax.grid(True, alpha=0.6, color='lightgrey')
    
    ax = axes[1, 0]
    ax.scatter(G, z, alpha=0.3, s=8, color='#48bb78')
    ax.set_xlabel('GTI (W/m2)')
    ax.set_ylabel('P/G (W per W/m2)')
    ax.set_title('Conversion Ratio vs Irradiance')
    ax.grid(True, alpha=0.6, color='lightgrey')
    
    ax = axes[1, 1]
    ax.hist(residuals, bins=50, alpha=0.7, color='#805ad5', edgecolor='black')
    ax.axvline(0, color='#fc8181', linestyle='--', linewidth=2)
    ax.set_xlabel('Residual (W)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Residual Distribution (mu={np.mean(residuals):.1f}, sigma={np.std(residuals):.1f})')
    ax.grid(True, alpha=0.6, color='lightgrey')
    
    plt.tight_layout()
    
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=120, bbox_inches='tight', facecolor='white')
    plt.close(fig)
    buf.seek(0)
    
    return base64.b64encode(buf.getvalue()).decode('utf-8')


def generate_validation_plot(daily_df: pd.DataFrame, merged_df: pd.DataFrame = None, 
                             title_suffix: str = "") -> str:
    """Generate comprehensive daily energy validation plots."""
    daily_df = daily_df.copy()
    daily_df['error_kwh'] = daily_df['real_kwh'] - daily_df['sim_kwh']
    daily_df['error_pct'] = 100 * daily_df['error_kwh'] / daily_df['real_kwh']
    daily_df['month'] = daily_df['date'].dt.month
    daily_df['season'] = daily_df['month'].map({
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall'
    })
    
    fig, axes = plt.subplots(3, 2, figsize=(16, 14))
    fig.patch.set_facecolor('white')
    
    if title_suffix:
        fig.suptitle(f'Validation Results - {title_suffix}', fontsize=14, color='black', y=1.02)
    
    for ax in axes.flatten():
        ax.set_facecolor('white')
        ax.tick_params(colors='black')
        ax.xaxis.label.set_color('black')
        ax.yaxis.label.set_color('black')
        ax.title.set_color('black')
        for spine in ax.spines.values():
            spine.set_color('black')
    
    # Time series
    ax = axes[0, 0]
    ax.plot(daily_df['date'], daily_df['real_kwh'], 'o-', label='Measured', 
            color='#2E86AB', markersize=3, linewidth=1.5, alpha=0.9)
    ax.plot(daily_df['date'], daily_df['sim_kwh'], 's--', label='PVUSA + TDR', 
            color='#F6AD55', markersize=3, linewidth=1.5, alpha=0.9)
    ax.fill_between(daily_df['date'], daily_df['real_kwh'], daily_df['sim_kwh'], 
                     alpha=0.2, color='lightgrey')
    ax.set_xlabel('Date', fontsize=11)
    ax.set_ylabel('Daily Energy (kWh)', fontsize=11)
    ax.set_title('Daily Energy: Measured vs PVUSA + TDR', fontsize=12, pad=10)
    ax.legend(facecolor='white', edgecolor='black', fontsize=9)
    ax.grid(True, alpha=0.6, color='lightgrey')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right', color='black')
    
    # Scatter plot
    ax = axes[0, 1]
    ax.scatter(daily_df['real_kwh'], daily_df['sim_kwh'], alpha=0.6, s=35, 
               color='#48bb78', edgecolors='black', linewidth=0.5)
    
    max_val = max(daily_df['real_kwh'].max(), daily_df['sim_kwh'].max())
    ax.plot([0, max_val], [0, max_val], '--', linewidth=2, color='#fc8181', 
            label='Perfect Prediction', alpha=0.8)
    
    slope, intercept, r_value, _, _ = stats.linregress(daily_df['real_kwh'], daily_df['sim_kwh'])
    line = slope * daily_df['real_kwh'] + intercept
    ax.plot(daily_df['real_kwh'], line, '-', linewidth=2, color='#F6AD55', 
            label=f'Fit: y={slope:.3f}x+{intercept:.1f}', alpha=0.8)
    
    ax.set_xlabel('Measured Energy (kWh)', fontsize=11)
    ax.set_ylabel('Predicted Energy (kWh)', fontsize=11)
    ax.set_title(f'Scatter: R2={r_value**2:.4f}', fontsize=12, pad=10)
    ax.legend(facecolor='white', edgecolor='black', fontsize=9)
    ax.grid(True, alpha=0.6, color='lightgrey')
    ax.set_aspect('equal', adjustable='box')
    
    # Error distribution
    ax = axes[1, 0]
    n, bins, patches = ax.hist(daily_df['error_kwh'], bins=40, color='#fc8181', 
                                alpha=0.75, edgecolor='black', linewidth=0.8)
    ax.axvline(0, color='#48bb78', linestyle='--', linewidth=2.5, 
               label='Zero error', alpha=0.8)
    ax.axvline(daily_df['error_kwh'].mean(), color='#F6AD55', linestyle=':', 
               linewidth=2.5, label=f'Mean: {daily_df["error_kwh"].mean():.2f} kWh', alpha=0.8)
    
    mu, sigma = daily_df['error_kwh'].mean(), daily_df['error_kwh'].std()
    
    ax.set_xlabel('Error (Measured - Predicted) [kWh]', fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.set_title(f'Error Distribution (sigma={sigma:.2f} kWh)', fontsize=12, pad=10)
    ax.legend(facecolor='white', edgecolor='black', fontsize=9, loc='upper left')
    ax.grid(True, alpha=0.6, axis='y', color='lightgrey')
    
    # TDR factor over time
    ax = axes[1, 1]
    if merged_df is not None and 'tdr_factor' in merged_df.columns:
        merged_df_copy = merged_df.copy()
        merged_df_copy['date'] = merged_df_copy['hour_utc'].dt.date
        daily_tdr = merged_df_copy.groupby('date')['tdr_factor'].mean().reset_index()
        daily_tdr['date'] = pd.to_datetime(daily_tdr['date'])
        
        ax.plot(daily_tdr['date'], daily_tdr['tdr_factor'], 
                color='#F6AD55', linewidth=2, label='TDR Factor')
        ax.axhline(1.0, color='#fc8181', linestyle='--', alpha=0.5, label='Baseline (1.0)')
        ax.fill_between(daily_tdr['date'], daily_tdr['tdr_factor'], 1.0, 
                        alpha=0.3, color='#F6AD55')
        ax.set_ylabel('TDR Factor', fontsize=11)
        ax.set_title('Combined Degradation Over Time', fontsize=12, pad=10)
        ax.legend(facecolor='white', edgecolor='black', fontsize=9)
        ax.set_ylim([0.8, 1.01])
    else:
        ax.text(0.5, 0.5, 'TDR not applied', ha='center', va='center', 
                transform=ax.transAxes, color='black', fontsize=14)
    ax.set_xlabel('Date', fontsize=11)
    ax.grid(True, alpha=0.6, color='lightgrey')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right', color='black')
    
    # Seasonal box plots
    ax = axes[2, 0]
    season_order = ['Winter', 'Spring', 'Summer', 'Fall']
    season_data = [daily_df[daily_df['season'] == s]['error_pct'].dropna() 
                   for s in season_order]
    
    bp = ax.boxplot(season_data, labels=season_order, patch_artist=True,
                    boxprops=dict(facecolor='#F6AD55', alpha=0.7),
                    medianprops=dict(color='black', linewidth=2),
                    whiskerprops=dict(color='black'),
                    capprops=dict(color='black'),
                    flierprops=dict(marker='o', markerfacecolor='#fc8181', 
                                   markersize=4, alpha=0.5))
    
    ax.axhline(0, color='#48bb78', linestyle='--', linewidth=2, alpha=0.8)
    ax.set_ylabel('Error (%)', fontsize=11)
    ax.set_title('Seasonal Error Distribution', fontsize=12, pad=10)
    ax.grid(True, alpha=0.6, axis='y', color='lightgrey')
    
    # Cumulative energy
    ax = axes[2, 1]
    daily_df['cumsum_real'] = daily_df['real_kwh'].cumsum()
    daily_df['cumsum_sim'] = daily_df['sim_kwh'].cumsum()
    
    ax.plot(daily_df['date'], daily_df['cumsum_real'], color='#2E86AB', 
            linewidth=2.5, label='Measured (cumulative)', alpha=0.9)
    ax.plot(daily_df['date'], daily_df['cumsum_sim'], color='#F6AD55', 
            linewidth=2.5, label='Predicted (cumulative)', linestyle='--', alpha=0.9)
    ax.fill_between(daily_df['date'], daily_df['cumsum_real'], daily_df['cumsum_sim'], 
                     alpha=0.2, color='lightgrey')
    
    final_real = daily_df['cumsum_real'].iloc[-1]
    final_sim = daily_df['cumsum_sim'].iloc[-1]
    error_pct = 100 * (final_real - final_sim) / final_real
    
    ax.set_xlabel('Date', fontsize=11)
    ax.set_ylabel('Cumulative Energy (kWh)', fontsize=11)
    ax.set_title(f'Cumulative Energy (Total Error: {error_pct:.2f}%)', 
                 fontsize=12, pad=10)
    ax.legend(facecolor='white', edgecolor='black', fontsize=9)
    ax.grid(True, alpha=0.6, color='lightgrey')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right', color='black')
    
    plt.tight_layout()
    
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=120, bbox_inches='tight', facecolor='white')
    plt.close(fig)
    buf.seek(0)
    
    return base64.b64encode(buf.getvalue()).decode('utf-8')


def generate_train_test_comparison_plot(train_daily: pd.DataFrame, test_daily: pd.DataFrame,
                                        train_metrics: Dict, test_metrics: Dict,
                                        split_date: pd.Timestamp) -> str:
    """Generate comparison plot showing train vs test performance."""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.patch.set_facecolor('white')
    fig.suptitle(f'Train/Test Split Comparison (Split Date: {split_date.strftime("%Y-%m-%d")})', 
                 fontsize=14, color='black', y=1.02)
    
    for ax in axes.flatten():
        ax.set_facecolor('white')
        ax.tick_params(colors='black')
        ax.xaxis.label.set_color('black')
        ax.yaxis.label.set_color('black')
        ax.title.set_color('black')
        for spine in ax.spines.values():
            spine.set_color('black')
    
    # Combined time series
    ax = axes[0, 0]
    ax.plot(train_daily['date'], train_daily['real_kwh'], 'o-', label='Train Measured', 
            color='#2E86AB', markersize=2, linewidth=1, alpha=0.8)
    ax.plot(train_daily['date'], train_daily['sim_kwh'], 's--', label='Train Predicted', 
            color='#48bb78', markersize=2, linewidth=1, alpha=0.8)
    ax.plot(test_daily['date'], test_daily['real_kwh'], 'o-', label='Test Measured', 
            color='#F6AD55', markersize=2, linewidth=1, alpha=0.8)
    ax.plot(test_daily['date'], test_daily['sim_kwh'], 's--', label='Test Predicted', 
            color='#fc8181', markersize=2, linewidth=1, alpha=0.8)
    ax.axvline(split_date, color='black', linestyle='--', linewidth=2, alpha=0.7, label='Split Date')
    ax.set_xlabel('Date', fontsize=11)
    ax.set_ylabel('Daily Energy (kWh)', fontsize=11)
    ax.set_title('Train vs Test: Daily Energy', fontsize=12, pad=10)
    ax.legend(facecolor='white', edgecolor='black', fontsize=8, ncol=2)
    ax.grid(True, alpha=0.6, color='lightgrey')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right', color='black')
    
    # Scatter comparison
    ax = axes[0, 1]
    ax.scatter(train_daily['real_kwh'], train_daily['sim_kwh'], alpha=0.5, s=25, 
               color='#48bb78', label=f'Train (R2={train_metrics["R2"]:.4f})', edgecolors='none')
    ax.scatter(test_daily['real_kwh'], test_daily['sim_kwh'], alpha=0.5, s=25, 
               color='#fc8181', label=f'Test (R2={test_metrics["R2"]:.4f})', edgecolors='none')
    
    all_vals = pd.concat([train_daily[['real_kwh', 'sim_kwh']], test_daily[['real_kwh', 'sim_kwh']]])
    max_val = max(all_vals['real_kwh'].max(), all_vals['sim_kwh'].max())
    ax.plot([0, max_val], [0, max_val], '--', linewidth=2, color='black', alpha=0.5)
    
    ax.set_xlabel('Measured Energy (kWh)', fontsize=11)
    ax.set_ylabel('Predicted Energy (kWh)', fontsize=11)
    ax.set_title('Scatter: Train vs Test', fontsize=12, pad=10)
    ax.legend(facecolor='white', edgecolor='black', fontsize=9)
    ax.grid(True, alpha=0.6, color='lightgrey')
    ax.set_aspect('equal', adjustable='box')
    
    # Metrics comparison bar chart
    ax = axes[1, 0]
    metrics_names = ['R2', 'RMSE_kWh', 'MAE_kWh', 'MAPE_percent']
    x = np.arange(len(metrics_names))
    width = 0.35
    
    train_vals = [train_metrics[m] for m in metrics_names]
    test_vals = [test_metrics[m] for m in metrics_names]
    
    bars1 = ax.bar(x - width/2, train_vals, width, label='Train', color='#48bb78', alpha=0.8)
    bars2 = ax.bar(x + width/2, test_vals, width, label='Test', color='#fc8181', alpha=0.8)
    
    ax.set_xlabel('Metric', fontsize=11)
    ax.set_ylabel('Value', fontsize=11)
    ax.set_title('Metrics Comparison: Train vs Test', fontsize=12, pad=10)
    ax.set_xticks(x)
    ax.set_xticklabels(['R2', 'RMSE\n(kWh)', 'MAE\n(kWh)', 'MAPE\n(%)'], fontsize=9)
    ax.legend(facecolor='white', edgecolor='black', fontsize=9)
    ax.grid(True, alpha=0.6, axis='y', color='lightgrey')
    
    for bar, val in zip(bars1, train_vals):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                f'{val:.2f}', ha='center', va='bottom', fontsize=8, color='black')
    for bar, val in zip(bars2, test_vals):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                f'{val:.2f}', ha='center', va='bottom', fontsize=8, color='black')
    
    # Summary text
    ax = axes[1, 1]
    ax.axis('off')
    
    summary_text = f"""
    TRAIN/TEST SPLIT SUMMARY
    ========================
    
    Split Date: {split_date.strftime("%Y-%m-%d")}
    
    TRAINING SET ({train_metrics['n_days']} days)
    --------------------------
    R2:                 {train_metrics['R2']:.4f}
    RMSE:               {train_metrics['RMSE_kWh']:.2f} kWh/day
    MAE:                {train_metrics['MAE_kWh']:.2f} kWh/day
    MAPE:               {train_metrics['MAPE_percent']:.2f}%
    Cumulative Error:   {train_metrics['Cumulative_Error_percent']:.2f}%
    
    TEST SET ({test_metrics['n_days']} days)
    --------------------------
    R2:                 {test_metrics['R2']:.4f}
    RMSE:               {test_metrics['RMSE_kWh']:.2f} kWh/day
    MAE:                {test_metrics['MAE_kWh']:.2f} kWh/day
    MAPE:               {test_metrics['MAPE_percent']:.2f}%
    Cumulative Error:   {test_metrics['Cumulative_Error_percent']:.2f}%
    """
    
    ax.text(0.5, 0.5, summary_text, transform=ax.transAxes, fontsize=10,
            verticalalignment='center', horizontalalignment='center',
            fontfamily='monospace', color='black',
            bbox=dict(boxstyle='round', facecolor='white', edgecolor='black'))
    
    plt.tight_layout()
    
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=120, bbox_inches='tight', facecolor='white')
    plt.close(fig)
    buf.seek(0)
    
    return base64.b64encode(buf.getvalue()).decode('utf-8')


def generate_forecast_plot(forecast_df: pd.DataFrame) -> str:
    """Generate professional week-ahead forecast visualization."""
    df = forecast_df.copy()
    df['date'] = df['hour_utc'].dt.date
    df['day_name'] = df['hour_utc'].dt.strftime('%a')
    
    # Compute daily totals
    daily = df.groupby('date').agg({
        'predicted_power_kw': 'sum',
        'gti': ['sum', 'max'],
        't_cell': 'mean',
        'tdr_factor': 'mean',
        'precipitation_mm': 'sum'
    }).reset_index()
    daily.columns = ['date', 'energy_kwh', 'gti_total', 'gti_max', 't_cell_avg', 'tdr_avg', 'precip_total']
    daily['date'] = pd.to_datetime(daily['date'])
    daily['day_name'] = daily['date'].dt.strftime('%a')
    
    # Professional color scheme
    PRIMARY_COLOR = '#2563eb'  # Blue
    SECONDARY_COLOR = '#f59e0b'  # Amber/Orange for irradiance
    ACCENT_COLOR = '#ef4444'  # Red for temperature
    GRID_COLOR = '#e5e7eb'
    TEXT_COLOR = '#1f2937'
    
    fig = plt.figure(figsize=(16, 10))
    fig.patch.set_facecolor('white')
    
    # Create grid layout
    gs = fig.add_gridspec(2, 3, height_ratios=[1, 1], width_ratios=[2, 2, 1.2], 
                          hspace=0.3, wspace=0.3)
    
    # =========== TOP LEFT: Hourly Power Profile ===========
    ax1 = fig.add_subplot(gs[0, :2])
    ax1.set_facecolor('white')
    
    ax1.fill_between(df['hour_utc'], 0, df['predicted_power_kw'], 
                     alpha=0.4, color=PRIMARY_COLOR, linewidth=0)
    ax1.plot(df['hour_utc'], df['predicted_power_kw'], 
             color=PRIMARY_COLOR, linewidth=1.8, alpha=0.9)
    
    ax1.set_xlabel('', fontsize=10)
    ax1.set_ylabel('Power Output (kW)', fontsize=11, color=TEXT_COLOR, fontweight='medium')
    ax1.set_title('Hourly Power Generation Forecast', fontsize=13, color=TEXT_COLOR, 
                  fontweight='bold', pad=15)
    ax1.grid(True, alpha=0.5, color=GRID_COLOR, linestyle='-', linewidth=0.5)
    ax1.set_xlim(df['hour_utc'].min(), df['hour_utc'].max())
    ax1.set_ylim(bottom=0)
    
    # Format x-axis
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%a %d'))
    ax1.xaxis.set_major_locator(mdates.DayLocator())
    ax1.tick_params(axis='both', colors=TEXT_COLOR, labelsize=9)
    
    for spine in ax1.spines.values():
        spine.set_color(GRID_COLOR)
        spine.set_linewidth(0.5)
    
    # =========== TOP RIGHT: Summary Card ===========
    ax_summary = fig.add_subplot(gs[0, 2])
    ax_summary.axis('off')
    
    total_energy = daily['energy_kwh'].sum()
    avg_daily = daily['energy_kwh'].mean()
    peak_idx = daily['energy_kwh'].idxmax()
    peak_day = daily.loc[peak_idx, 'date']
    peak_energy = daily['energy_kwh'].max()
    min_energy = daily['energy_kwh'].min()
    avg_tdr = df['tdr_factor'].mean()
    
    # Create summary box
    summary_box = f"""
━━━━━━━━━━━━━━━━━━━━━━━━━━
   FORECAST SUMMARY
━━━━━━━━━━━━━━━━━━━━━━━━━━

  Period
  {df['hour_utc'].min().strftime('%b %d')} → {df['hour_utc'].max().strftime('%b %d, %Y')}

━━━━━━━━━━━━━━━━━━━━━━━━━━
  PRODUCTION
━━━━━━━━━━━━━━━━━━━━━━━━━━
  Total         {total_energy:>8.1f} kWh
  Daily Avg     {avg_daily:>8.1f} kWh
  Peak          {peak_energy:>8.1f} kWh
  Minimum       {min_energy:>8.1f} kWh

━━━━━━━━━━━━━━━━━━━━━━━━━━
  CONDITIONS
━━━━━━━━━━━━━━━━━━━━━━━━━━
  Avg GTI       {df['gti'].mean():>8.0f} W/m²
  Peak GTI      {df['gti'].max():>8.0f} W/m²
  Avg Temp      {df['t_cell'].mean():>8.1f} °C

━━━━━━━━━━━━━━━━━━━━━━━━━━
  SYSTEM
━━━━━━━━━━━━━━━━━━━━━━━━━━
  TDR Factor    {avg_tdr:>8.4f}
  Efficiency    {avg_tdr*100:>7.1f} %
"""
    
    ax_summary.text(0.5, 0.5, summary_box, transform=ax_summary.transAxes, 
                    fontsize=9, fontfamily='monospace', color=TEXT_COLOR,
                    verticalalignment='center', horizontalalignment='center',
                    bbox=dict(boxstyle='round,pad=0.5', facecolor='#f8fafc', 
                             edgecolor=PRIMARY_COLOR, linewidth=1.5))
    
    # =========== BOTTOM LEFT: Daily Energy Bars ===========
    ax2 = fig.add_subplot(gs[1, 0])
    ax2.set_facecolor('white')
    
    x_pos = np.arange(len(daily))
    bars = ax2.bar(x_pos, daily['energy_kwh'], color=PRIMARY_COLOR, alpha=0.85, 
                   edgecolor='white', linewidth=1.5, width=0.7)
    
    # Highlight peak day
    peak_bar_idx = daily['energy_kwh'].idxmax()
    bars[peak_bar_idx].set_color('#16a34a')
    bars[peak_bar_idx].set_alpha(0.95)
    
    # Add value labels
    for i, (bar, val) in enumerate(zip(bars, daily['energy_kwh'])):
        color = '#16a34a' if i == peak_bar_idx else PRIMARY_COLOR
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.15, 
                f'{val:.1f}', ha='center', va='bottom', fontsize=10, 
                color=color, fontweight='bold')
    
    ax2.set_xlabel('', fontsize=10)
    ax2.set_ylabel('Daily Energy (kWh)', fontsize=11, color=TEXT_COLOR, fontweight='medium')
    ax2.set_title('Daily Production Forecast', fontsize=13, color=TEXT_COLOR, 
                  fontweight='bold', pad=15)
    ax2.set_xticks(x_pos)
    ax2.set_xticklabels([f"{row['day_name']}\n{row['date'].strftime('%d/%m')}" 
                         for _, row in daily.iterrows()], fontsize=9)
    ax2.grid(True, alpha=0.5, color=GRID_COLOR, axis='y', linestyle='-', linewidth=0.5)
    ax2.set_ylim(0, max(daily['energy_kwh']) * 1.15)
    ax2.tick_params(axis='both', colors=TEXT_COLOR, labelsize=9)
    
    for spine in ax2.spines.values():
        spine.set_color(GRID_COLOR)
        spine.set_linewidth(0.5)
    
    # =========== BOTTOM CENTER: Weather Conditions ===========
    ax3 = fig.add_subplot(gs[1, 1])
    ax3.set_facecolor('white')
    ax3_twin = ax3.twinx()
    
    # Plot GTI as area
    ax3.fill_between(df['hour_utc'], 0, df['gti'], alpha=0.35, color=SECONDARY_COLOR)
    line1, = ax3.plot(df['hour_utc'], df['gti'], color=SECONDARY_COLOR, 
                      linewidth=1.5, alpha=0.9, label='Irradiance (GTI)')
    
    # Plot temperature as line
    line2, = ax3_twin.plot(df['hour_utc'], df['t_cell'], color=ACCENT_COLOR, 
                           linewidth=2, alpha=0.85, label='Cell Temperature')
    
    ax3.set_xlabel('', fontsize=10)
    ax3.set_ylabel('Global Tilted Irradiance (W/m²)', fontsize=10, 
                   color=SECONDARY_COLOR, fontweight='medium')
    ax3_twin.set_ylabel('Cell Temperature (°C)', fontsize=10, 
                        color=ACCENT_COLOR, fontweight='medium')
    ax3.set_title('Weather Conditions', fontsize=13, color=TEXT_COLOR, 
                  fontweight='bold', pad=15)
    
    ax3.tick_params(axis='y', labelcolor=SECONDARY_COLOR, labelsize=9)
    ax3_twin.tick_params(axis='y', labelcolor=ACCENT_COLOR, labelsize=9)
    ax3.tick_params(axis='x', colors=TEXT_COLOR, labelsize=9)
    
    ax3.xaxis.set_major_formatter(mdates.DateFormatter('%a %d'))
    ax3.xaxis.set_major_locator(mdates.DayLocator())
    ax3.set_xlim(df['hour_utc'].min(), df['hour_utc'].max())
    ax3.set_ylim(bottom=0)
    
    ax3.grid(True, alpha=0.5, color=GRID_COLOR, linestyle='-', linewidth=0.5)
    
    # Legend
    lines = [line1, line2]
    labels = [l.get_label() for l in lines]
    ax3.legend(lines, labels, loc='upper right', fontsize=9, 
               framealpha=0.95, edgecolor=GRID_COLOR)
    
    for spine in ax3.spines.values():
        spine.set_color(GRID_COLOR)
        spine.set_linewidth(0.5)
    for spine in ax3_twin.spines.values():
        spine.set_color(GRID_COLOR)
        spine.set_linewidth(0.5)
    
    # =========== BOTTOM RIGHT: Daily Conditions Table ===========
    ax_table = fig.add_subplot(gs[1, 2])
    ax_table.axis('off')
    
    # Build the daily-conditions table (one row per forecast day)
    table_data = []
    for _, row in daily.iterrows():
        table_data.append([
            row['date'].strftime('%d'),
            f"{row['energy_kwh']:.1f}",
            f"{row['gti_max']:.0f}",
            f"{row['t_cell_avg']:.0f}"
        ])
    
    table = ax_table.table(
        cellText=table_data,
        colLabels=['Day', 'kWh', 'GTI', '°C'],
        loc='center',
        cellLoc='center',
        colColours=[PRIMARY_COLOR]*4,
        colWidths=[0.18, 0.28, 0.28, 0.26]
    )
    
    table.auto_set_font_size(False)
    table.set_fontsize(7)
    table.scale(1, 1.4)
    
    # Style header
    for i in range(4):
        table[(0, i)].set_text_props(color='white', fontweight='bold')
        table[(0, i)].set_facecolor(PRIMARY_COLOR)
    
    # Style data rows
    for i in range(1, len(table_data) + 1):
        for j in range(4):
            table[(i, j)].set_facecolor('#f8fafc' if i % 2 == 0 else 'white')
            table[(i, j)].set_edgecolor(GRID_COLOR)
    
    ax_table.set_title('Daily Conditions', fontsize=11, color=TEXT_COLOR, 
                       fontweight='bold', pad=10, loc='center')
    
    # Main title
    fig.suptitle('PV System 15-Day Production Forecast', fontsize=16, 
                 fontweight='bold', color=TEXT_COLOR, y=0.98)
    
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=150, bbox_inches='tight', facecolor='white')
    plt.close(fig)
    buf.seek(0)
    
    # Return the PNG base64-encoded so the front end can embed it directly
    # (e.g. in a "data:image/png;base64,..." URI).
    return base64.b64encode(buf.getvalue()).decode('utf-8')


# ============================================================================
# ROUTES
# ============================================================================
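
# Convention (inferred from the handlers below): each route returns a plain
# dict on success and a (dict, HTTP-status-code) tuple on failure, mirroring
# a Flask-style contract for the JavaScript dispatch layer.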

def process_load_config(data: Dict):
    """Step 1: Load plant configuration."""
    try:
        config = load_plant_config(data)
        simulation_state['config'] = config
        
        array = config['arrays'][0]
        
        return {
            'status': 'success',
            'building_name': config['building_name'],
            'location': config['location'],
            'prediction_period': config['prediction_period'],
            'arrays': config['arrays'],
            'roof_height': config['roof_height'],
            'system_factors': {
                'loss_factor': clean_float(array['loss_factor']),
                'loss_percentage': clean_float(array['loss_percentage']),
                'degradation_factor': clean_float(array['degradation_factor'])
            },
            'tdr_enabled': simulation_state['tdr_enabled'],
            'module_params': simulation_state['module_params'],
            'train_years': simulation_state['train_years']
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500


def process_load_data(data: pd.DataFrame):
    """Step 2: Fetch weather from Open-Meteo API and load power data."""
    try:
        if simulation_state['config'] is None:
            return {'status': 'error', 'message': 'Load config first'}, 400
        
        config = simulation_state['config']
        weather_df, power_df, merged_df = load_and_align_data(data, config)
        
        simulation_state['weather_df'] = weather_df
        simulation_state['power_df'] = power_df
        simulation_state['merged_df'] = merged_df
        
        # Calculate train/test split date
        train_years = simulation_state['train_years']
        start_date = pd.to_datetime(config['prediction_period']['start_date']).tz_localize('UTC')
        split_date = start_date + relativedelta(years=train_years)
        simulation_state['train_split_date'] = split_date
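        # Example: with a prediction-period start of 2020-01-01 and
        # train_years=2, split_date is 2022-01-01 00:00 UTC; hours strictly
        # before it become training data, the remainder test data.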
        
        # Split data into train and test sets
        train_df = merged_df[merged_df['hour_utc'] < split_date].copy()
        test_df = merged_df[merged_df['hour_utc'] >= split_date].copy()
        
        simulation_state['train_df'] = train_df
        simulation_state['test_df'] = test_df
        
        return {
            'status': 'success',
            'weather_records': len(weather_df),
            'power_records': len(power_df),
            'merged_records': len(merged_df),
            'date_range': {
                'start': str(merged_df['hour_utc'].min()),
                'end': str(merged_df['hour_utc'].max())
            },
            'train_test_split': {
                'split_date': str(split_date),
                'train_years': train_years,
                'train_records': len(train_df),
                'test_records': len(test_df),
                'train_period': f"{train_df['hour_utc'].min()} to {train_df['hour_utc'].max()}" if len(train_df) > 0 else "N/A",
                'test_period': f"{test_df['hour_utc'].min()} to {test_df['hour_utc'].max()}" if len(test_df) > 0 else "N/A"
            },
            'weather_stats': {
                'gti_mean': clean_float(merged_df['gti'].mean()),
                'gti_max': clean_float(merged_df['gti'].max()),
                't_cell_mean': clean_float(merged_df['t_cell'].mean()),
                't_cell_max': clean_float(merged_df['t_cell'].max()),
                'wind_mean': clean_float(merged_df['wind_speed_corrected'].mean())
            },
            'power_stats': {
                'mean_kw': clean_float(merged_df['real_power_kw'].mean()),
                'max_kw': clean_float(merged_df['real_power_kw'].max()),
                'mean_w': clean_float(merged_df['real_power_w'].mean()),
                'max_w': clean_float(merged_df['real_power_w'].max())
            },
            'data_source': 'Open-Meteo Archive API'
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500


def process_calibrate(data: Dict):
    """Step 3: Calibrate PVUSA coefficients using TRAINING data only."""
    try:
        if simulation_state['merged_df'] is None:
            return {'status': 'error', 'message': 'Load data first'}, 400

        use_json_coefs = data.get('use_json_coefficients', False)
        
        # Use TRAINING data for calibration
        train_df = simulation_state['train_df']
        if train_df is None or len(train_df) == 0:
            return {'status': 'error', 'message': 'No training data available'}, 400
        
        # Restrict calibration to daylight samples; below ~50 W/m² measurement
        # noise dominates and would bias the fit.
        df_calib = train_df[train_df['gti'] > 50].copy()
        
        if use_json_coefs and simulation_state['config']:
            json_coefs = simulation_state['config']['arrays'][0]['pvusa_coefficients']
            coefficients = {
                'c1': json_coefs['c1'],
                'c2': json_coefs['c2'],
                'c3': json_coefs['c3'],
                'c4': json_coefs['c4']
            }
            
            G = df_calib['gti'].values
            T = df_calib['t_cell'].values
            V = df_calib['wind_speed_corrected'].values
            P = df_calib['real_power_w'].values
            
            # PVUSA form: P = G * (c1 + c2*G + c3*T_cell + c4*v_wind)
            P_pred = G * (coefficients['c1'] + coefficients['c2']*G +
                          coefficients['c3']*T + coefficients['c4']*V)
            P_pred = np.clip(P_pred, 0, None)  # clamp negative predictions to zero
            rmse = np.sqrt(mean_squared_error(P, P_pred))
            method = 'JSON Pre-defined'
        else:
            if len(df_calib) < 100:
                return {'status': 'error', 'message': f'Not enough training data: {len(df_calib)} samples'}, 400
            
            coefficients, rmse, _ = calibrate_pvusa_huber(df_calib)
            method = 'Huber Regression (Training Data)'
        
        simulation_state['coefficients'] = coefficients
        simulation_state['calibration_method'] = method
        
        plot_b64 = generate_diagnostic_plot(df_calib, coefficients)
        
        return {
            'status': 'success',
            'method': method,
            'coefficients': {
                'c1': clean_float(coefficients['c1']),
                'c2': clean_float(coefficients['c2']),
                'c3': clean_float(coefficients['c3']),
                'c4': clean_float(coefficients['c4'])
            },
            'calibration_rmse_w': clean_float(rmse),
            'samples_used': len(df_calib),
            'calibration_period': f"{train_df['hour_utc'].min()} to {train_df['hour_utc'].max()}",
            'note': 'Calibration performed on TRAINING data only. TDR applied during prediction.',
            'plot_image': plot_b64
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500
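

# Editorial sketch (not called by the pipeline): calibrate_pvusa_huber() is
# defined earlier in this module and takes the calibration DataFrame (it also
# returns a third value, so its signature differs). The PVUSA model evaluated
# above,
#     P = G * (c1 + c2*G + c3*T + c4*V),
# expands to P = c1*G + c2*G^2 + c3*G*T + c4*G*V, i.e. it is linear in the
# coefficients, so a robust calibration reduces to Huber regression on four
# derived features with no intercept. A minimal, hypothetical version:
def _pvusa_huber_fit_sketch(G, T, V, P):
    """Illustrative robust PVUSA fit; expects equal-length numpy arrays."""
    X = np.column_stack([G, G**2, G*T, G*V])        # PVUSA feature expansion
    model = HuberRegressor(fit_intercept=False).fit(X, P)
    c1, c2, c3, c4 = model.coef_
    P_hat = np.clip(model.predict(X), 0, None)      # no negative power
    rmse = float(np.sqrt(mean_squared_error(P, P_hat)))
    return {'c1': c1, 'c2': c2, 'c3': c3, 'c4': c4}, rmse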


def process_predict(data: Dict):
    """Step 4: Apply PVUSA model to predict power for ALL data (train + test)."""
    try:
        if simulation_state['coefficients'] is None:
            return {'status': 'error', 'message': 'Calibrate coefficients first'}, 400
        
        apply_tdr = data.get('apply_tdr', True)
        
        # Apply to FULL dataset
        df = simulation_state['merged_df'].copy()
        coefficients = simulation_state['coefficients']
        config = simulation_state['config']
        module_params = simulation_state['module_params']
        
        array_config = config['arrays'][0] if config else {}
        degradation_factor = array_config.get('degradation_factor', 1.0)
        
        if apply_tdr:
            df = calculate_tdr_factors(df, module_params)
        
        df = apply_pvusa_model(df, coefficients, 
                              loss_factor=1.0,
                              degradation_factor=degradation_factor,
                              apply_tdr=apply_tdr)
        
        simulation_state['predictions_df'] = df
        simulation_state['tdr_applied'] = apply_tdr
        
        # Save predictions
        cols_to_save = ['hour_utc', 'gti', 't_cell', 'wind_speed_corrected', 
                        'real_power_w', 'real_power_kw', 'predicted_power_w', 'predicted_power_kw',
                        'predicted_power_ideal_w']
        if 'tdr_factor' in df.columns:
            cols_to_save.extend(['tdr_factor', 'acceleration_factor', 'effective_year'])
        
        web_bridge.writeTmpFile("pvusa_predictions.csv", df[cols_to_save].to_csv())
        
        tdr_stats = {}
        if 'tdr_factor' in df.columns:
            tdr_stats = {
                'tdr_initial': clean_float(df['tdr_factor'].iloc[0]),
                'tdr_final': clean_float(df['tdr_factor'].iloc[-1]),
                'tdr_mean': clean_float(df['tdr_factor'].mean()),
                'tdr_degradation_pct': clean_float((1 - df['tdr_factor'].iloc[-1]) * 100),
                'mean_acceleration_factor': clean_float(df['acceleration_factor'].mean()),
                'effective_years': clean_float(df['effective_year'].iloc[-1])
            }
        
        return {
            'status': 'success',
            'records': len(df),
            'tdr_applied': apply_tdr,
            'tdr_stats': tdr_stats,
            'system_factors': {
                'loss_factor': clean_float(1.0),
                'degradation_factor': clean_float(degradation_factor),
                'note': 'Full TDR model applied (Thermal + Humidity + Soiling)' if apply_tdr else 'TDR not applied'
            },
            'predictions': {
                'mean_predicted_kw': clean_float(df['predicted_power_kw'].mean()),
                'max_predicted_kw': clean_float(df['predicted_power_kw'].max()),
                'mean_real_kw': clean_float(df['real_power_kw'].mean()),
                'max_real_kw': clean_float(df['real_power_kw'].max()),
                'total_predicted_kwh': clean_float(df['predicted_power_kw'].sum()),
                'total_real_kwh': clean_float(df['real_power_kw'].sum())
            }
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500
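

# Editorial sketch: the 'acceleration_factor' column produced by
# calculate_tdr_factors() (defined earlier) presumably follows the standard
# Arrhenius form for thermal acceleration,
#     AF = exp((Ea / k_B) * (1/T_ref - 1/T_cell)),   k_B = 8.617e-5 eV/K,
# with temperatures in kelvin. A hypothetical helper mirroring that relation:
def _arrhenius_af_sketch(t_cell_K, Ea_eV, T_ref_K):
    """Illustrative Arrhenius acceleration factor (dimensionless)."""
    k_B = 8.617333e-5  # Boltzmann constant in eV/K
    return np.exp((Ea_eV / k_B) * (1.0 / T_ref_K - 1.0 / t_cell_K))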


def process_validate():
    """Step 5: Validate predictions with daily energy metrics for TRAIN, TEST, and COMBINED."""
    try:
        if simulation_state['predictions_df'] is None:
            return {'status': 'error', 'message': 'Run predictions first'}, 400
        
        df = simulation_state['predictions_df'].copy()
        split_date = simulation_state['train_split_date']
        
        # Split predictions into train and test
        train_pred_df = df[df['hour_utc'] < split_date].copy()
        test_pred_df = df[df['hour_utc'] >= split_date].copy()
        
        # ========== TRAINING SET VALIDATION ==========
        train_sim_daily = compute_daily_energy(train_pred_df, 'predicted_power_w', 'hour_utc')
        train_real_daily = compute_daily_energy(train_pred_df, 'real_power_w', 'hour_utc')
        train_daily_merged, train_metrics = compute_validation_metrics(train_sim_daily, train_real_daily)
        
        simulation_state['train_daily_validation'] = train_daily_merged
        simulation_state['train_metrics'] = train_metrics
        
        train_plot_b64 = generate_validation_plot(train_daily_merged, train_pred_df, "TRAINING SET")
        
        # ========== TEST SET VALIDATION ==========
        test_sim_daily = compute_daily_energy(test_pred_df, 'predicted_power_w', 'hour_utc')
        test_real_daily = compute_daily_energy(test_pred_df, 'real_power_w', 'hour_utc')
        test_daily_merged, test_metrics = compute_validation_metrics(test_sim_daily, test_real_daily)
        
        simulation_state['test_daily_validation'] = test_daily_merged
        simulation_state['test_metrics'] = test_metrics
        
        test_plot_b64 = generate_validation_plot(test_daily_merged, test_pred_df, "TEST SET")
        
        # ========== COMBINED VALIDATION ==========
        sim_daily = compute_daily_energy(df, 'predicted_power_w', 'hour_utc')
        real_daily = compute_daily_energy(df, 'real_power_w', 'hour_utc')
        daily_merged, metrics = compute_validation_metrics(sim_daily, real_daily)
        
        simulation_state['daily_validation'] = daily_merged
        simulation_state['metrics'] = metrics
        
        combined_plot_b64 = generate_validation_plot(daily_merged, df, "COMBINED (ALL DATA)")
        
        # Generate train/test comparison plot
        comparison_plot_b64 = generate_train_test_comparison_plot(
            train_daily_merged, test_daily_merged, 
            train_metrics, test_metrics, split_date
        )
        
        # Save all validation data
        web_bridge.writeTmpFile("daily_validation_train.csv", train_daily_merged.to_csv())
        web_bridge.writeTmpFile("daily_validation_test.csv", test_daily_merged.to_csv())
        web_bridge.writeTmpFile("daily_validation.csv", daily_merged.to_csv())
        
        return {
            'status': 'success',
            'split_date': str(split_date),
            'tdr_applied': simulation_state.get('tdr_applied', False),
            
            'train_metrics': train_metrics,
            'train_daily_stats': {
                'mean_sim_kwh': clean_float(train_daily_merged['sim_kwh'].mean()),
                'mean_real_kwh': clean_float(train_daily_merged['real_kwh'].mean()),
                'total_sim_kwh': clean_float(train_daily_merged['sim_kwh'].sum()),
                'total_real_kwh': clean_float(train_daily_merged['real_kwh'].sum())
            },
            'train_plot_image': train_plot_b64,
            
            'test_metrics': test_metrics,
            'test_daily_stats': {
                'mean_sim_kwh': clean_float(test_daily_merged['sim_kwh'].mean()),
                'mean_real_kwh': clean_float(test_daily_merged['real_kwh'].mean()),
                'total_sim_kwh': clean_float(test_daily_merged['sim_kwh'].sum()),
                'total_real_kwh': clean_float(test_daily_merged['real_kwh'].sum())
            },
            'test_plot_image': test_plot_b64,
            
            'combined_metrics': metrics,
            'combined_daily_stats': {
                'mean_sim_kwh': clean_float(daily_merged['sim_kwh'].mean()),
                'mean_real_kwh': clean_float(daily_merged['real_kwh'].mean()),
                'total_sim_kwh': clean_float(daily_merged['sim_kwh'].sum()),
                'total_real_kwh': clean_float(daily_merged['real_kwh'].sum())
            },
            'combined_plot_image': combined_plot_b64,
            
            'comparison_plot_image': comparison_plot_b64
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500
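

# Editorial sketch (not called by the pipeline): compute_daily_energy() and
# compute_validation_metrics() are defined earlier in this module. Under the
# assumption that compute_daily_energy() returns one row per date with an
# 'energy_kwh' column, the validation step above amounts to a date-aligned
# merge plus standard regression metrics -- a hypothetical equivalent:
def _daily_validation_sketch(sim_daily, real_daily):
    """Illustrative daily validation; column names are assumptions."""
    merged = sim_daily.merge(real_daily, on='date', suffixes=('_sim', '_real'))
    merged = merged.rename(columns={'energy_kwh_sim': 'sim_kwh',
                                    'energy_kwh_real': 'real_kwh'})
    y_true, y_pred = merged['real_kwh'], merged['sim_kwh']
    metrics = {
        'rmse_kwh': float(np.sqrt(mean_squared_error(y_true, y_pred))),
        'mae_kwh': float(mean_absolute_error(y_true, y_pred)),
        'r2': float(r2_score(y_true, y_pred)),
    }
    return merged, metrics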


def process_get_timeseries():
    """Get time series data for frontend plotting."""
    try:
        if simulation_state['daily_validation'] is None:
            return {'status': 'error', 'message': 'No validation data'}, 400
        
        daily = simulation_state['daily_validation']
        df = simulation_state['predictions_df']
        
        response = {
            'dates': daily['date'].dt.strftime('%Y-%m-%d').tolist(),
            'sim_kwh': [clean_float(x) for x in daily['sim_kwh'].tolist()],
            'real_kwh': [clean_float(x) for x in daily['real_kwh'].tolist()]
        }
        
        if df is not None and 'tdr_factor' in df.columns:
            df_copy = df.copy()
            df_copy['date'] = df_copy['hour_utc'].dt.date
            daily_tdr = df_copy.groupby('date')['tdr_factor'].mean().reset_index()
            daily_tdr['date'] = pd.to_datetime(daily_tdr['date'])
            
            response['tdr_dates'] = daily_tdr['date'].dt.strftime('%Y-%m-%d').tolist()
            response['tdr_factor'] = [clean_float(x) for x in daily_tdr['tdr_factor'].tolist()]
        
        # Add train/test split info
        if simulation_state['train_split_date']:
            response['split_date'] = simulation_state['train_split_date'].strftime('%Y-%m-%d')
        
        return response
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500


def process_set_module_params(data: Dict):
    """Update module parameters for TDR model."""
    try:
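        # Only keys already present in module_params are updated; unknown keys
        # in the payload are silently ignored.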
        for key in simulation_state['module_params'].keys():
            if key in data:
                simulation_state['module_params'][key] = float(data[key])
        
        return {
            'status': 'success',
            'module_params': simulation_state['module_params']
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500


def process_set_train_years(data: Dict):
    """Set the number of years for training data."""
    try:
        years = data.get('train_years', 2)
        simulation_state['train_years'] = int(years)
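        # Takes effect the next time process_load_data() runs, where the
        # train/test split date is recomputed.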
        
        return {
            'status': 'success',
            'train_years': simulation_state['train_years']
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500
    

def process_forecast(data: Dict):
    """Step 6: Generate 15-day forecast using Open-Meteo Forecast API."""
    try:
        if simulation_state['coefficients'] is None:
            return {'status': 'error', 'message': 'Calibrate coefficients first (complete steps 1-5)'}, 400
        
        if simulation_state['config'] is None:
            return {'status': 'error', 'message': 'Load configuration first'}, 400
        
        forecast_days = data.get('forecast_days', 15)
        apply_tdr = data.get('apply_tdr', True)
        
        config = simulation_state['config']
        coefficients = simulation_state['coefficients']
        module_params = simulation_state['module_params']
        
        # Fetch forecast weather from Open-Meteo Forecast API
        forecast_df = fetch_forecast_weather(config, forecast_days)
        
        # Apply TDR (using historical baseline + forecast soiling)
        if apply_tdr:
            historical_df = simulation_state.get('predictions_df', None)
            forecast_df = calculate_forecast_tdr(forecast_df, module_params, historical_df)
        else:
            forecast_df['tdr_factor'] = 1.0
            forecast_df['acceleration_factor'] = 1.0
            forecast_df['effective_year'] = 0.0
            forecast_df['cumulative_degradation_pct'] = 0.0
        
        # Get degradation factor from config
        array_config = config['arrays'][0] if config else {}
        degradation_factor = array_config.get('degradation_factor', 1.0)
        
        # Apply PVUSA model to forecast data
        forecast_df = apply_pvusa_model(
            forecast_df, 
            coefficients, 
            loss_factor=1.0,
            degradation_factor=degradation_factor,
            apply_tdr=apply_tdr
        )
        
        # Store forecast data
        simulation_state['forecast_df'] = forecast_df
        simulation_state['forecast_predictions_df'] = forecast_df
        
        # Aggregate to daily values (with hourly records, summing kW over a day gives kWh)
        forecast_df['date'] = forecast_df['hour_utc'].dt.date
        daily_forecast = forecast_df.groupby('date').agg({
            'predicted_power_kw': 'sum',
            'gti': ['sum', 'max', 'mean'],
            't_cell': ['mean', 'max'],
            'tdr_factor': 'mean',
            'precipitation_mm': 'sum'
        }).reset_index()
        
        # Flatten column names
        daily_forecast.columns = ['date', 'energy_kwh', 'gti_total', 'gti_max', 'gti_mean',
                                   't_cell_avg', 't_cell_max', 'tdr_avg', 'precip_total']
        
        # Save forecast data
        web_bridge.writeTmpFile("forecast_predictions.csv", forecast_df.to_csv())
        web_bridge.writeTmpFile("forecast_daily.csv", daily_forecast.to_csv())
        
        # Prepare daily breakdown for response
        daily_breakdown = []
        for _, row in daily_forecast.iterrows():
            daily_breakdown.append({
                'date': str(row['date']),
                'energy_kwh': round(clean_float(row['energy_kwh']), 2),
                'gti_total': round(clean_float(row['gti_total']), 0),
                'gti_max': round(clean_float(row['gti_max']), 0),
                't_cell_avg': round(clean_float(row['t_cell_avg']), 1),
                'tdr_factor': round(clean_float(row['tdr_avg']), 4),
                'precip_mm': round(clean_float(row['precip_total']), 1)
            })
        
        # Prepare hourly data for Plotly charts
        hourly_data = {
            'timestamps': forecast_df['hour_utc'].dt.strftime('%Y-%m-%dT%H:%M').tolist(),
            'predicted_kw': [round(clean_float(x), 3) for x in forecast_df['predicted_power_kw'].tolist()],
            'gti': [round(clean_float(x), 1) for x in forecast_df['gti'].tolist()],
            't_cell': [round(clean_float(x), 1) for x in forecast_df['t_cell'].tolist()],
            'tdr_factor': [round(clean_float(x), 4) for x in forecast_df['tdr_factor'].tolist()]
        }
        
        # Prepare daily data for Plotly bar chart
        daily_chart_data = {
            'dates': [str(d) for d in daily_forecast['date'].tolist()],
            'energy_kwh': [round(clean_float(x), 2) for x in daily_forecast['energy_kwh'].tolist()],
            'gti_max': [round(clean_float(x), 0) for x in daily_forecast['gti_max'].tolist()],
            't_cell_avg': [round(clean_float(x), 1) for x in daily_forecast['t_cell_avg'].tolist()]
        }
        
        return {
            'status': 'success',
            'forecast_days': forecast_days,
            'tdr_applied': apply_tdr,
            'forecast_period': {
                'start': str(forecast_df['hour_utc'].min()),
                'end': str(forecast_df['hour_utc'].max())
            },
            'summary': {
                'total_energy_kwh': round(clean_float(daily_forecast['energy_kwh'].sum()), 1),
                'daily_average_kwh': round(clean_float(daily_forecast['energy_kwh'].mean()), 1),
                'peak_energy_kwh': round(clean_float(daily_forecast['energy_kwh'].max()), 1),
                'min_energy_kwh': round(clean_float(daily_forecast['energy_kwh'].min()), 1),
                'peak_date': str(daily_forecast.loc[daily_forecast['energy_kwh'].idxmax(), 'date']),
                'avg_tdr_factor': round(clean_float(forecast_df['tdr_factor'].mean()), 4),
                'avg_gti': round(clean_float(forecast_df['gti'].mean()), 0),
                'max_gti': round(clean_float(forecast_df['gti'].max()), 0),
                'avg_t_cell': round(clean_float(forecast_df['t_cell'].mean()), 1)
            },
            'daily_breakdown': daily_breakdown,
            'hourly_data': hourly_data,
            'daily_chart_data': daily_chart_data,
            'hourly_records': len(forecast_df),
            'data_source': 'Open-Meteo Forecast API'
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500
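

# Editorial sketch: fetch_forecast_weather() (defined earlier) wraps the
# Open-Meteo Forecast API (https://api.open-meteo.com/v1/forecast). A minimal,
# hypothetical request for the kind of hourly fields this module consumes --
# the variable names the real helper requests may differ:
def _fetch_forecast_sketch(latitude, longitude, days=15):
    """Illustrative Open-Meteo forecast call; returns the parsed JSON."""
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'forecast_days': days,
        'hourly': ('temperature_2m,wind_speed_10m,precipitation,'
                   'global_tilted_irradiance'),
        'timezone': 'UTC',
    }
    resp = requests.get('https://api.open-meteo.com/v1/forecast',
                        params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()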


def process_get_forecast_timeseries():
    """Get forecast time series data for frontend plotting."""
    try:
        if simulation_state['forecast_predictions_df'] is None:
            return {'status': 'error', 'message': 'No forecast data available'}, 400
        
        df = simulation_state['forecast_predictions_df']
        
        response = {
            'timestamps': df['hour_utc'].dt.strftime('%Y-%m-%d %H:%M').tolist(),
            'predicted_kw': [clean_float(x) for x in df['predicted_power_kw'].tolist()],
            'gti': [clean_float(x) for x in df['gti'].tolist()],
            't_cell': [clean_float(x) for x in df['t_cell'].tolist()],
            'tdr_factor': [clean_float(x) for x in df['tdr_factor'].tolist()]
        }
        
        return response
    except Exception as e:
        return {'status': 'error', 'message': str(e)}, 500
    