Upload folder using huggingface_hub
Browse files- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-311.pyc +0 -0
- utils/__pycache__/metrics.cpython-311.pyc +0 -0
- utils/__pycache__/validators.cpython-311.pyc +0 -0
- utils/metrics.py +208 -0
- utils/validators.py +203 -0
utils/__init__.py
ADDED
|
File without changes
|
utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (161 Bytes). View file
|
|
|
utils/__pycache__/metrics.cpython-311.pyc
ADDED
|
Binary file (8.67 kB). View file
|
|
|
utils/__pycache__/validators.cpython-311.pyc
ADDED
|
Binary file (7.47 kB). View file
|
|
|
utils/metrics.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Forecast evaluation metrics
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def calculate_metrics(
    actual: pd.Series,
    forecast: pd.Series,
    include_percentage: bool = True
) -> Dict[str, float]:
    """
    Calculate forecast accuracy metrics.

    Accepts pandas Series or any 1-D array-like; inputs are truncated to a
    common length and positions where either side is NaN are dropped before
    computing metrics.

    Args:
        actual: Actual values
        forecast: Forecasted values
        include_percentage: Include percentage-based metrics (MAPE, sMAPE)

    Returns:
        Dictionary of metrics (MAE, RMSE, ME, and — where defined — MAPE,
        sMAPE, R2), or {'error': message} when no valid values remain or
        the computation fails
    """
    try:
        # Coerce to float arrays so both Series and plain sequences work
        actual = np.asarray(actual, dtype=float)
        forecast = np.asarray(forecast, dtype=float)

        # Align to the shorter input
        min_len = min(len(actual), len(forecast))
        actual = actual[:min_len]
        forecast = forecast[:min_len]

        # Drop positions where either side is NaN
        mask = ~(np.isnan(actual) | np.isnan(forecast))
        actual = actual[mask]
        forecast = forecast[mask]

        if len(actual) == 0:
            return {'error': 'No valid values for metric calculation'}

        metrics = {}

        # Mean Absolute Error
        metrics['MAE'] = float(np.mean(np.abs(actual - forecast)))

        # Root Mean Squared Error
        metrics['RMSE'] = float(np.sqrt(np.mean((actual - forecast) ** 2)))

        # Mean Error (bias): positive means the forecast over-predicts
        metrics['ME'] = float(np.mean(forecast - actual))

        if include_percentage:
            # Mean Absolute Percentage Error; zero actuals are excluded
            # to avoid division by zero
            mask_nonzero = actual != 0
            if mask_nonzero.any():
                mape = np.mean(np.abs((actual[mask_nonzero] - forecast[mask_nonzero]) / actual[mask_nonzero])) * 100
                metrics['MAPE'] = float(mape)

            # Symmetric MAPE; defined wherever actual and forecast are not
            # both zero
            denominator = (np.abs(actual) + np.abs(forecast)) / 2
            mask_nonzero = denominator != 0
            if mask_nonzero.any():
                smape = np.mean(np.abs(actual[mask_nonzero] - forecast[mask_nonzero]) / denominator[mask_nonzero]) * 100
                metrics['sMAPE'] = float(smape)

        # R-squared (coefficient of determination); skipped for a constant
        # actual series, where it is undefined
        ss_res = np.sum((actual - forecast) ** 2)
        ss_tot = np.sum((actual - np.mean(actual)) ** 2)
        if ss_tot != 0:
            metrics['R2'] = float(1 - (ss_res / ss_tot))

        return metrics

    except Exception as e:
        logger.error(f"Error calculating metrics: {str(e)}", exc_info=True)
        return {'error': str(e)}
def calculate_coverage(
    actual: pd.Series,
    lower_bound: pd.Series,
    upper_bound: pd.Series
) -> float:
    """
    Calculate coverage of prediction intervals.

    Args:
        actual: Actual values
        lower_bound: Lower bound of prediction interval
        upper_bound: Upper bound of prediction interval

    Returns:
        Coverage percentage (0-100); 0.0 if the calculation fails
    """
    try:
        # Truncate all three series to their common length
        n = min(len(actual), len(lower_bound), len(upper_bound))
        y = actual.iloc[:n].values
        lo = lower_bound.iloc[:n].values
        hi = upper_bound.iloc[:n].values

        # Fraction of actuals landing inside [lo, hi], as a percentage
        hits = (y >= lo) & (y <= hi)
        return float(np.mean(hits) * 100)

    except Exception as e:
        logger.error(f"Error calculating coverage: {str(e)}", exc_info=True)
        return 0.0
def calculate_interval_width(
    lower_bound: pd.Series,
    upper_bound: pd.Series
) -> Dict[str, float]:
    """
    Calculate statistics about prediction interval width.

    Args:
        lower_bound: Lower bound of prediction interval
        upper_bound: Upper bound of prediction interval

    Returns:
        Dictionary with width statistics (mean/median/min/max/std),
        or an empty dict if the calculation fails
    """
    try:
        widths = upper_bound - lower_bound

        # One pandas reduction per summary statistic
        stats = {
            'mean_width': widths.mean(),
            'median_width': widths.median(),
            'min_width': widths.min(),
            'max_width': widths.max(),
            'std_width': widths.std(),
        }
        return {name: float(value) for name, value in stats.items()}

    except Exception as e:
        logger.error(f"Error calculating interval width: {str(e)}", exc_info=True)
        return {}
def format_metric(value: float, metric_name: str) -> str:
    """
    Format metric value for display.

    Args:
        value: Metric value
        metric_name: Name of the metric

    Returns:
        Formatted string
    """
    if metric_name in ['MAPE', 'sMAPE']:
        # Percentage-based metrics get a percent sign
        return f"{value:.2f}%"
    elif metric_name in ['MAE', 'RMSE', 'ME']:
        # Scale-dependent metrics: thousands separator for large magnitudes
        if abs(value) >= 1000:
            return f"{value:,.2f}"
        else:
            return f"{value:.4f}"
    else:
        # R2 and any other dimensionless metric: plain fixed-point.
        # (R2 is a ratio in (-inf, 1], not a percentage — the previous code
        # wrongly appended '%' to it.)
        return f"{value:.4f}"
def summarize_forecast_quality(
    forecast_df: pd.DataFrame,
    confidence_levels: list
) -> Dict[str, Any]:
    """
    Summarize the quality of a forecast.

    Args:
        forecast_df: DataFrame with forecast results
        confidence_levels: List of confidence levels

    Returns:
        Summary dictionary (horizon, forecast range, interval widths),
        or an empty dict if summarization fails
    """
    try:
        point = forecast_df['forecast']
        summary = {
            'horizon': len(forecast_df),
            'forecast_range': {
                'min': float(point.min()),
                'max': float(point.max()),
                'mean': float(point.mean()),
            },
        }

        # Mean interval width for each confidence level whose bound
        # columns are both present in the frame
        widths = {}
        for level in confidence_levels:
            lower, upper = f'lower_{level}', f'upper_{level}'
            if lower in forecast_df.columns and upper in forecast_df.columns:
                widths[f'{level}%'] = float((forecast_df[upper] - forecast_df[lower]).mean())
        summary['interval_widths'] = widths

        return summary

    except Exception as e:
        logger.error(f"Error summarizing forecast: {str(e)}", exc_info=True)
        return {}
|
utils/validators.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Input validation utilities
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Dict, List, Optional, Any
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from config.constants import MAX_FILE_SIZE, ALLOWED_EXTENSIONS
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def validate_file_upload(filename: str, filesize: int) -> Dict[str, Any]:
    """
    Validate uploaded file.

    Args:
        filename: Name of the uploaded file
        filesize: Size of the file in bytes

    Returns:
        Validation result dictionary: {'valid': True} or
        {'valid': False, 'issues': [...]}
    """
    issues = []

    # Extension is everything after the final dot, lowercased; a name
    # with no dot yields '' and fails the whitelist check below
    extension = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
    if extension not in ALLOWED_EXTENSIONS:
        issues.append(f"Invalid file type '{extension}'. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")

    # Enforce the configured size ceiling
    if filesize > MAX_FILE_SIZE:
        max_mb = MAX_FILE_SIZE / (1024 * 1024)
        actual_mb = filesize / (1024 * 1024)
        issues.append(f"File too large ({actual_mb:.1f}MB). Maximum: {max_mb:.0f}MB")

    if filesize == 0:
        issues.append("File is empty")

    return {'valid': False, 'issues': issues} if issues else {'valid': True}
def validate_column_selection(
    data: pd.DataFrame,
    date_column: Optional[str],
    target_column: Optional[str]
) -> Dict[str, Any]:
    """
    Validate column selection.

    Args:
        data: DataFrame to validate
        date_column: Selected date column
        target_column: Selected target column

    Returns:
        Validation result dictionary: {'valid': True} or
        {'valid': False, 'issues': [...]}
    """
    issues = []

    # Each selection must be made and must exist in the frame
    for column, label in ((date_column, 'Date'), (target_column, 'Target')):
        if column is None:
            issues.append(f"Please select a {label.lower()} column")
        elif column not in data.columns:
            issues.append(f"{label} column '{column}' not found in data")

    # The two selections must not point at the same column
    if date_column and target_column and date_column == target_column:
        issues.append("Date and target columns must be different")

    return {'valid': False, 'issues': issues} if issues else {'valid': True}
def validate_forecast_parameters(
    horizon: int,
    confidence_levels: List[int],
    data_length: int
) -> Dict[str, Any]:
    """
    Validate forecast parameters.

    Args:
        horizon: Forecast horizon
        confidence_levels: List of confidence levels
        data_length: Length of the input data

    Returns:
        Validation result dictionary with optional 'issues'/'warnings' lists
    """
    issues: List[str] = []
    warnings: List[str] = []

    # Horizon must be positive; a very long horizon only warns
    if horizon <= 0:
        issues.append("Forecast horizon must be positive")
    elif horizon > 365:
        warnings.append("Very long forecast horizon (>365 days) may be unreliable")

    # History should be at least twice the horizon
    if data_length < horizon * 2:
        warnings.append(
            f"Limited historical data ({data_length} points) for {horizon}-period forecast. "
            "Recommend at least 2x horizon length."
        )

    # Confidence levels: non-empty, each strictly inside (0, 100)
    if not confidence_levels:
        issues.append("Please select at least one confidence level")
    for cl in confidence_levels:
        if not 0 < cl < 100:
            issues.append(f"Invalid confidence level: {cl}%. Must be between 0 and 100.")

    if issues:
        return {'valid': False, 'issues': issues, 'warnings': warnings}
    if warnings:
        return {'valid': True, 'warnings': warnings}
    return {'valid': True}
def sanitize_input(text: str, max_length: int = 1000) -> str:
    """
    Sanitize text input.

    Args:
        text: Input text
        max_length: Maximum allowed length

    Returns:
        Sanitized text
    """
    if text is None:
        return ""

    # Drop control characters, keeping printable text plus tab/newline/CR
    kept = [ch for ch in text if ch in '\n\r\t' or ord(ch) >= 32]
    cleaned = ''.join(kept)

    # Truncate before stripping so the limit applies to the raw content
    return cleaned[:max_length].strip()
def validate_data_quality(data: pd.DataFrame, target_column: str) -> Dict[str, Any]:
    """
    Validate data quality for forecasting.

    Args:
        data: Input DataFrame
        target_column: Name of the target column

    Returns:
        Quality validation result with optional 'issues'/'warnings' lists
    """
    issues = []
    warnings = []

    target = data[target_column]

    # All-missing target (this also catches an empty frame) is unusable
    if target.isna().all():
        issues.append("Target column contains only missing values")
        return {'valid': False, 'issues': issues}

    # A constant series produces a trivial (flat) forecast
    if target.nunique() == 1:
        warnings.append("Target column has constant values - forecast may be trivial")

    # Infinite values break most models. Compared against float('inf') so
    # this works without numpy — the original relied on an `import numpy as np`
    # placed at the very bottom of the module, which breaks if the file is
    # ever reorganized.
    inf_count = int((target.abs() == float('inf')).sum())
    if inf_count > 0:
        issues.append(f"Target column contains {inf_count} infinite values")

    # Very large spread suggests the series should be scaled first
    if target.std() > 1e6:
        warnings.append("Target column has very high variance - consider scaling")

    # A mostly-zero series is hard to forecast meaningfully
    zero_pct = (target == 0).sum() / len(data) * 100
    if zero_pct > 50:
        warnings.append(f"{zero_pct:.1f}% of values are zero")

    if issues:
        return {'valid': False, 'issues': issues, 'warnings': warnings}
    if warnings:
        return {'valid': True, 'warnings': warnings}
    return {'valid': True}
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
import numpy as np
|