abhaypratapsingh111 committed on
Commit
6b9e3e8
·
verified ·
1 Parent(s): fc813d5

Upload folder using huggingface_hub

Browse files
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (161 Bytes). View file
 
utils/__pycache__/metrics.cpython-311.pyc ADDED
Binary file (8.67 kB). View file
 
utils/__pycache__/validators.cpython-311.pyc ADDED
Binary file (7.47 kB). View file
 
utils/metrics.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Forecast evaluation metrics
3
+ """
4
+
5
+ import logging
6
+ from typing import Dict, Any
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def calculate_metrics(
    actual: pd.Series,
    forecast: pd.Series,
    include_percentage: bool = True
) -> Dict[str, float]:
    """
    Compute forecast accuracy metrics for a pair of series.

    Args:
        actual: Observed values
        forecast: Predicted values
        include_percentage: Also compute MAPE/sMAPE when True

    Returns:
        Dictionary mapping metric names to float values, or
        {'error': <message>} when no valid points exist or a
        computation fails.
    """
    try:
        # Align both series on the shorter length, then drop NaNs pairwise.
        n = min(len(actual), len(forecast))
        y_true = actual.iloc[:n].values
        y_pred = forecast.iloc[:n].values

        valid = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true, y_pred = y_true[valid], y_pred[valid]

        if y_true.size == 0:
            return {'error': 'No valid values for metric calculation'}

        errors = y_pred - y_true
        metrics: Dict[str, float] = {
            # Mean Absolute Error
            'MAE': float(np.abs(errors).mean()),
            # Root Mean Squared Error
            'RMSE': float(np.sqrt((errors ** 2).mean())),
            # Mean Error (positive means over-forecasting bias)
            'ME': float(errors.mean()),
        }

        if include_percentage:
            # MAPE: only over points with nonzero actuals, to avoid
            # division by zero.
            nonzero = y_true != 0
            if nonzero.any():
                ape = np.abs(errors[nonzero] / y_true[nonzero])
                metrics['MAPE'] = float(ape.mean() * 100)

            # sMAPE: symmetric variant; skip points where both values
            # are zero (denominator would vanish).
            half_sum = (np.abs(y_true) + np.abs(y_pred)) / 2
            defined = half_sum != 0
            if defined.any():
                srel = np.abs(errors[defined]) / half_sum[defined]
                metrics['sMAPE'] = float(srel.mean() * 100)

        # R-squared is only defined when the actuals are not constant.
        ss_res = np.sum(errors ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        if ss_tot != 0:
            metrics['R2'] = float(1 - (ss_res / ss_tot))

        return metrics

    except Exception as e:
        logger.error(f"Error calculating metrics: {str(e)}", exc_info=True)
        return {'error': str(e)}
80
+
81
+
82
def calculate_coverage(
    actual: pd.Series,
    lower_bound: pd.Series,
    upper_bound: pd.Series
) -> float:
    """
    Percentage of actual values falling inside [lower_bound, upper_bound].

    Args:
        actual: Observed values
        lower_bound: Lower limits of the prediction interval
        upper_bound: Upper limits of the prediction interval

    Returns:
        Coverage as a percentage in [0, 100]; 0.0 on error.
    """
    try:
        # Truncate all three series to their common length.
        n = min(len(actual), len(lower_bound), len(upper_bound))
        obs = actual.iloc[:n].values
        lo = lower_bound.iloc[:n].values
        hi = upper_bound.iloc[:n].values

        # Fraction of observations inside the (inclusive) bounds.
        hits = (obs >= lo) & (obs <= hi)
        return float(np.mean(hits) * 100)

    except Exception as e:
        logger.error(f"Error calculating coverage: {str(e)}", exc_info=True)
        return 0.0
114
+
115
+
116
def calculate_interval_width(
    lower_bound: pd.Series,
    upper_bound: pd.Series
) -> Dict[str, float]:
    """
    Summary statistics of prediction-interval widths.

    Args:
        lower_bound: Lower limits of the prediction interval
        upper_bound: Upper limits of the prediction interval

    Returns:
        Dict with mean/median/min/max/std of the widths; {} on error.
    """
    try:
        spans = upper_bound - lower_bound

        # Collect the raw pandas statistics, then coerce everything to
        # plain floats in one pass.
        stats = {
            'mean_width': spans.mean(),
            'median_width': spans.median(),
            'min_width': spans.min(),
            'max_width': spans.max(),
            'std_width': spans.std(),
        }
        return {name: float(value) for name, value in stats.items()}

    except Exception as e:
        logger.error(f"Error calculating interval width: {str(e)}", exc_info=True)
        return {}
144
+
145
+
146
def format_metric(value: float, metric_name: str) -> str:
    """
    Format a metric value for display.

    Percentage metrics (MAPE, sMAPE) get a '%' suffix; scale-dependent
    error metrics (MAE, RMSE, ME) use thousands separators for large
    magnitudes; everything else — including the unitless R2 — is shown
    to 4 decimal places.

    Args:
        value: Metric value
        metric_name: Name of the metric

    Returns:
        Formatted string
    """
    if metric_name in ('MAPE', 'sMAPE'):
        return f"{value:.2f}%"
    if metric_name in ('MAE', 'RMSE', 'ME'):
        # Thousands separator keeps large error values readable.
        if abs(value) >= 1000:
            return f"{value:,.2f}"
        return f"{value:.4f}"
    # BUG FIX: R2 is a unitless ratio (at most 1), not a percentage, so
    # it must not carry a '%' suffix; it now falls through to the
    # default 4-decimal format.
    return f"{value:.4f}"
166
+
167
+
168
def summarize_forecast_quality(
    forecast_df: pd.DataFrame,
    confidence_levels: list
) -> Dict[str, Any]:
    """
    Build a summary of a forecast: horizon, point-forecast range, and
    mean interval width per confidence level.

    Args:
        forecast_df: DataFrame with a 'forecast' column plus optional
            'lower_<cl>' / 'upper_<cl>' interval columns
        confidence_levels: Confidence levels (e.g. [80, 95]) to inspect

    Returns:
        Summary dictionary; {} on error.
    """
    try:
        points = forecast_df['forecast']
        summary: Dict[str, Any] = {
            'horizon': len(forecast_df),
            'forecast_range': {
                'min': float(points.min()),
                'max': float(points.max()),
                'mean': float(points.mean()),
            },
        }

        # Mean interval width per confidence level, for levels whose
        # bound columns are both present in the frame.
        widths: Dict[str, float] = {}
        for level in confidence_levels:
            lo_col, hi_col = f'lower_{level}', f'upper_{level}'
            if lo_col in forecast_df.columns and hi_col in forecast_df.columns:
                mean_width = (forecast_df[hi_col] - forecast_df[lo_col]).mean()
                widths[f'{level}%'] = float(mean_width)
        summary['interval_widths'] = widths

        return summary

    except Exception as e:
        logger.error(f"Error summarizing forecast: {str(e)}", exc_info=True)
        return {}
utils/validators.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Input validation utilities
3
+ """
4
+
5
import logging
from typing import Dict, List, Optional, Any

import numpy as np
import pandas as pd

from config.constants import MAX_FILE_SIZE, ALLOWED_EXTENSIONS
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def validate_file_upload(filename: str, filesize: int) -> Dict[str, Any]:
    """
    Validate an uploaded file's extension and size.

    Args:
        filename: Name of the uploaded file
        filesize: Size of the file in bytes

    Returns:
        {'valid': True} or {'valid': False, 'issues': [...]}
    """
    problems = []

    # Extension check: everything after the last dot, lower-cased.
    # Files without a dot get an empty extension and fail the check.
    ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
    if ext not in ALLOWED_EXTENSIONS:
        problems.append(
            f"Invalid file type '{ext}'. Allowed: {', '.join(ALLOWED_EXTENSIONS)}"
        )

    # Size checks: enforce the configured ceiling and reject empty files.
    if filesize > MAX_FILE_SIZE:
        limit_mb = MAX_FILE_SIZE / (1024 * 1024)
        size_mb = filesize / (1024 * 1024)
        problems.append(f"File too large ({size_mb:.1f}MB). Maximum: {limit_mb:.0f}MB")

    if filesize == 0:
        problems.append("File is empty")

    return {'valid': False, 'issues': problems} if problems else {'valid': True}
44
+
45
+
46
def validate_column_selection(
    data: pd.DataFrame,
    date_column: Optional[str],
    target_column: Optional[str]
) -> Dict[str, Any]:
    """
    Check that the chosen date and target columns exist and differ.

    Args:
        data: DataFrame to validate against
        date_column: Selected date column (or None)
        target_column: Selected target column (or None)

    Returns:
        {'valid': True} or {'valid': False, 'issues': [...]}
    """
    problems = []

    # Each role must be selected and must name an existing column.
    for label, column in (('date', date_column), ('target', target_column)):
        if column is None:
            problems.append(f"Please select a {label} column")
        elif column not in data.columns:
            problems.append(f"{label.capitalize()} column '{column}' not found in data")

    # The same column cannot serve both roles.
    if date_column and target_column and date_column == target_column:
        problems.append("Date and target columns must be different")

    return {'valid': False, 'issues': problems} if problems else {'valid': True}
82
+
83
+
84
def validate_forecast_parameters(
    horizon: int,
    confidence_levels: List[int],
    data_length: int
) -> Dict[str, Any]:
    """
    Validate the forecast horizon and confidence levels against the data.

    Args:
        horizon: Number of periods to forecast
        confidence_levels: Confidence levels in percent, each strictly
            between 0 and 100
        data_length: Number of historical observations available

    Returns:
        {'valid': True} (optionally with a 'warnings' list) or
        {'valid': False, 'issues': [...], 'warnings': [...]}
    """
    problems: List[str] = []
    cautions: List[str] = []

    # Horizon must be positive; very long horizons degrade reliability.
    if horizon <= 0:
        problems.append("Forecast horizon must be positive")
    elif horizon > 365:
        cautions.append("Very long forecast horizon (>365 days) may be unreliable")

    # Rule of thumb: history should be at least twice the horizon.
    if data_length < horizon * 2:
        cautions.append(
            f"Limited historical data ({data_length} points) for {horizon}-period forecast. "
            "Recommend at least 2x horizon length."
        )

    # Need at least one level, each strictly inside (0, 100).
    if not confidence_levels:
        problems.append("Please select at least one confidence level")

    problems.extend(
        f"Invalid confidence level: {cl}%. Must be between 0 and 100."
        for cl in confidence_levels
        if not 0 < cl < 100
    )

    if problems:
        return {'valid': False, 'issues': problems, 'warnings': cautions}
    if cautions:
        return {'valid': True, 'warnings': cautions}
    return {'valid': True}
131
+
132
+
133
def sanitize_input(text: str, max_length: int = 1000) -> str:
    """
    Sanitize free-text input: drop control characters, cap the length,
    and trim surrounding whitespace.

    Args:
        text: Input text (None is treated as empty)
        max_length: Maximum allowed length before trimming

    Returns:
        Sanitized text
    """
    if text is None:
        return ""

    # Keep printable characters plus the common whitespace controls.
    kept = [ch for ch in text if ord(ch) >= 32 or ch in '\n\r\t']

    # Truncate to the cap before the final strip.
    return ''.join(kept)[:max_length].strip()
155
+
156
+
157
def validate_data_quality(data: pd.DataFrame, target_column: str) -> Dict[str, Any]:
    """
    Run basic quality checks on the target column before forecasting.

    Args:
        data: Input DataFrame
        target_column: Name of the target column

    Returns:
        {'valid': True} (optionally with a 'warnings' list) or
        {'valid': False, 'issues': [...]} (plus any warnings collected)
    """
    problems: List[str] = []
    cautions: List[str] = []

    series = data[target_column]

    # A column with no observed values cannot be forecast at all.
    if series.isna().all():
        return {'valid': False, 'issues': ["Target column contains only missing values"]}

    # A constant series makes any forecast trivial.
    if series.nunique() == 1:
        cautions.append("Target column has constant values - forecast may be trivial")

    # Infinite values break most models outright.
    inf_total = np.isinf(series).sum()
    if inf_total > 0:
        problems.append(f"Target column contains {inf_total} infinite values")

    # Extreme variance suggests the data should be scaled first.
    if series.std() > 1e6:
        cautions.append("Target column has very high variance - consider scaling")

    # Mostly-zero data (e.g. intermittent demand) is worth flagging.
    zero_share = (series == 0).sum() / len(data) * 100
    if zero_share > 50:
        cautions.append(f"{zero_share:.1f}% of values are zero")

    if problems:
        return {'valid': False, 'issues': problems, 'warnings': cautions}
    if cautions:
        return {'valid': True, 'warnings': cautions}
    return {'valid': True}
201
+
202
+
203
+ import numpy as np