Spaces:

CausalNLP
/

causal-agent

Running

App Files Files Community

causal-agent / auto_causal /methods /regression_discontinuity /estimator.py

FireShadow

Initial clean commit

1721aea 4 months ago

raw

history blame contribute delete

19.5 kB

	"""
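	Regression Discontinuity Design (RDD) Estimator.

	`estimate_effect` tries the evan-magnusson/rdd package first, falling back
	to a basic comparison of linear fits around the cutoff if that fails.
	A DoWhy-based estimator (`estimate_effect_dowhy`) is also defined here but
	is not called by `estimate_effect`.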
	"""

	import pandas as pd
	import statsmodels.api as sm
	from dowhy import CausalModel
	from typing import Dict, Any, List, Optional
	import logging
	from langchain.chat_models.base import BaseChatModel # For type hinting llm

	from .diagnostics import run_rdd_diagnostics
	from .llm_assist import interpret_rdd_results

	logger = logging.getLogger(__name__)

	# Optional dependency: the evan-magnusson/rdd package backs 'effect_estimate_rdd'.
	# `rdd` is pre-bound to None so that, when the import fails, the name still
	# exists (callers can test `rdd is None`) instead of being undefined.
	rdd = None
	_rdd_estimator_func_em = None
	_rdd_optimal_bw_func_em = None
	_rdd_em_import_error_message = ""
	try:
	    # Binds the `rdd` attribute of the `rdd` package. A preceding plain
	    # `import rdd` would be rebound by this line immediately, so it is omitted.
	    from rdd import rdd

	    # Record the two callables that the success message below refers to.
	    _rdd_estimator_func_em = rdd.rdd
	    _rdd_optimal_bw_func_em = rdd.optimal_bandwidth
	    logger.info("Successfully imported 'rdd' and 'optimal_bandwidth' from evan-magnusson/rdd package.")
	except ImportError as e:
	    _rdd_em_import_error_message = f"ImportError for evan-magnusson/rdd: {e}. This package is needed for 'effect_estimate_rdd'."
	    logger.warning(_rdd_em_import_error_message)
	except Exception as e:  # Any other failure while importing or resolving the callables
	    _rdd_em_import_error_message = f"An unexpected error occurred during import from evan-magnusson/rdd: {e}"
	    logger.warning(_rdd_em_import_error_message)

	def estimate_effect_dowhy(
	    df: pd.DataFrame,
	    treatment: str,
	    outcome: str,
	    running_variable: str,
	    cutoff_value: float,
	    covariates: Optional[List[str]],
	    **kwargs
	) -> Dict[str, Any]:
	    """Estimate the RDD effect through DoWhy's ``iv.regression_discontinuity`` method.

	    Args:
	        df: Data handed to ``CausalModel``.
	        treatment: Treatment column name.
	        outcome: Outcome column name.
	        running_variable: Column passed to DoWhy as ``rd_variable_name``.
	        cutoff_value: Threshold passed to DoWhy as ``rd_threshold_value``.
	        covariates: Only triggers a warning when non-empty; never forwarded.
	        **kwargs: ``bandwidth`` is read if present; other keys are ignored.

	    Returns:
	        Dict with ``effect_estimate``, ``p_value``, ``confidence_interval``,
	        ``standard_error`` and ``method_details``. ``p_value``,
	        ``confidence_interval`` and ``standard_error`` are looked up with
	        ``getattr`` and are ``None`` when the estimate object lacks them.
	    """
	    logger.info("Attempting RDD estimation using DoWhy.")
	    if covariates:
	        logger.warning("Covariates provided but may not be used by the DoWhy RDD method_name='rdd'. Support varies.")

	    # No graph or common causes are supplied; the running variable reaches
	    # DoWhy only through method_params further down.
	    causal_model = CausalModel(data=df, treatment=treatment, outcome=outcome)

	    # estimate_effect() takes the identified estimand as its first argument.
	    estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

	    # Use the caller's bandwidth, else 10% of the running variable's range.
	    bw = kwargs.get('bandwidth')
	    if bw is None:
	        spread = df[running_variable].max() - df[running_variable].min()
	        bw = 0.1 * spread
	        logger.warning(f"No bandwidth specified, using basic default: {bw:.3f}")

	    rd_params = {
	        'rd_variable_name': running_variable,
	        'rd_threshold_value': cutoff_value,
	        'rd_bandwidth': bw,
	    }
	    rd_estimate = causal_model.estimate_effect(
	        estimand,
	        method_name="iv.regression_discontinuity",
	        method_params=rd_params,
	        test_significance=True,
	    )

	    point_estimate = rd_estimate.value

	    # The p-value may be absent, or wrapped in a list/tuple.
	    significance = getattr(rd_estimate, 'test_significance_pvalue', None)
	    if isinstance(significance, (list, tuple)):
	        significance = significance[0]

	    interval = getattr(rd_estimate, 'confidence_interval', None)
	    stderr = getattr(rd_estimate, 'standard_error', None)

	    return {
	        'effect_estimate': point_estimate,
	        'p_value': significance,
	        'confidence_interval': interval,
	        'standard_error': stderr,
	        'method_details': f"DoWhy RDD (Bandwidth: {bw:.3f})",
	    }

	def estimate_effect_fallback(df: pd.DataFrame, treatment: str, outcome: str, running_variable: str, cutoff_value: float, covariates: Optional[List[str]], **kwargs) -> Dict[str, Any]:
	    """Estimate the RDD effect with a linear-interaction regression around the cutoff.

	    Rows whose running variable lies within ``bandwidth`` of ``cutoff_value``
	    are kept, and ``outcome`` is regressed by OLS (HC1 robust errors) on a
	    constant, an ``above_cutoff`` dummy, the centered running variable and
	    their interaction. The ``above_cutoff`` coefficient is reported as the
	    effect.

	    Args:
	        df: Input data containing ``outcome`` and ``running_variable``.
	        treatment: Accepted for signature parity; not used here.
	        outcome: Outcome column name.
	        running_variable: Running variable column name.
	        cutoff_value: Threshold on the running variable.
	        covariates: Ignored (a warning is logged when non-empty).
	        **kwargs: ``bandwidth`` is read if present; when missing, 10% of the
	            running variable's range is used.

	    Returns:
	        Dict with ``effect_estimate``, ``p_value``, ``confidence_interval``,
	        ``standard_error``, ``method_details``, ``formula`` and
	        ``model_summary`` (the object returned by ``results.summary()``).

	    Raises:
	        ValueError: If no rows fall within the bandwidth, required columns
	            are missing, no rows remain after dropping NaNs, or the remaining
	            rows all lie on one side of the cutoff.
	    """
	    logger.warning("DoWhy RDD failed or not used. Falling back to simple linear regression comparison.")
	    if covariates:
	        logger.warning("Covariates provided but are ignored in the fallback RDD linear regression estimation.")

	    bandwidth = kwargs.get('bandwidth')
	    if bandwidth is None:
	        range_rv = df[running_variable].max() - df[running_variable].min()
	        bandwidth = 0.1 * range_rv
	        logger.warning(f"No bandwidth specified for fallback, using basic default: {bandwidth:.3f}")

	    # Keep only observations within the bandwidth window around the cutoff.
	    in_window = (df[running_variable] >= cutoff_value - bandwidth) & (df[running_variable] <= cutoff_value + bandwidth)
	    df_bw = df[in_window].copy()
	    if df_bw.empty:
	        raise ValueError("No data within the specified bandwidth.")

	    df_bw['above_cutoff'] = (df_bw[running_variable] >= cutoff_value).astype(int)

	    # The interaction term allows different slopes above and below the cutoff.
	    df_bw['running_centered'] = df_bw[running_variable] - cutoff_value
	    df_bw['running_x_above'] = df_bw['running_centered'] * df_bw['above_cutoff']
	    predictors = ['above_cutoff', 'running_centered', 'running_x_above']

	    required_cols = [outcome] + predictors
	    missing_cols = [col for col in required_cols if col not in df_bw.columns]
	    if missing_cols:
	        raise ValueError(f"Fallback RDD missing columns: {missing_cols}")

	    df_analysis = df_bw[required_cols].dropna()
	    if df_analysis.empty:
	        raise ValueError("No data remaining after dropping NaNs for fallback RDD.")

	    # A discontinuity cannot be estimated from one side only: with a constant
	    # 'above_cutoff' column its coefficient would be an intercept, not a jump.
	    n_above = int(df_analysis['above_cutoff'].sum())
	    n_below = len(df_analysis) - n_above
	    if n_above == 0 or n_below == 0:
	        raise ValueError(
	            "Fallback RDD needs observations on both sides of the cutoff within the bandwidth "
	            f"(below: {n_below}, above: {n_above})."
	        )

	    # has_constant='add' always appends an explicit 'const' column, even if a
	    # predictor happens to be constant in this window.
	    X = sm.add_constant(df_analysis[predictors], has_constant='add')
	    y = df_analysis[outcome]

	    formula = f"{outcome} ~ {' + '.join(predictors)} + const"
	    logger.info(f"Running fallback RDD regression: {formula}")

	    # Robust (HC1) standard errors.
	    results = sm.OLS(y, X).fit(cov_type='HC1')

	    # The coefficient on 'above_cutoff' is the estimated jump at the cutoff.
	    effect = results.params['above_cutoff']
	    p_value = results.pvalues['above_cutoff']
	    conf_int = results.conf_int().loc['above_cutoff'].tolist()
	    std_err = results.bse['above_cutoff']

	    return {
	        'effect_estimate': effect,
	        'p_value': p_value,
	        'confidence_interval': conf_int,
	        'standard_error': std_err,
	        'method_details': f"Fallback Linear Interaction (Bandwidth: {bandwidth:.3f})",
	        'formula': formula,
	        'model_summary': results.summary()
	    }

	def _select_rdd_bandwidth(
	    df: pd.DataFrame,
	    outcome: str,
	    running_variable: str,
	    cutoff_value: float,
	    bandwidth: Optional[float],
	) -> tuple:
	    """Pick the bandwidth for effect_estimate_rdd; return (bandwidth, method label)."""
	    if bandwidth is not None and bandwidth > 0:
	        logger.info(f"Using user-specified bandwidth: {bandwidth:.4f}")
	        return bandwidth, "user-specified"

	    final_bandwidth = None
	    bandwidth_selection_method = "unknown"

	    if bandwidth is not None and bandwidth <= 0:
	        logger.warning(f"User-specified bandwidth {bandwidth} is not positive. Attempting IK optimal bandwidth selection.")
	    try:
	        logger.info(f"Attempting IK optimal bandwidth selection using _rdd_optimal_bw_func_em for {outcome} ~ {running_variable} cut at {cutoff_value}.")
	        optimal_bw_val = rdd.optimal_bandwidth(df[outcome], df[running_variable], cut=cutoff_value)
	        if optimal_bw_val is not None and optimal_bw_val > 0:
	            final_bandwidth = optimal_bw_val
	            bandwidth_selection_method = "ik_optimal (evan-magnusson/rdd)"
	            logger.info(f"IK optimal bandwidth from evan-magnusson/rdd: {final_bandwidth:.4f}")
	        else:
	            logger.warning(f"IK optimal bandwidth from evan-magnusson/rdd was None or non-positive: {optimal_bw_val}. Falling back to default.")
	    except Exception as e:
	        # Best effort: any failure here (including the package being
	        # unavailable) just moves on to the range-based default below.
	        logger.warning(f"IK optimal bandwidth selection from evan-magnusson/rdd failed: {e}. Falling back to default.")

	    if final_bandwidth is None:
	        logger.info("Falling back to default bandwidth (10% of running variable range).")
	        rv_range = df[running_variable].max() - df[running_variable].min()
	        if rv_range > 0:
	            final_bandwidth = 0.1 * rv_range
	            bandwidth_selection_method = "default_10_percent_range"
	            logger.info(f"Using default 10% range bandwidth: {final_bandwidth:.4f}")
	        else:
	            err_msg = "Running variable range is not positive. Cannot determine a default bandwidth for evan-magnusson/rdd."
	            logger.error(err_msg)
	            raise ValueError(err_msg)

	    return final_bandwidth, bandwidth_selection_method


	def effect_estimate_rdd(
	    df: pd.DataFrame,
	    outcome: str,
	    running_variable: str,
	    cutoff_value: float,
	    treatment: Optional[str] = None,  # Kept for API consistency; unused here
	    covariates: Optional[List[str]] = None,
	    bandwidth: Optional[float] = None,
	    **kwargs
	) -> Dict[str, Any]:
	    """Estimate the RDD effect using the 'evan-magnusson/rdd' package.

	    Bandwidth is chosen in this order: a positive user-supplied ``bandwidth``,
	    then ``rdd.optimal_bandwidth``, then 10% of the running variable's range.
	    The data is truncated to that bandwidth with ``rdd.truncated_data``, the
	    model returned by ``rdd.rdd`` is fitted, and the statistics for the
	    ``TREATED`` term are reported.

	    Args:
	        df: Input data containing ``outcome`` and ``running_variable``.
	        outcome: Outcome column name.
	        running_variable: Running variable column name.
	        cutoff_value: Threshold on the running variable.
	        treatment: Not used for estimation; only logged when given.
	        covariates: Ignored (a warning is logged when non-empty).
	        bandwidth: Optional bandwidth; non-positive values are treated as unset.
	        **kwargs: Accepted and ignored.

	    Returns:
	        Dict with ``effect_estimate``, ``standard_error``, ``p_value``,
	        ``confidence_interval``, ``method_details``, ``bandwidth_used``,
	        ``bandwidth_selection_method``, ``n_obs_in_bandwidth``, ``formula``
	        and ``model_summary`` (text). The ``TREATED`` statistics are ``None``
	        when the fitted model has no such term.

	    Raises:
	        ValueError: If no positive bandwidth can be determined.
	        Exception: Any error raised during truncation or fitting is logged
	            and re-raised unchanged.
	    """
	    logger.info(f"Attempting RDD estimation using 'evan-magnusson/rdd' for outcome '{outcome}' and running variable '{running_variable}'.")

	    if treatment:
	        logger.info(f"Treatment variable '{treatment}' provided but is not explicitly used by the evan-magnusson/rdd estimation function.")
	    if covariates:
	        logger.warning("Covariates provided but are ignored by this 'evan-magnusson/rdd' implementation.")

	    # --- Bandwidth Selection ---
	    final_bandwidth, bandwidth_selection_method = _select_rdd_bandwidth(
	        df, outcome, running_variable, cutoff_value, bandwidth
	    )
	    if final_bandwidth is None or final_bandwidth <= 0:
	        raise ValueError(f"Could not determine a valid positive bandwidth for evan-magnusson/rdd. Last method: {bandwidth_selection_method}")

	    # --- RDD Estimation ---
	    try:
	        logger.info(f"Running RDD estimation with evan-magnusson/rdd: y='{outcome}', x='{running_variable}', cut={cutoff_value}, bw={final_bandwidth:.4f}")
	        data_rdd = rdd.truncated_data(df, running_variable, final_bandwidth, cut=cutoff_value)
	        model = rdd.rdd(
	            data_rdd,
	            xname=running_variable,
	            yname=outcome,
	            cut=cutoff_value
	        )

	        sm_results = model.fit()
	        summary_text = sm_results.summary().as_text()
	        # Library code should not write to stdout; the summary goes to the
	        # debug log and is also returned under 'model_summary'.
	        logger.debug("evan-magnusson/rdd regression summary:\n%s", summary_text)

	        # The treatment dummy is looked up under the name 'TREATED'.
	        effect = sm_results.params.get('TREATED')
	        std_err = sm_results.bse.get('TREATED')
	        p_value = sm_results.pvalues.get('TREATED')

	        conf_int_frame = sm_results.conf_int()
	        conf_int = conf_int_frame.loc['TREATED'].tolist() if 'TREATED' in conf_int_frame.index else [None, None]

	        n_obs = model.nobs

	        # Descriptive only: the regression equation is built inside the package.
	        formula_desc = f"Local linear RDD: {outcome} ~ TREATED + {running_variable}_centered + TREATED*{running_variable}_centered (implicit, from evan-magnusson/rdd)"

	        return {
	            'effect_estimate': effect,
	            'standard_error': std_err,
	            'p_value': p_value,
	            'confidence_interval': conf_int,
	            'method_details': f"RDD (evan-magnusson/rdd package, Bandwidth: {final_bandwidth:.4f})",
	            'bandwidth_used': final_bandwidth,
	            'bandwidth_selection_method': bandwidth_selection_method,
	            'n_obs_in_bandwidth': int(n_obs) if n_obs is not None else None,
	            'formula': formula_desc,
	            'model_summary': summary_text
	        }

	    except Exception as e:
	        logger.error(f"RDD estimation using 'evan-magnusson/rdd' failed: {e}", exc_info=True)
	        # Bare raise keeps the original traceback intact for the caller.
	        raise

	def estimate_effect(
	    df: pd.DataFrame,
	    treatment: str,
	    outcome: str,
	    running_variable: str,
	    cutoff_value: float,
	    covariates: Optional[List[str]] = None,
	    bandwidth: Optional[float] = None,
	    query: Optional[str] = None,
	    llm: Optional["BaseChatModel"] = None,
	    **kwargs
	) -> Dict[str, Any]:
	    """Estimate the causal effect using Regression Discontinuity Design.

	    Tries ``effect_estimate_rdd`` (evan-magnusson/rdd package) first; if that
	    raises, falls back to ``estimate_effect_fallback``. Diagnostics and an
	    interpretation are then attached on a best-effort basis: a failure in
	    either is logged and recorded in the result instead of being raised.

	    Args:
	        df: Input DataFrame.
	        treatment: Name of the treatment variable; forwarded to both estimators.
	        outcome: Name of the outcome variable.
	        running_variable: Name of the variable determining treatment assignment.
	        cutoff_value: The threshold value for the running variable.
	        covariates: Optional list of covariate names; forwarded to the
	            estimators and to the diagnostics.
	        bandwidth: Optional bandwidth around the cutoff; forwarded as given.
	        query: Optional user query; not used in this function.
	        llm: Optional language model instance passed to the interpretation step.
	        **kwargs: Additional keyword arguments forwarded to both estimators.

	    Returns:
	        The estimator's result dict extended with ``method_used``,
	        ``diagnostics``, ``interpretation`` and, when the fallback was used
	        after a primary failure, ``primary_rdd_em_error_info``.

	    Raises:
	        ValueError: If ``running_variable`` or ``cutoff_value`` is None, or if
	            both estimators fail (chained to the fallback's error).
	    """
	    if running_variable is None or cutoff_value is None:
	        raise ValueError("Missing required RDD arguments: running_variable and cutoff must be provided.")

	    results: Dict[str, Any] = {}
	    rdd_em_estimation_error = None  # Error from effect_estimate_rdd (evan-magnusson)

	    # --- Try effect_estimate_rdd (evan-magnusson/rdd) first ---
	    try:
	        logger.info("Attempting RDD estimation using 'effect_estimate_rdd' (evan-magnusson/rdd package).")
	        results = effect_estimate_rdd(
	            df,
	            outcome,
	            running_variable,
	            cutoff_value,
	            treatment=treatment,
	            covariates=covariates,
	            bandwidth=bandwidth,
	            **kwargs
	        )
	        results['method_used'] = 'evan-magnusson/rdd'
	        logger.info("Successfully estimated effect using 'effect_estimate_rdd'.")
	    except ImportError as ie:
	        logger.warning(f"'effect_estimate_rdd' could not run due to ImportError (likely evan-magnusson/rdd package not available/functional): {ie}")
	        rdd_em_estimation_error = ie
	    except Exception as e:
	        # Deliberately broad: any primary failure should trigger the fallback.
	        logger.warning(f"'effect_estimate_rdd' failed during execution: {e}")
	        rdd_em_estimation_error = e

	    # --- Fall back to estimate_effect_fallback if the primary produced nothing ---
	    if not results:
	        logger.info("'effect_estimate_rdd' did not produce results. Attempting fallback using 'estimate_effect_fallback'.")
	        try:
	            results = dict(estimate_effect_fallback(df, treatment, outcome, running_variable, cutoff_value, covariates, bandwidth=bandwidth, **kwargs))
	            results['method_used'] = 'Fallback RDD (Linear Interaction with Robust Errors)'
	            logger.info("Successfully estimated effect using 'estimate_effect_fallback'.")
	        except Exception as e:
	            # The fallback is the last resort, so its failure is final. Chain
	            # the cause so the original traceback stays available.
	            logger.error(f"Fallback RDD estimation ('estimate_effect_fallback') also failed: {e}")
	            logger.error(f"All RDD estimation attempts failed. Last error (from fallback): {e}")
	            raise ValueError(f"RDD estimation failed. Last error: {e}") from e

	    # --- Diagnostics (best effort) ---
	    try:
	        # NOTE(review): receives the caller's bandwidth (possibly None), not
	        # the bandwidth the estimator ended up using - confirm this is intended.
	        diag_results = run_rdd_diagnostics(df, outcome, running_variable, cutoff_value, covariates, bandwidth)
	        results['diagnostics'] = diag_results
	    except Exception as diag_e:
	        logger.error(f"RDD Diagnostics failed: {diag_e}")
	        results['diagnostics'] = {"status": "Failed", "error": str(diag_e)}

	    # --- Interpretation (best effort) ---
	    try:
	        interpretation = interpret_rdd_results(results, results.get('diagnostics'), llm=llm)
	        results['interpretation'] = interpretation
	    except Exception as interp_e:
	        logger.error(f"RDD Interpretation failed: {interp_e}")
	        results['interpretation'] = "Interpretation failed."

	    # Record why the primary estimator was skipped when the fallback was used.
	    if rdd_em_estimation_error is not None and results.get('method_used', '').startswith('Fallback'):
	        results['primary_rdd_em_error_info'] = str(rdd_em_estimation_error)

	    return results