| """ | |
| Method Executor Tool for the causal inference agent. | |
| Executes the selected causal inference method using its implementation function. | |
| """ | |
import json
import logging
from typing import Dict, Any, Optional

import pandas as pd

# Import the method mapping and preprocessing utilities
from auto_causal.methods import METHOD_MAPPING
from auto_causal.methods.utils import preprocess_data  # Assuming preprocess exists
from auto_causal.components.state_manager import create_workflow_state_update
from auto_causal.config import get_llm_client  # LLM client factory
# Import shared models from the central location
from auto_causal.models import (
    Variables,
    TemporalStructure,  # Needed indirectly by DatasetAnalysis
    DatasetInfo,  # Needed indirectly by DatasetAnalysis
    DatasetAnalysis,
    MethodExecutorInput,
)

# Optional path to a JSONL log file; the calling pipeline may set this before
# invoking the tool to capture per-run analysis summaries.
CURRENT_OUTPUT_LOG_FILE = None

logger = logging.getLogger(__name__)
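
# A caller might enable run logging like so (illustrative sketch; the exact
# import path for this module is an assumption):
#
#     from auto_causal.tools import method_executor_tool as met
#     met.CURRENT_OUTPUT_LOG_FILE = "runs/analysis_log.jsonl"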


def method_executor_tool(inputs: MethodExecutorInput, original_query: Optional[str] = None) -> Dict[str, Any]:
    """Execute the selected causal inference method function using structured input.

    Args:
        inputs: Pydantic model containing method, variables, dataset_path,
            dataset_analysis, and dataset_description.
        original_query: Optional natural-language query; falls back to
            inputs.original_query when not provided.

    Returns:
        Dict with numerical results, context for the next step, and workflow state.
    """
    # Access data from the input model
    method = inputs.method
    variables_dict = inputs.variables.model_dump()
    dataset_path = inputs.dataset_path
    dataset_analysis_dict = inputs.dataset_analysis.model_dump()
    dataset_description_str = inputs.dataset_description
    validation_info = inputs.validation_info  # Can be passed along if needed

    logger.info(f"Executing method: {method}")

    try:
        # --- Get LLM instance ---
        llm_instance = None
        try:
            llm_instance = get_llm_client()
        except Exception as llm_e:
            logger.warning(
                f"Could not get LLM client in method_executor_tool: {llm_e}. "
                "LLM-dependent features in the method will be disabled."
            )

        # 1. Load data
        if not dataset_path:
            raise ValueError("Dataset path is missing.")
        df = pd.read_csv(dataset_path)

        # 2. Extract the key variables required by the estimate_func signature
        treatment = variables_dict.get("treatment_variable")
        outcome = variables_dict.get("outcome_variable")
        covariates = variables_dict.get("covariates", [])
        query_str = original_query if original_query is not None else inputs.original_query

        if not all([treatment, outcome]):
            raise ValueError("Treatment or outcome variable not found in 'variables' dict.")

        # 3. Preprocess data
        required_cols_for_method = [treatment, outcome] + covariates
        # Add method-specific required variables from variables_dict
        if method == "instrumental_variable" and variables_dict.get("instrument_variable"):
            required_cols_for_method.append(variables_dict["instrument_variable"])
        elif method == "regression_discontinuity" and variables_dict.get("running_variable"):
            required_cols_for_method.append(variables_dict["running_variable"])

        missing_df_cols = [col for col in required_cols_for_method if col not in df.columns]
        if missing_df_cols:
            raise ValueError(
                f"Dataset at {dataset_path} is missing required columns "
                f"for method '{method}': {missing_df_cols}"
            )

        df_processed, updated_treatment, updated_outcome, updated_covariates, column_mappings = \
            preprocess_data(df, treatment, outcome, covariates, verbose=False)
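        # Note: column_mappings is assumed to map original column names to their
        # preprocessed counterparts (e.g. renamed or encoded columns); it is
        # threaded through to downstream tools unchanged.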

        # 4. Look up the method's execution function
        if method not in METHOD_MAPPING:
            raise ValueError(f"Method '{method}' not found in METHOD_MAPPING.")
        estimate_func = METHOD_MAPPING[method]

        # 5. Execute the method. Pass only the specific arguments estimate_func
        # expects (e.g. instrument_variable, running_variable, cutoff_value)
        # rather than the entire variables_dict.
        kwargs_for_method = {}
        for key in ["instrument_variable", "time_variable", "group_variable",
                    "running_variable", "cutoff_value"]:
            if key in variables_dict and variables_dict[key] is not None:
                kwargs_for_method[key] = variables_dict[key]

        # Add the newer fields from the Variables model (inputs.variables)
        if inputs.variables.treatment_reference_level is not None:
            kwargs_for_method['treatment_reference_level'] = inputs.variables.treatment_reference_level
        if inputs.variables.interaction_term_suggested is not None:  # boolean, so check for None to allow False
            kwargs_for_method['interaction_term_suggested'] = inputs.variables.interaction_term_suggested
        if inputs.variables.interaction_variable_candidate is not None:
            kwargs_for_method['interaction_variable_candidate'] = inputs.variables.interaction_variable_candidate

        # Add the query and column mappings for llm_assist functions within the method
        kwargs_for_method['query'] = query_str
        kwargs_for_method['column_mappings'] = column_mappings
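        # For an instrumental-variable run, kwargs_for_method might end up as
        # (illustrative values only):
        #   {"instrument_variable": "lottery_number", "query": "...",
        #    "column_mappings": {...}}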

        results_dict = estimate_func(
            df=df_processed,
            treatment=updated_treatment,
            outcome=updated_outcome,
            covariates=updated_covariates,
            dataset_description=dataset_description_str,
            query_str=query_str,
            llm=llm_instance,
            **kwargs_for_method  # Specific args needed by the method
        )
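        # results_dict is expected to expose keys such as "effect_estimate",
        # "confidence_interval", "standard_error", and "p_value", which are
        # unpacked into the output dictionary below.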

        # 6. Prepare output
        logger.info(f"Method execution successful. Effect estimate: {results_dict.get('effect_estimate')}")

        # Add workflow state
        workflow_update = create_workflow_state_update(
            current_step="method_execution",
            step_completed_flag="method_executed",
            next_tool="explainer_tool",
            next_step_reason="Now we need to explain the results and their implications"
        )

        # --- Prepare output dictionary ---
        # Structure required by explainer_tool: top-level context plus a nested "results" dict
        final_output = {
            # Nested dictionary of numerical results and diagnostics
            "results": {
                # Core estimation results (extracted from results_dict)
                "effect_estimate": results_dict.get("effect_estimate"),
                "confidence_interval": results_dict.get("confidence_interval"),
                "standard_error": results_dict.get("standard_error"),
                "p_value": results_dict.get("p_value"),
                "method_used": results_dict.get("method_used"),
                "llm_assumption_check": results_dict.get("llm_assumption_check"),
                "raw_results": results_dict.get("raw_results"),
                # Diagnostics and refutation results
                "diagnostics": results_dict.get("diagnostics"),
                "refutation_results": results_dict.get("refutation_results")
            },
            # Top-level context to be passed along
            "variables": variables_dict,
            "dataset_analysis": dataset_analysis_dict,
            "dataset_description": dataset_description_str,
            "validation_info": validation_info,
            "original_query": inputs.original_query,
            "column_mappings": column_mappings
            # Workflow state is merged in below
        }

        # Merge the workflow state into the final output
        final_output.update(workflow_update.get('workflow_state', {}))

        # --- Logging logic (moved from output_formatter.py) ---
        # Build a summary of the headline numbers for the run log
        summary_dict = {
            "query": inputs.original_query,
            "method_used": results_dict.get("method_used"),
            "causal_effect": results_dict.get("effect_estimate"),
            "standard_error": results_dict.get("standard_error"),
            "confidence_interval": results_dict.get("confidence_interval")
        }
        logger.debug(f"summary_dict: {summary_dict}")
        logger.debug(f"CURRENT_OUTPUT_LOG_FILE: {CURRENT_OUTPUT_LOG_FILE}")
        if CURRENT_OUTPUT_LOG_FILE and summary_dict:
            try:
                log_entry = {"type": "analysis_result", "data": summary_dict}
                with open(CURRENT_OUTPUT_LOG_FILE, mode='a', encoding='utf-8') as log_file:
                    log_file.write('\n' + json.dumps(log_entry) + '\n')
            except Exception as e:
                logger.error(
                    f"Failed to write analysis results to log file "
                    f"'{CURRENT_OUTPUT_LOG_FILE}': {e}"
                )

        return final_output

    except Exception as e:
        error_message = f"Error executing method {method}: {str(e)}"
        logger.error(error_message, exc_info=True)

        # Return an error state, including as much context as is available
        workflow_update = create_workflow_state_update(
            current_step="method_execution",
            step_completed_flag=False,
            next_tool="explainer_tool",  # Or an error handler?
            next_step_reason=f"Failed during method execution: {error_message}"
        )
        # Ensure the error output still carries the context keys where possible
        error_result = {
            "error": error_message,
            "variables": variables_dict if 'variables_dict' in locals() else {},
            "dataset_analysis": dataset_analysis_dict if 'dataset_analysis_dict' in locals() else {},
            "dataset_description": dataset_description_str if 'dataset_description_str' in locals() else None,
            "original_query": inputs.original_query,
            "column_mappings": column_mappings if 'column_mappings' in locals() else {}
        }
        error_result.update(workflow_update.get('workflow_state', {}))
        return error_result
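

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only: the exact required fields of
# Variables, DatasetAnalysis, and MethodExecutorInput are defined in
# auto_causal.models, and all values below are hypothetical):
#
#     from auto_causal.models import MethodExecutorInput, Variables
#
#     inputs = MethodExecutorInput(
#         method="instrumental_variable",
#         variables=Variables(
#             treatment_variable="education",
#             outcome_variable="wage",
#             covariates=["age", "experience"],
#             instrument_variable="quarter_of_birth",
#         ),
#         dataset_path="data/wages.csv",
#         dataset_analysis=dataset_analysis,  # a DatasetAnalysis instance
#         dataset_description="Survey of wages and schooling",
#     )
#     result = method_executor_tool(inputs, original_query="Does education raise wages?")
#     print(result["results"]["effect_estimate"])
# ---------------------------------------------------------------------------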