Spaces:
Build error
Build error
| import pandas as pd | |
| import requests | |
| from bs4 import BeautifulSoup | |
def fetch_wikipedia_tables(
    url: str,
    handle_special_chars: bool = True,
) -> list[pd.DataFrame]:
    """
    Download a Wikipedia page and return its tables as DataFrames.

    This is a best-effort entry point: any exception raised while fetching
    or cleaning is reported with ``print`` and turned into an empty list.

    Parameters:
    -----------
    url : str
        Address of the Wikipedia page to read.
    handle_special_chars : bool, default True
        When true, every fetched table is passed through ``_clean_table``
        before being returned.

    Returns:
    --------
    list of pd.DataFrame
        One DataFrame per table found; an empty list when the page yields
        no tables or when an error occurred.
    """
    try:
        frames = _fetch_tables_with_bs4(url)
        if handle_special_chars:
            # Normalise special characters and formatting in each table.
            frames = [_clean_table(frame) for frame in frames]
        if not frames:
            print(f"No tables found at {url}")
            return []
        return frames
    except Exception as err:
        print(f"Error fetching tables: {err}")
        return []
def _fetch_tables_with_bs4(url: str, timeout: float = 30.0) -> list[pd.DataFrame]:
    """
    Fetch every ``wikitable`` on a page and convert each to a DataFrame.

    The first ``<tr>`` of a table is treated as its header row; all
    remaining rows with at least one cell become data rows. Header names are
    only applied when their count matches the widest data row; otherwise
    the DataFrame keeps default positional column labels.

    Parameters:
    -----------
    url : str
        Page to download.
    timeout : float, default 30.0
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the caller forever.

    Returns:
    --------
    list of pd.DataFrame
        One DataFrame per table that contained data rows. An empty list is
        returned (after printing the error) if the download or parsing
        fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        tables = []
        for table in soup.find_all("table", {"class": "wikitable"}):
            rows = table.find_all("tr")
            if not rows:
                continue

            # Only the first row supplies column names. Collecting every
            # <th> in the table would also pick up row-header cells from
            # the body and make the header count meaningless.
            headers = [
                cell.text.strip() for cell in rows[0].find_all(["th", "td"])
            ]

            data = []
            for row in rows[1:]:
                row_data = [
                    cell.text.strip() for cell in row.find_all(["td", "th"])
                ]
                if row_data:  # Skip rows without any cells
                    data.append(row_data)
            if not data:
                continue

            # Compare against the widest row, not just the first one: a
            # single wider row would make the DataFrame constructor raise
            # and the whole page's tables would be lost.
            width = max(len(row_data) for row_data in data)
            if headers and len(headers) == width:
                df = pd.DataFrame(data, columns=headers)
            else:
                df = pd.DataFrame(data)
            tables.append(df)
        return tables
    except Exception as e:
        print(f"Error in BeautifulSoup fallback: {e}")
        return []
| def _clean_table(df: pd.DataFrame) -> pd.DataFrame: | |
| """Clean a table by handling special characters and formatting issues.""" | |
| # Make a copy to avoid modifying the original | |
| df = df.copy() | |
| # Handle all string columns | |
| for col in df.columns: | |
| if df[col].dtype == "object": | |
| # Replace common problematic characters | |
| df[col] = df[col].astype(str).str.replace(";", "", regex=False) | |
| df[col] = df[col].str.replace("−", "-", regex=False) # Replace minus sign | |
| df[col] = df[col].str.replace( | |
| "\xa0", " ", regex=False | |
| ) # Replace non-breaking space | |
| df[col] = df[col].str.replace("\n", " ", regex=False) # Replace newlines | |
| df[col] = df[col].str.strip() # Strip whitespace | |
| # Remove reference tags like [1], [2], etc. | |
| df[col] = df[col].str.replace(r"\[\d+\]", "", regex=True) | |
| return df | |