Spaces:
Runtime error
Runtime error
initial app
Browse files
README.md
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
---
|
| 2 |
title: Dataset Spreadsheets
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.6.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Dataset Spreadsheets
|
| 3 |
+
emoji: π€π
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.6.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
hf_oauth: true
|
| 11 |
+
hf_oauth_scopes:
|
| 12 |
+
- read-repos
|
| 13 |
---
|
| 14 |
|
| 15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
|
| 3 |
+
import duckdb
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import requests
|
| 7 |
+
from duckdb import DuckDBPyRelation
|
| 8 |
+
from duckdb.typing import DuckDBPyType
|
| 9 |
+
from huggingface_hub import HfApi
|
| 10 |
+
|
| 11 |
+
# Readable aliases for the duckdb types used throughout this app.
Table = DuckDBPyRelation
Dtype = DuckDBPyType

# datasets-server "compatible-libraries" entries whose loading codes point at
# parquet files we can read directly with duckdb's hf:// filesystem.
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
# Placeholder 10x4 all-NULL relation shown before any dataset is selected.
EMPTY_TABLE = duckdb.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
# Max rows fetched per dataset query.
PAGE_SIZE = 100
# How many trending / user-owned datasets to offer in the dataset dropdown.
NUM_TRENDING_DATASETS = 10
NUM_USER_DATASETS = 10
# Blend the subset/split dropdowns into the page background and widen the app.
css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
    background: var(--body-background-fill);
}
.gradio-container {
    padding: var(--size-4) 0 !important;
    max-width: 98% !important;
}
"""
|
| 27 |
+
|
| 28 |
+
@lru_cache(maxsize=3)
def cached_duckdb_sql(query: str) -> Table:
    """Run a SQL query, memoizing the resulting relation by query string.

    maxsize=3 keeps only the few most recent dataset pages alive; each cached
    relation may hold open remote (hf://) file handles, so the cache is small
    on purpose — presumably; TODO confirm the intent with the author.
    """
    return duckdb.sql(query)
|
| 31 |
+
|
| 32 |
+
def to_json_df(tbl: Table) -> pd.DataFrame:
    """Render every cell of *tbl* as its JSON text form and return a pandas DataFrame.

    Each column is wrapped in a one-element list, cast to JSON, and unwrapped
    again, which JSON-serializes arbitrary nested values; SQL NULLs come back
    as the string 'null' and are mapped to real NULLs via nullif.

    NOTE: the parameter must stay named ``tbl`` — the SQL below refers to it
    by name through duckdb's replacement scan.
    """
    projections = []
    for col in tbl.columns:
        projections.append(f"nullif((([{col}]::JSON)[0])::VARCHAR, 'null') as {col}")
    select_list = ", ".join(projections)
    return duckdb.sql(f"SELECT {select_list} FROM tbl").df()
|
| 35 |
+
|
| 36 |
+
def from_json_df(df: pd.DataFrame, dtypes: list[Dtype]) -> Table:
    """Inverse of ``to_json_df``: parse JSON-text columns back into typed duckdb columns.

    Args:
        df: DataFrame whose cells hold JSON text (or None for NULL).
        dtypes: target duckdb type for each column, in column order.
            NOTE(review): ``zip`` silently truncates on a length mismatch —
            callers are assumed to pass exactly one dtype per column.

    Fix: the original built the projection with ``"..." + dtype + "..."``,
    concatenating ``str`` with ``DuckDBPyType``, which raises TypeError.
    An f-string formats the dtype through ``str()`` instead.

    NOTE: the parameter must stay named ``df`` — the SQL below refers to it
    by name through duckdb's replacement scan.
    """
    query = ", ".join(
        f"(ifnull({col}, 'null')::JSON)::{dtype} as {col}"
        for col, dtype in zip(df.columns, dtypes)
    )
    return duckdb.sql(f"SELECT {query} FROM df")
|
| 39 |
+
|
| 40 |
+
# NOTE(review): component nesting reconstructed from source order — the diff
# view this file was recovered from lost all indentation; verify against the
# rendered app.
with gr.Blocks(css=css) as demo:
    # Hidden state: the datasets-server "loading codes" for the current
    # dataset (subset names + split -> parquet glob patterns).
    loading_codes_json = gr.JSON(visible=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("# <p style='text-align:center;'>π€ (WIP) Hugging Face Dataset Spreadsheets π</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
            with gr.Group():
                with gr.Row():
                    # Dataset picker plus subset/split refiners; the latter two
                    # stay hidden until a dataset offers more than one choice.
                    dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
                    subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
                    split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
                    gr.LoginButton()
            # The editable spreadsheet; starts as the all-NULL placeholder.
            dataframe = gr.DataFrame(to_json_df(EMPTY_TABLE), interactive=True, wrap=True)
|
| 52 |
+
|
| 53 |
+
def show_subset_dropdown(dataset: str):
    """Resolve the subsets (config names) available for *dataset*.

    Args:
        dataset: a Hub dataset id, expected to contain "/" (namespace/name).

    Returns:
        A 2-tuple ``(dropdown_kwargs, loading_codes)`` where dropdown_kwargs
        feeds ``gr.Dropdown(**...)`` and loading_codes is the raw
        datasets-server loading-code list for the parquet-compatible library.

    Fix: the original guard did ``return []`` — a single value — while every
    caller unpacks two (``subsets, loading_codes = ...``), raising ValueError
    at runtime. The guard also let an empty dataset name through to a doomed
    HTTP request; ``not dataset`` now short-circuits that too.
    """
    if not dataset or "/" not in dataset.strip().strip("/"):
        # No valid repo id: empty, hidden dropdown and no loading codes.
        return dict(choices=[], value="", visible=False, key=hash("[]")), []
    resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
    loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = (subsets or [""])[0]
    # key forces gradio to re-render the dropdown when the choices change.
    return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
|
| 61 |
+
|
| 62 |
+
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Build the kwargs for the split dropdown once a subset is chosen.

    Picks the split names of the first loading code whose config_name matches
    *subset*; falls back to an empty, hidden dropdown when nothing matches.
    """
    splits: list[str] = []
    for loading_code in loading_codes:
        if loading_code["config_name"] == subset:
            splits = list(loading_code["arguments"]["splits"])
            break
    split = splits[0] if splits else ""
    # key forces gradio to re-render the dropdown when the choices change.
    return dict(
        choices=splits,
        value=split,
        visible=len(splits) > 1,
        key=hash(str(loading_codes) + subset),
    )
|
| 66 |
+
|
| 67 |
+
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
    """Load the first PAGE_SIZE rows for (dataset, subset, split) as dataframe kwargs.

    Looks up the parquet glob pattern for the chosen subset/split and queries
    it through duckdb's hf:// filesystem; anything missing yields the empty
    placeholder table instead.

    NOTE(review): dataset/pattern are interpolated into SQL unescaped — fine
    for read-only hf:// paths here, but worth keeping in mind.
    """
    pattern = None
    for loading_code in loading_codes:
        if loading_code["config_name"] == subset:
            pattern = loading_code["arguments"]["splits"][split]
            break
    if dataset and subset and split and pattern:
        tbl = cached_duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {PAGE_SIZE}")
    else:
        tbl = EMPTY_TABLE
    return dict(value=to_json_df(tbl))
|
| 74 |
+
|
| 75 |
+
@demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
    """Populate the whole UI on page load.

    Fetches trending parquet datasets (plus the logged-in user's own, when an
    OAuth token is present), honors an optional ``?dataset=`` query parameter,
    and cascades through subset/split resolution to fill every component.

    Fix: ``datasets[0].id`` raised IndexError when the Hub returned no
    datasets and no query parameter was given; it now falls back to "",
    which renders the empty placeholder table.
    """
    api = HfApi(token=oauth_token.token if oauth_token else None)
    datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
    if oauth_token and (user := api.whoami().get("name")):
        datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
    dataset = request.query_params.get("dataset") or (datasets[0].id if datasets else "")
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
    return {
        dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        dataframe: gr.DataFrame(**input_dataframe),
    }
|
| 92 |
+
|
| 93 |
+
@dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, dataframe])
def _show_subset_dropdown(dataset: str):
    """Refresh subsets, splits, and the table when a new dataset is picked."""
    subset_kwargs, loading_codes = show_subset_dropdown(dataset)
    split_kwargs = show_split_dropdown(subset_kwargs["value"], loading_codes)
    df_kwargs = show_input_dataframe(dataset, subset_kwargs["value"], split_kwargs["value"], loading_codes)
    return {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subset_kwargs),
        split_dropdown: gr.Dropdown(**split_kwargs),
        dataframe: gr.DataFrame(**df_kwargs),
    }
|
| 104 |
+
|
| 105 |
+
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, dataframe])
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Refresh the split dropdown and the table when a new subset is picked."""
    split_kwargs = show_split_dropdown(subset, loading_codes)
    df_kwargs = show_input_dataframe(dataset, subset, split_kwargs["value"], loading_codes)
    return {
        split_dropdown: gr.Dropdown(**split_kwargs),
        dataframe: gr.DataFrame(**df_kwargs),
    }
|
| 113 |
+
|
| 114 |
+
@split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[dataframe])
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
    """Reload the table when a new split is picked.

    Fix: the original return annotation said ``-> pd.DataFrame`` but the
    function returns a ``{component: update}`` dict, as gradio expects when
    outputs are addressed by component.
    """
    input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes)
    return {
        dataframe: gr.DataFrame(**input_dataframe),
    }
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# Script entry point: serve the Gradio app when run directly.
if __name__ == "__main__":
    demo.launch()
|