"""Drop unreadable rows from the ``CNX-PathLLM/Pathcap`` training split.

The script loads the ``train`` split, tries to read every row, and keeps
only the rows that can be fetched without an exception and whose ``"txt"``
field is a ``str``.  If at least one row was dropped, the filtered split is
pushed back to the same Hub repository as its ``train`` split.
"""

import io

from datasets import DatasetDict, load_dataset
from PIL import Image, ImageFile, UnidentifiedImageError
from tqdm import tqdm

# Let Pillow load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Local cache location for the downloaded dataset files.
cache_dir = "/bask/projects/p/phwq4930-gbm/Zeyu/PathVLM/.cache"

# Hub repository that is both read from and (when needed) written back to.
dataset_name = "CNX-PathLLM/Pathcap"

dataset = load_dataset(dataset_name, split="train", cache_dir=cache_dir)

print(f"original dataset size: {len(dataset)}")

# Indices of the rows that could be read and passed the text check.
valid_indices = []

# Rows are fetched by index (not by iterating the dataset) so that a failure
# while materialising one row is caught here and only skips that row.
# NOTE(review): presumably the row access decodes an image column, which is
# what can fail for corrupt files -- confirm against the dataset schema.
for idx in tqdm(range(len(dataset))):
    try:
        example = dataset[idx]

        text = example["txt"]
        if not isinstance(text, str):
            raise ValueError(f"not a string: {text}")

        valid_indices.append(idx)
    except Exception as e:
        # Deliberately broad: any failure on a row means "drop it and go on".
        print(f"Cannot recognize file {idx}: {e}")

filtered_dataset = dataset.select(valid_indices)

print(f"filtered dataset size: {len(filtered_dataset)}")

# Only re-upload when something was actually removed.
if len(dataset) != len(filtered_dataset):
    filtered_dataset_dict = DatasetDict({"train": filtered_dataset})
    filtered_dataset_dict.push_to_hub(dataset_name)