Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import json | |
| import os | |
# Paginated course-listing endpoint; the page number is appended directly
# to this prefix when a request URL is built.
BASE_URL = "https://courses.analyticsvidhya.com/collections/courses?page="

# Destination of the scraped JSON, resolved relative to this module's
# directory (one level up, inside "data").
OUTPUT_FILE = os.path.join(
    os.path.dirname(__file__),
    "../data/courses.json",
)
def _extract_course(item):
    """Build one course record from a ``products__list-item`` element.

    Any piece that cannot be found in the markup is replaced by a
    placeholder string, so every record carries the same four keys.

    Args:
        item: A BeautifulSoup tag for a single course list item.

    Returns:
        A dict with the keys ``title``, ``description``, ``image_url`` and
        ``course_link``.
    """
    link_tag = item.find("a", class_="course-card")

    if link_tag is None:
        # Without the card anchor there is nothing to look inside.
        course_link = "#"
        title_tag = image_tag = lesson_tag = None
    else:
        course_link = link_tag.get("href", "#")
        title_tag = link_tag.find("h3")
        image_tag = link_tag.find("img", class_="course-card__img")
        lesson_tag = link_tag.find("span", class_="course-card__lesson-count")

    # Hrefs that are not already absolute are prefixed with the site origin.
    if not course_link.startswith("http"):
        course_link = f"https://courses.analyticsvidhya.com{course_link}"

    title = title_tag.text.strip() if title_tag is not None else "No Title"
    if image_tag is not None:
        image_url = image_tag.get("src", "No Image URL")
    else:
        image_url = "No Image URL"
    # The lesson-count label is what gets stored as the "description".
    if lesson_tag is not None:
        description = lesson_tag.text.strip()
    else:
        description = "No Description"

    return {
        "title": title,
        "description": description,
        "image_url": image_url,
        "course_link": course_link,
    }


def scrape_courses(start_page=1, end_page=9, timeout=10):
    """Scrape the course listing pages and save the results as JSON.

    Each page from ``start_page`` to ``end_page`` (inclusive) is fetched from
    ``BASE_URL``. A page that cannot be fetched, or that answers with a
    status other than 200, is reported and skipped so the remaining pages
    are still processed. The collected records are written to
    ``OUTPUT_FILE``; its parent directory is created if needed.

    Args:
        start_page: First page number to request (default 1).
        end_page: Last page number to request, inclusive (default 9).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Progress is printed and the data is written to disk.
    """
    courses = []

    for page in range(start_page, end_page + 1):
        url = f"{BASE_URL}{page}"
        print(f"Scraping URL: {url}")

        # A timeout keeps a stalled server from hanging the scrape, and
        # catching RequestException keeps one bad page from discarding the
        # courses already collected.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch the webpage. Error: {exc}")
            continue

        print(f"Response status: {response.status_code}")
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        course_items = soup.find_all("li", class_="products__list-item")
        print(f"Found {len(course_items)} course containers on page {page}.")

        courses.extend(_extract_course(item) for item in course_items)

    # Debugging aid: show the total and a small sample of what was scraped.
    print(f"Scraped {len(courses)} courses.")
    for course in courses[:3]:
        print(course)

    # Make sure the output directory exists before writing.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(courses, f, indent=4)
    print(f"Data saved to {os.path.abspath(OUTPUT_FILE)}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    scrape_courses()