Extract Schema.org Data Script
Maybe this is helpful for somebody...
Description
This script extracts Schema.org data from a given URL and saves it to a file.
Usage
- Run the Script: Execute the script in a Python environment.
- Input URL: Enter the URL of the webpage (without 'https://') when prompted.
- Output: The extracted data is saved in schema_data.txt.
Features
- Extracts JSON-LD data from webpages.
- Identifies and counts schema types and fields.
- Saves formatted data along with metadata to a file.
Requirements
- Python libraries: requests, beautifulsoup4.
# extract_schema_data.py
# Author: Christopher Hüneke
# Date: 07.07.2024
# Description: This script extracts Schema.org data from a given URL and
# saves it to a file.

import json
import os
from collections import defaultdict

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up. requests applies no
# timeout by default, so without this a stalled connection would block the
# script indefinitely.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The parsed document (``html.parser`` backend).

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status, so that an error page is never mistaken for
            the requested page.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def extract_schema_data(url, *, soup=None):
    """Extract all JSON-LD (Schema.org) blocks from a web page.

    Args:
        url: Absolute URL of the page. It is downloaded only when *soup* is
            not supplied.
        soup: Optional, already parsed page. Passing it avoids a second
            download when the caller has fetched the page itself.

    Returns:
        A tuple ``(schema_data, schema_types, field_count)`` where
        ``schema_data`` is a list of the decoded JSON-LD documents,
        ``schema_types`` is a set of every string ``@type`` value found at
        any nesting level, and ``field_count`` is a ``defaultdict(int)``
        mapping each key name to the number of times it occurs.

    Raises:
        requests.RequestException: If the page has to be downloaded and the
            request fails.
    """
    if soup is None:
        soup = _fetch_soup(url)

    schema_data = []
    schema_types = set()
    field_count = defaultdict(int)

    def extract_types_and_fields(data):
        """Walk *data* recursively, recording types and key frequencies."""
        if isinstance(data, dict):
            declared = data.get('@type')
            candidates = declared if isinstance(declared, list) else [declared]
            # Only strings are kept: malformed markup may carry a dict, a
            # number or null here, which would either be unhashable or break
            # the later ', '.join() over the collected types.
            schema_types.update(t for t in candidates if isinstance(t, str))
            for key, value in data.items():
                field_count[key] += 1
                extract_types_and_fields(value)
        elif isinstance(data, list):
            for item in data:
                extract_types_and_fields(item)

    # Look for all <script> tags with type="application/ld+json"
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if raw is None:
            # An empty <script> tag has no string; json.loads(None) would
            # raise TypeError, which is not a JSONDecodeError.
            continue
        try:
            json_data = json.loads(raw)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue
        schema_data.append(json_data)
        extract_types_and_fields(json_data)

    return schema_data, schema_types, field_count


def format_schema_data(schema_data):
    """Render the decoded JSON-LD documents as indented, readable text.

    Args:
        schema_data: Iterable of JSON-serialisable documents.

    Returns:
        A string in which every document is dumped with four-space
        indentation and followed by one blank line. Empty input yields an
        empty string.
    """
    return "".join(
        json.dumps(data, indent=4) + "\n\n" for data in schema_data
    )


def get_meta_title(url, *, soup=None):
    """Return the text of the page's ``<title>`` element.

    Args:
        url: Absolute URL of the page. It is downloaded only when *soup* is
            not supplied.
        soup: Optional, already parsed page.

    Returns:
        The title with surrounding whitespace removed, or
        ``'No title found'`` when the element is missing or empty.

    Raises:
        requests.RequestException: If the page has to be downloaded and the
            request fails.
    """
    if soup is None:
        soup = _fetch_soup(url)
    title_tag = soup.find('title')
    # get_text() never returns None, unlike .string, which is None for an
    # empty <title> or one that contains nested markup.
    title = title_tag.get_text(strip=True) if title_tag else ''
    return title or 'No title found'


def save_to_file(url, title, schema_types, formatted_data, field_count,
                 filename='schema_data.txt'):
    """Write the extraction report to *filename* (UTF-8, overwriting).

    The report lists the URL, the page title, the schema types in sorted
    order (so repeated runs produce identical files), the per-field
    frequencies, and finally the formatted JSON-LD data. Success or failure
    is reported on stdout; I/O errors are not propagated.

    Args:
        url: URL the data was extracted from.
        title: Page title.
        schema_types: Iterable of schema type names.
        formatted_data: Text produced by ``format_schema_data``.
        field_count: Mapping of field name to occurrence count.
        filename: Path of the output file.
    """
    type_list = ', '.join(sorted(str(name) for name in schema_types))
    lines = [
        f"URL: {url}\n",
        f"TITLE: {title}\n",
        f"SCHEMA TYPES: {type_list}\n\n",
        "Field Frequencies:\n",
    ]
    lines.extend(f"{field}: {count}\n" for field, count in field_count.items())
    lines.append("\nSchema Data:\n")
    lines.append(formatted_data)

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.writelines(lines)
    except (OSError, UnicodeError) as e:
        print(f"Error saving to file: {e}")
    else:
        print(f"Schema.org data successfully saved to {filename}")


def main():
    """Prompt for a URL, extract its Schema.org data and save the report."""
    url_input = input("Please enter the URL without 'https://': ").strip()
    if not url_input:
        print("No URL entered.")
        return

    # Tolerate a pasted full URL instead of producing "https://https://...".
    if url_input.lower().startswith(('http://', 'https://')):
        url = url_input
    else:
        url = f"https://{url_input}"

    # Fetch once and share the parsed page, so the title and the schema data
    # come from the same response and the site is only requested one time.
    try:
        soup = _fetch_soup(url)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return

    schema_data, schema_types, field_count = extract_schema_data(url, soup=soup)
    if not schema_data:
        print("No Schema.org data found.")
        return

    meta_title = get_meta_title(url, soup=soup)
    formatted_data = format_schema_data(schema_data)
    save_to_file(url, meta_title, schema_types, formatted_data, field_count)


if __name__ == "__main__":
    main()