Extract Schema.org Data Script (Python)

Question

Chris Hüneke 5 Expert in AI-powered Digital Marketing

10 Months Ago

Extract Schema.org Data Script

Maybe this is helpful for somebody...

Description

This script extracts Schema.org data from a given URL and saves it to a file.

Usage

Run the Script: Execute the script in a Python environment.
Input URL: Enter the URL of the webpage (without 'https://') when prompted.
Output: The extracted data is saved in schema_data.txt.

Features

Extracts JSON-LD data from webpages.
Identifies and counts schema types and fields.
Saves formatted data along with metadata to a file.

Requirements

Python libraries: requests, beautifulsoup4.

  # extract_schema_data.py
  # Author: Christopher Hüneke
  # Date: 07.07.2024
  # Description: This script extracts Schema.org data from a given URL and saves it to a file.

  import requests
  from bs4 import BeautifulSoup
  import json
  import os
  from collections import defaultdict

  # Function to extract Schema.org data from a given URL
  def extract_schema_data(url, timeout=10):
      """Fetch *url* and collect every JSON-LD block embedded in the page.

      Args:
          url: Absolute URL of the page to download.
          timeout: Seconds to wait for the server before ``requests`` gives
              up. Without a timeout a stalled server blocks the script
              indefinitely.

      Returns:
          A 3-tuple ``(schema_data, schema_types, field_count)``:
          ``schema_data`` is a list of the decoded JSON-LD documents in page
          order, ``schema_types`` is a set of every string ``@type`` value
          found at any nesting depth, and ``field_count`` maps each dict key
          to the number of times it occurs across all documents.

      Raises:
          requests.RequestException: If the page cannot be downloaded.
      """
      response = requests.get(url, timeout=timeout)
      soup = BeautifulSoup(response.content, 'html.parser')

      schema_data = []
      schema_types = set()
      field_count = defaultdict(int)

      # Recursive helper function to extract types and field frequencies from JSON data
      def extract_types_and_fields(data):
          if isinstance(data, dict):
              declared = data.get('@type')
              # Only string type names are recorded: anything else (nested
              # objects, numbers) is not a usable type label and would break
              # the set insertion or the later ', '.join of the type names.
              if isinstance(declared, str):
                  schema_types.add(declared)
              elif isinstance(declared, list):
                  schema_types.update(t for t in declared if isinstance(t, str))
              for key, value in data.items():
                  field_count[key] += 1
                  extract_types_and_fields(value)
          elif isinstance(data, list):
              for item in data:
                  extract_types_and_fields(item)

      # Look for all <script> tags with type="application/ld+json"
      for script in soup.find_all('script', type='application/ld+json'):
          # .string is None when the tag is empty or has several children;
          # fall back to the concatenated text and skip blank tags instead of
          # handing None to json.loads (which raises TypeError).
          raw = script.string or script.get_text()
          if not raw or not raw.strip():
              continue
          try:
              json_data = json.loads(raw)
          except json.JSONDecodeError as e:
              print(f"Error decoding JSON: {e}")
              continue
          schema_data.append(json_data)
          extract_types_and_fields(json_data)

      return schema_data, schema_types, field_count

  # Function to format Schema.org data for readable output
  def format_schema_data(schema_data):
      """Render each JSON-LD document as 4-space-indented JSON.

      Every document is followed by one blank line; an empty input yields
      an empty string.
      """
      rendered_docs = (json.dumps(document, indent=4) for document in schema_data)
      return "".join(f"{rendered}\n\n" for rendered in rendered_docs)

  # Function to get the meta title of the page
  def get_meta_title(url, timeout=10):
      """Download *url* and return the text of its ``<title>`` element.

      Args:
          url: Absolute URL of the page to download.
          timeout: Seconds to wait for the server before ``requests`` gives up.

      Returns:
          The title text with surrounding whitespace removed, or
          ``'No title found'`` when the page has no ``<title>`` element or
          the element contains no text.

      Raises:
          requests.RequestException: If the page cannot be downloaded.
      """
      response = requests.get(url, timeout=timeout)
      soup = BeautifulSoup(response.content, 'html.parser')
      title_tag = soup.find('title')
      if title_tag is None:
          return 'No title found'
      # .string is None for an empty <title> or one with nested markup;
      # get_text() always yields a str, so the fallback below stays reachable.
      title = title_tag.get_text(strip=True)
      return title or 'No title found'

  # Function to save extracted data to a file
  def save_to_file(url, title, schema_types, formatted_data, field_count, filename='schema_data.txt'):
      """Write the extraction report to *filename* (UTF-8, overwriting).

      Args:
          url: URL the data was extracted from.
          title: Page title to record in the header.
          schema_types: Iterable of schema type names; written sorted so the
              output is identical between runs.
          formatted_data: Pre-formatted JSON text (a ``str``) appended after
              the header.
          field_count: Mapping of field name to occurrence count.
          filename: Destination path.

      Prints a success message, or an error message if the file cannot be
      written. Returns ``None`` in both cases.
      """
      # Sets iterate in arbitrary order, and a non-string entry would make
      # str.join raise; stringify and sort for a stable, safe header line.
      type_names = ', '.join(sorted(str(t) for t in schema_types))

      parts = [
          f"URL: {url}\n",
          f"TITLE: {title}\n",
          f"SCHEMA TYPES: {type_names}\n\n",
          "Field Frequencies:\n",
      ]
      parts.extend(f"{field}: {count}\n" for field, count in field_count.items())
      parts.append("\nSchema Data:\n")
      parts.append(formatted_data)
      # Build the full report before opening the file so a formatting problem
      # cannot leave a truncated, half-written file behind.
      report = "".join(parts)

      try:
          with open(filename, 'w', encoding='utf-8') as file:
              file.write(report)
      except (OSError, UnicodeError) as e:
          print(f"Error saving to file: {e}")
      else:
          print(f"Schema.org data successfully saved to {filename}")

  # Main function to orchestrate the extraction and saving process
  def main():
      """Prompt for a URL, extract its Schema.org data and save the report.

      The URL may be entered with or without a scheme; ``https://`` is
      prepended only when no ``http://`` or ``https://`` prefix is present.
      Network failures are reported as a message instead of a traceback.
      """
      url_input = input("Please enter the URL without 'https://': ").strip()
      if not url_input:
          print("No URL entered.")
          return

      # Users often paste the full address; prepending a second scheme would
      # produce an unusable "https://https://..." URL.
      if url_input.lower().startswith(('http://', 'https://')):
          url = url_input
      else:
          url = f"https://{url_input}"

      try:
          schema_data, schema_types, field_count = extract_schema_data(url)
          if not schema_data:
              print("No Schema.org data found.")
              return
          meta_title = get_meta_title(url)
      except requests.RequestException as e:
          print(f"Error fetching {url}: {e}")
          return

      formatted_data = format_schema_data(schema_data)
      save_to_file(url, meta_title, schema_types, formatted_data, field_count)

  # Run the interactive workflow only when this file is executed as a script,
  # not when it is imported as a module.
  if __name__ == "__main__":
      main()

json python seo

1 Contributor
1 Reply
54 Views
2 Minutes Discussion Span
Latest Post 10 Months Ago Latest Post by Chris Hüneke

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

Chris Hüneke 5 Expert in AI-powered Digital Marketing · Answer 1 · 2024-08-04T12:53:27+00:00

Sorry, I got that wrong. I can't post scripts in the Digital Marketing section, right?