From 318035493283aba94719e4a1a0c77e8eb010f305 Mon Sep 17 00:00:00 2001
From: Stefan Siegl <stefan.siegl@tech11.com>
Date: Sun, 27 Jul 2025 18:21:14 +0200
Subject: [PATCH] initial commit.

---
 .gitignore |   2 ++
 README.md  |  41 +++++++++++++++++++++
 scraper.py | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 scraper.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ab8c74d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+hafensommer-*.json
+hafensommer-2025.html
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7fba4cb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,41 @@
+# scraper-hafensommer
+
+Minimal scraper that converts the Würzburg Hafensommer HTML event listing into schema.org Event JSON files.
+
+## Vibe Coding Inspiration :)
+
+This is the prompt I used to create it (with the ChatGPT version current at the time):
+
+
+write a simple website scraper script.  first download the page https://www.adticket.de/Hafensommer-Wurzburg.html
+
+use css selector `.w-paged-listing__list-item` to match a event.
+every event shall be stored to a single json file.  css select child node with `.c-list-item-event` and pick data attribute `data-sync-id` and pick id from there.  save the event json to a file named `hafensommer-{{id}}.json`
+
+json shall follow schema.org/Event format.
+
+select child node of `time` element type for `startDate` property (element looks like `<time datetime="2025-07-31T20:00:00">`)
+
+pick `name` property from `<h3 class="c-list-item-event__headline">Mine  |  Support: Epilog</h3>` ... format as `Hafensommer: Mine`
+... also extract this to `"performer": { "@type": "Person", "name": "Mine" }`
+
+also initialize `location` hard-coded to ` "location": {
+    "@type": "PostalAddress",
+    "name": "Freitreppe Alter Hafen",
+    "streetAddress": "Oskar-Laredo-Platz 1",
+    "postalCode": "97080",
+    "addressLocality": "Würzburg"
+  }`
+
+pick `image` property from `<img src="https://cdn.adticket.de/core/img/event/detailEvent_2347271.jpg" alt="" class="c-list-item-event__image">`
+
+check the following html for offer url `https://www.adticket.de/Mine-Support-Epilog/Wuerzburg-Freitreppe-Alter-Hafen/31-07-2025_20-00.html`
+
+extract price from ` <div class="c-list-item-event__event-min-price">            <span>ab 40,00 €</span>`
+
+provide in json like: `  "offers": {
+    "@type": "Offer",
+    "url": "https://www.adticket.de/Mine-Support-Epilog/Wuerzburg-Freitreppe-Alter-Hafen/31-07-2025_20-00.html",
+    "price": 40,
+    "priceCurrency": "EUR"
+  },`
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..de5203f
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,104 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import os
+import re
+
+URL = "https://www.adticket.de/Hafensommer-Wurzburg.html"
+BASE_URL = "https://www.adticket.de"
+
+# Download the page
+# response = requests.get(URL)
+# response.raise_for_status()
+# 
+# soup = BeautifulSoup(response.text, 'html.parser')
+
+with open('hafensommer-2025.html', 'r', encoding='utf-8') as f:
+    html_content = f.read()
+
+soup = BeautifulSoup(html_content, 'html.parser')
+
+events = soup.select('.w-paged-listing__list-item')
+
+for event in events:
+    event_node = event.select_one('.c-list-item-event')
+    if not event_node:
+        continue
+
+    # ID from data-sync-id
+    sync_id = event_node.get('data-sync-id')
+    if not sync_id:
+        continue
+
+    # Name
+    headline = event_node.select_one('h3.c-list-item-event__headline')
+    if headline:
+        raw_name = headline.text.strip().split('|')[0].strip()
+        name = f"Hafensommer: {raw_name}"
+    else:
+        continue
+
+    # Performer
+    performer = {
+        "@type": "Person",
+        "name": raw_name
+    }
+
+    # Start Date
+    time_elem = event_node.select_one('time[datetime]')
+    if time_elem:
+        start_date = time_elem['datetime']
+    else:
+        continue
+
+    # Image
+    img_elem = event_node.select_one('img.c-list-item-event__image')
+    image_url = img_elem['src'] if img_elem else None
+
+    # Offer URL
+    offer_url = event_node.get('href')
+    print (offer_url)
+
+    # Price
+    price_elem = event_node.select_one('.c-list-item-event__event-min-price span')
+    if price_elem:
+        price_match = re.search(r'([\d,]+)', price_elem.text)
+        if price_match:
+            price = float(price_match.group(1).replace(',', '.'))
+        else:
+            price = None
+    else:
+        price = None
+
+    event_json = {
+        "@context": "https://schema.org",
+        "@type": "Event",
+        "name": name,
+        "startDate": start_date,
+        "performer": performer,
+        "location": {
+            "@type": "PostalAddress",
+            "name": "Freitreppe Alter Hafen",
+            "streetAddress": "Oskar-Laredo-Platz 1",
+            "postalCode": "97080",
+            "addressLocality": "Würzburg"
+        }
+    }
+
+    if image_url:
+        event_json["image"] = image_url
+
+    if offer_url and price is not None:
+        event_json["offers"] = {
+            "@type": "Offer",
+            "url": offer_url,
+            "price": price,
+            "priceCurrency": "EUR"
+        }
+
+    filename = f"hafensommer-{sync_id}.json"
+    with open(filename, 'w', encoding='utf-8') as f:
+        json.dump(event_json, f, ensure_ascii=False, indent=2)
+
+    print(f"Saved {filename}")
+