initial commit.
commit 3180354932
3 changed files with 147 additions and 0 deletions
2 .gitignore vendored Normal file
@@ -0,0 +1,2 @@
hafensommer-*.json
hafensommer-2025.html
41 README.md Normal file
@@ -0,0 +1,41 @@
# scraper-hafensommer

Minimal HTML-to-Event-JSON scraper for the Würzburg Hafensommer.

## Vibe Coding Inspiration :)

This is the prompt I used to create it (with current ChatGPT):

write a simple website scraper script. first download the page https://www.adticket.de/Hafensommer-Wurzburg.html

use css selector `.w-paged-listing__list-item` to match an event.
every event shall be stored to a single json file. css select the child node with `.c-list-item-event`, pick the data attribute `data-sync-id` and take the id from there. save the event json to a file named `hafensommer-{{id}}.json`

json shall follow the schema.org/Event format.

select the child node of `time` element type for the `startDate` property (the element looks like `<time datetime="2025-07-31T20:00:00">`)

pick the `name` property from `<h3 class="c-list-item-event__headline">Mine | Support: Epilog</h3>` ... format it as `Hafensommer: Mine`
... also extract this to `"performer": { "@type": "Person", "name": "Mine" }`

also initialize `location` hard-coded to

    "location": {
      "@type": "PostalAddress",
      "name": "Freitreppe Alter Hafen",
      "streetAddress": "Oskar-Laredo-Platz 1",
      "postalCode": "97080",
      "addressLocality": "Würzburg"
    }

pick the `image` property from `<img src="https://cdn.adticket.de/core/img/event/detailEvent_2347271.jpg" alt="" class="c-list-item-event__image">`

check the following html for the offer url `https://www.adticket.de/Mine-Support-Epilog/Wuerzburg-Freitreppe-Alter-Hafen/31-07-2025_20-00.html`

extract the price from `<div class="c-list-item-event__event-min-price"> <span>ab 40,00 €</span>`

provide it in the json like:

    "offers": {
      "@type": "Offer",
      "url": "https://www.adticket.de/Mine-Support-Epilog/Wuerzburg-Freitreppe-Alter-Hafen/31-07-2025_20-00.html",
      "price": 40,
      "priceCurrency": "EUR"
    },
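
A minimal sketch of the name/performer and price parsing the prompt describes, applied to the example snippets quoted above (illustrative only; the committed scraper.py below does the same inside its event loop):

# Sketch: parse the example headline and price snippets from the prompt.
from bs4 import BeautifulSoup
import re

headline_html = '<h3 class="c-list-item-event__headline">Mine | Support: Epilog</h3>'
price_html = '<div class="c-list-item-event__event-min-price"> <span>ab 40,00 €</span></div>'

# "Mine | Support: Epilog" -> raw name "Mine" -> event name "Hafensommer: Mine"
raw_name = BeautifulSoup(headline_html, 'html.parser').h3.text.split('|')[0].strip()
name = f"Hafensommer: {raw_name}"
performer = {"@type": "Person", "name": raw_name}

# "ab 40,00 €" -> 40.0 (comma is the German decimal separator)
price_text = BeautifulSoup(price_html, 'html.parser').span.text
price = float(re.search(r'([\d,]+)', price_text).group(1).replace(',', '.'))

print(name, performer, price)  # Hafensommer: Mine {'@type': 'Person', 'name': 'Mine'} 40.0
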
104 scraper.py Normal file
@@ -0,0 +1,104 @@
import requests
from bs4 import BeautifulSoup
import json
import os
import re

URL = "https://www.adticket.de/Hafensommer-Wurzburg.html"
BASE_URL = "https://www.adticket.de"

# Download the page
# response = requests.get(URL)
# response.raise_for_status()
#
# soup = BeautifulSoup(response.text, 'html.parser')

with open('hafensommer-2025.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

soup = BeautifulSoup(html_content, 'html.parser')

events = soup.select('.w-paged-listing__list-item')

for event in events:
    event_node = event.select_one('.c-list-item-event')
    if not event_node:
        continue

    # ID from data-sync-id
    sync_id = event_node.get('data-sync-id')
    if not sync_id:
        continue

    # Name
    headline = event_node.select_one('h3.c-list-item-event__headline')
    if headline:
        raw_name = headline.text.strip().split('|')[0].strip()
        name = f"Hafensommer: {raw_name}"
    else:
        continue

    # Performer
    performer = {
        "@type": "Person",
        "name": raw_name
    }

    # Start Date
    time_elem = event_node.select_one('time[datetime]')
    if time_elem:
        start_date = time_elem['datetime']
    else:
        continue

    # Image
    img_elem = event_node.select_one('img.c-list-item-event__image')
    image_url = img_elem['src'] if img_elem else None

    # Offer URL
    offer_url = event_node.get('href')
    print(offer_url)

    # Price
    price_elem = event_node.select_one('.c-list-item-event__event-min-price span')
    if price_elem:
        price_match = re.search(r'([\d,]+)', price_elem.text)
        if price_match:
            price = float(price_match.group(1).replace(',', '.'))
        else:
            price = None
    else:
        price = None

    event_json = {
        "@context": "https://schema.org",
        "@type": "Event",
        "name": name,
        "startDate": start_date,
        "performer": performer,
        "location": {
            "@type": "PostalAddress",
            "name": "Freitreppe Alter Hafen",
            "streetAddress": "Oskar-Laredo-Platz 1",
            "postalCode": "97080",
            "addressLocality": "Würzburg"
        }
    }

    if image_url:
        event_json["image"] = image_url

    if offer_url and price is not None:
        event_json["offers"] = {
            "@type": "Offer",
            "url": offer_url,
            "price": price,
            "priceCurrency": "EUR"
        }

    filename = f"hafensommer-{sync_id}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(event_json, f, ensure_ascii=False, indent=2)

    print(f"Saved {filename}")
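
The download step is left commented out in scraper.py; the script reads a cached hafensommer-2025.html instead (a file that .gitignore excludes). A small sketch of how that cache could be refreshed, reusing the URL and filename from the script; the timeout value and the write step are assumptions:

# Sketch: refresh the locally cached listing page that scraper.py reads.
# URL and cache filename are taken from scraper.py; the rest is assumed.
import requests

URL = "https://www.adticket.de/Hafensommer-Wurzburg.html"

response = requests.get(URL, timeout=30)
response.raise_for_status()

with open('hafensommer-2025.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

print("Updated hafensommer-2025.html")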