initial commit.
This commit is contained in:
commit
3180354932
3 changed files with 147 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
hafensommer-*.json
|
||||
hafensommer-2025.html
|
41
README.md
Normal file
41
README.md
Normal file
|
@ -0,0 +1,41 @@
|
|||
# scraper-hafensommer
|
||||
|
||||
A minimal scraper that converts the Würzburg Hafensommer event listing (HTML) into schema.org `Event` JSON files, one file per event.
|
||||
|
||||
## Vibe Coding Inspiration :)
|
||||
|
||||
This is the prompt I used to create it (with the ChatGPT version current at the time):
|
||||
|
||||
|
||||
write a simple website scraper script. first download the page https://www.adticket.de/Hafensommer-Wurzburg.html
|
||||
|
||||
use css selector `.w-paged-listing__list-item` to match an event.
|
||||
every event shall be stored in its own json file. css select the child node with `.c-list-item-event`, read its data attribute `data-sync-id` and use that value as the id. save the event json to a file named `hafensommer-{{id}}.json`
|
||||
|
||||
json shall follow schema.org/Event format.
|
||||
|
||||
select child node of `time` element type for `startDate` property (element looks like `<time datetime="2025-07-31T20:00:00">`)
|
||||
|
||||
pick `name` property from `<h3 class="c-list-item-event__headline">Mine | Support: Epilog</h3>` ... format as `Hafensommer: Mine`
|
||||
... also extract this to `"performer": { "@type": "Person", "name": "Mine" }`
|
||||
|
||||
also initialize `location` hard-coded to ` "location": {
|
||||
"@type": "PostalAddress",
|
||||
"name": "Freitreppe Alter Hafen",
|
||||
"streetAddress": "Oskar-Laredo-Platz 1",
|
||||
"postalCode": "97080",
|
||||
"addressLocality": "Würzburg"
|
||||
}`
|
||||
|
||||
pick `image` property from `<img src="https://cdn.adticket.de/core/img/event/detailEvent_2347271.jpg" alt="" class="c-list-item-event__image">`
|
||||
|
||||
check the following html for offer url `https://www.adticket.de/Mine-Support-Epilog/Wuerzburg-Freitreppe-Alter-Hafen/31-07-2025_20-00.html`
|
||||
|
||||
extract price from ` <div class="c-list-item-event__event-min-price"> <span>ab 40,00 €</span>`
|
||||
|
||||
provide in json like: ` "offers": {
|
||||
"@type": "Offer",
|
||||
"url": "https://www.adticket.de/Mine-Support-Epilog/Wuerzburg-Freitreppe-Alter-Hafen/31-07-2025_20-00.html",
|
||||
"price": 40,
|
||||
"priceCurrency": "EUR"
|
||||
},`
|
104
scraper.py
Normal file
104
scraper.py
Normal file
|
@ -0,0 +1,104 @@
|
|||
"""Scrape the Würzburg Hafensommer listing into schema.org Event JSON files.

Every ``.w-paged-listing__list-item`` entry of the listing page is converted
into one ``hafensommer-<id>.json`` file in the current working directory,
where ``<id>`` comes from the entry's ``data-sync-id`` attribute.
"""

import json
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

URL = "https://www.adticket.de/Hafensommer-Wurzburg.html"
BASE_URL = "https://www.adticket.de"

# Saved copy of the listing page. When it exists it is parsed instead of
# downloading, which keeps repeated runs offline and reproducible.
LOCAL_HTML = 'hafensommer-2025.html'

# A German-formatted amount: optional "." thousands groups and an optional
# "," decimal part, e.g. "40,00" or "1.234,50".
_PRICE_RE = re.compile(r'\d{1,3}(?:\.\d{3})+(?:,\d+)?|\d+(?:,\d+)?')


def _load_listing_html():
    """Return the listing HTML from LOCAL_HTML, or download it from URL.

    Raises:
        requests.RequestException: the download failed or returned an
            HTTP error status.
    """
    if os.path.exists(LOCAL_HTML):
        with open(LOCAL_HTML, 'r', encoding='utf-8') as f:
            return f.read()

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    return response.text


def _parse_price(text):
    """Return the first amount found in *text* as a float, or None.

    "ab 40,00 €" -> 40.0 and "ab 1.234,50 €" -> 1234.5.
    """
    match = _PRICE_RE.search(text)
    if not match:
        return None
    # Drop thousands separators first, then turn the decimal comma into a dot.
    return float(match.group(0).replace('.', '').replace(',', '.'))


def _build_event(event_node):
    """Build the schema.org Event dict for one ``.c-list-item-event`` node.

    Returns None when the node has no usable headline or no
    ``<time datetime=...>`` child, because an event without a name or a
    start date is skipped.
    """
    # Name: the headline looks like "Mine | Support: Epilog"; only the part
    # before the first "|" is the main act.
    headline = event_node.select_one('h3.c-list-item-event__headline')
    if not headline:
        return None
    raw_name = headline.text.strip().split('|')[0].strip()
    if not raw_name:
        return None

    # Start date
    time_elem = event_node.select_one('time[datetime]')
    if not time_elem:
        return None

    event_json = {
        "@context": "https://schema.org",
        "@type": "Event",
        "name": f"Hafensommer: {raw_name}",
        "startDate": time_elem['datetime'],
        "performer": {
            "@type": "Person",
            "name": raw_name
        },
        "location": {
            "@type": "PostalAddress",
            "name": "Freitreppe Alter Hafen",
            "streetAddress": "Oskar-Laredo-Platz 1",
            "postalCode": "97080",
            "addressLocality": "Würzburg"
        }
    }

    # Image: .get() tolerates an <img> that carries no src attribute.
    img_elem = event_node.select_one('img.c-list-item-event__image')
    image_url = img_elem.get('src') if img_elem else None
    if image_url:
        event_json["image"] = image_url

    # Offer URL: resolve against BASE_URL so a relative href still yields a
    # usable absolute link; an already absolute href passes through unchanged.
    href = event_node.get('href')
    offer_url = urljoin(BASE_URL, href) if href else None

    # Price
    price_elem = event_node.select_one('.c-list-item-event__event-min-price span')
    price = _parse_price(price_elem.text) if price_elem else None

    # The offer is only emitted when both the link and the price are known.
    if offer_url and price is not None:
        event_json["offers"] = {
            "@type": "Offer",
            "url": offer_url,
            "price": price,
            "priceCurrency": "EUR"
        }

    return event_json


soup = BeautifulSoup(_load_listing_html(), 'html.parser')

events = soup.select('.w-paged-listing__list-item')

for event in events:
    event_node = event.select_one('.c-list-item-event')
    if not event_node:
        continue

    # ID from data-sync-id
    sync_id = event_node.get('data-sync-id')
    if not sync_id:
        continue

    event_json = _build_event(event_node)
    if event_json is None:
        continue

    # The id is scraped text: replace anything that is not a word character
    # or "-" so it cannot escape the working directory (e.g. "../x").
    safe_id = re.sub(r'[^\w-]', '_', sync_id)
    filename = f"hafensommer-{safe_id}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(event_json, f, ensure_ascii=False, indent=2)

    print(f"Saved {filename}")
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue