# megalith-osm/gpx.py
import argparse
import csv
import json
import re
from html import escape
from pathlib import Path

import requests

parser = argparse.ArgumentParser(
    prog="megalithosm",
    description="Fetch data from megalith sources and produce a GPX file",
)
parser.add_argument(
    "-q",
    "--quality",
    default=5,
    type=int,
    help="Include sites of this quality or higher",
)
parser.add_argument(
    "-r",
    "--refetch",
    action="store_true",
    default=False,
    help="Ignore cached data and fetch it again",
)
args = parser.parse_args()
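
# Upper bound (exclusive) for megalithic.co.uk's numeric country IDs;
# countries 1 through 5 are fetched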
country_range = 6


def cache_stone_circles():
    """Fetch the stone-circles.org.uk coordinate data into the local cache."""
    response = requests.get(
        "http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js"
    )
    if response.status_code != 200:
        raise Exception(
            "Failed to fetch data from stone-circles.org.uk: {}".format(response.text)
        )

    Path("cache").mkdir(exist_ok=True)
    with open("cache/stone-circles.js", "w") as f:
        f.write(response.text)


def cache_megalithic():
    """Fetch one CSV of sites per country from megalithic.co.uk into the cache."""

    def megalithic_url(country):
        return "https://www.megalithic.co.uk/cache/csvmap_country{}.csv".format(country)

    for country in range(1, country_range):
        # Megalithic doesn't really want people scraping, so present a
        # browser User-Agent
        response = requests.get(
            megalithic_url(country),
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
            },
        )
        if response.status_code != 200:
            raise Exception(
                "Failed to fetch data from Megalithic.UK: {}".format(response.text)
            )

        Path("cache").mkdir(exist_ok=True)
        with open("cache/megalithic-{}.csv".format(country), "w") as f:
            f.write(response.text)


if args.refetch or not Path("cache/stone-circles.js").exists():
    print("Refreshing stone-circles.org.uk cache")
    cache_stone_circles()

if args.refetch or not all(
    Path("cache/megalithic-{}.csv".format(c)).exists() for c in range(1, country_range)
):
    print("Refreshing megalithic.co.uk cache")
    cache_megalithic()

print("Post-processing data")
data = []
for country in range(1, country_range):
    with open("cache/megalithic-{}.csv".format(country)) as f:
        reader = csv.DictReader(f, delimiter="|")
        for row in reader:
            data.append(
                {
                    "lat": row["lat"],
                    "lng": row["lng"],
                    "name": row["Name"],
                    "type": row["Type"],
                    "url": "https://megalithic.co.uk/article.php?sid={}".format(
                        row["SID"]
                    ),
                    "quality": int(row["Condition"]),
                }
            )
with open("cache/stone-circles.js") as f:
content = f.read()
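
# The cached file is a JavaScript assignment along the lines of
# `coords = [[lat, lng, "<html label>"], ...];`: pull out the array literal,
# strip stray backslash escapes, and parse it as JSON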
content = re.match(r".+ = \[(\[.+]),?];", content.replace("\n", ""))
content = re.sub(r'\\(?!")', "", content.groups()[0])
arr = json.loads("[{}]".format(content))

for item in arr:
    data.append(
        {
            "lat": item[0],
            "lng": item[1],
            # item[2] is an HTML label: the name sits inside the <b> element
            # and the text after the closing tags gives the site type
            "name": re.sub(
                r"<.+?>", "", re.match(r"<b>(.+)</b>", item[2]).groups()[0]
            ),
            "type": re.sub(r".+>", "", item[2].replace("<br>", " ")),
            "url": "http://www.stone-circles.org.uk/stone/{}".format(
                re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0]
            ),
            # stone-circles.org.uk has no condition rating
            "quality": 5,
        }
    )
print("Generating GPX")
with open("Megaliths.gpx", "w") as gpx_file:
gpx_file.write(
"""<?xml version="1.0" encoding="UTF-8"?>
<gpx xmlns="http://www.topografix.com/GPX/1/1"
version="1.1"
creator="megalithosm"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd">
<metadata>
<name>Megalith sites</name>
<author>
<name>Jude Southworth</name>
</author>
</metadata>"""
)

    seen_sites = set()
    for poi in data:
        # Deduplicate on coordinates plus a whitespace-stripped, lowercased name
        norm_name = re.sub(r"\s", "", poi["name"]).lower()
        site_key = "{},{},{}".format(poi["lat"], poi["lng"], norm_name)
        if site_key in seen_sites:
            print("Omitting duplicate site: {}".format(poi["name"]))
            continue

        if poi["quality"] < args.quality:
            continue

        if poi["type"] in ["Museum", "Modern Stone Circle etc"]:
            print("Omitting uninteresting feature: {}".format(poi["name"]))
            continue

        seen_sites.add(site_key)

        name = "{} ({})".format(poi["name"], poi["type"].strip())
        gpx_file.write(
            """
    <wpt lat="{}" lon="{}">
        <name>{}</name>
        <desc>{}</desc>
    </wpt>""".format(
                poi["lat"], poi["lng"], escape(name), escape(poi["url"])
            )
        )

    gpx_file.write("\n</gpx>")
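
# Example invocation (assumes Python 3 with the `requests` package installed):
#   python gpx.py --quality 4 --refetch
# writes `Megaliths.gpx` to the working directory after refreshing both caches.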