import argparse
import csv
import json
import re
from html import escape
from pathlib import Path

import requests

parser = argparse.ArgumentParser(
    prog="megalithosm",
    description="Fetch data from megalith sources and produce a GPX file",
)
parser.add_argument(
    "-q",
    "--quality",
    default=5,
    type=int,
    help="Include sites of this quality or higher",
)
parser.add_argument("-r", "--refetch", action="store_true", default=False)

args = parser.parse_args()

# Megalithic.co.uk publishes one CSV per country, numbered 1 to 5.
country_range = 6


def cache_stone_circles():
    """Download the stone-circles.org.uk coordinate data into cache/."""
    response = requests.get(
        "http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js"
    )

    if response.status_code != 200:
        raise Exception(
            "Failed to fetch data from stone-circles.org.uk: {}".format(response.text)
        )

    Path("cache").mkdir(exist_ok=True)
    with open("cache/stone-circles.js", "w") as f:
        f.write(response.text)


def cache_megalithic():
    """Download the per-country CSV exports from megalithic.co.uk into cache/."""

    def megalithic_url(country):
        return "https://www.megalithic.co.uk/cache/csvmap_country{}.csv".format(
            country
        )

    for country in range(1, country_range):
        # Megalithic doesn't really want people scraping, so present a
        # browser-like User-Agent.
        response = requests.get(
            megalithic_url(country),
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
            },
        )

        if response.status_code != 200:
            raise Exception(
                "Failed to fetch data from Megalithic.UK: {}".format(response.text)
            )

        Path("cache").mkdir(exist_ok=True)
        with open("cache/megalithic-{}.csv".format(country), "w") as f:
            f.write(response.text)


if args.refetch or not Path("cache/stone-circles.js").exists():
    print("Refreshing stone-circles.org.uk cache")
    cache_stone_circles()

if args.refetch or not all(
    Path("cache/megalithic-{}.csv".format(c)).exists()
    for c in range(1, country_range)
):
    print("Refreshing megalithic.co.uk cache")
    cache_megalithic()

print("Post-processing data")

data = []

# The megalithic.co.uk CSVs are pipe-delimited with a header row.
for country in range(1, country_range):
    with open("cache/megalithic-{}.csv".format(country)) as f:
        reader = csv.DictReader(f, delimiter="|")

        types = set()
        for row in reader:
            types.add(row["Type"])
            data.append(
                {
                    "lat": row["lat"],
                    "lng": row["lng"],
                    "name": row["Name"],
                    "type": row["Type"],
                    "url": "https://megalithic.co.uk/article.php?sid={}".format(
                        row["SID"]
                    ),
                    "quality": int(row["Condition"]),
                }
            )

# coords.js is a JavaScript assignment, not plain JSON: extract the array
# literal and strip stray backslash escapes before parsing it.
with open("cache/stone-circles.js") as f:
    content = f.read()
    content = re.match(r".+ = \[(\[.+]),?];", content.replace("\n", ""))
    content = re.sub(r'\\(?!")', "", content.groups()[0])

    arr = json.loads("[{}]".format(content))

    # Each item is [lat, lng, html]: the HTML fragment carries the site
    # name, its type, and a link to the site's page.
    for item in arr:
        data.append(
            {
                "lat": item[0],
                "lng": item[1],
                "name": re.sub(
                    r"<.+?>", "", re.match(r"(.+)", item[2]).groups()[0]
                ),
                "type": re.sub(r".+>", "", item[2].replace("<br>", " ")),
                "url": "http://www.stone-circles.org.uk/stone/{}".format(
                    re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0]
                ),
                "quality": 5,
            }
        )

print("Generating GPX")

with open("Megaliths.gpx", "w") as gpx_file:
    gpx_file.write(
        """<?xml version="1.0" encoding="UTF-8"?>
<gpx version="1.1" creator="megalithosm" xmlns="http://www.topografix.com/GPX/1/1">
    <metadata>
        <name>Megalith sites</name>
        <author>
            <name>Jude Southworth</name>
        </author>
    </metadata>
"""
    )

    seen_sites = set()

    for poi in data:
        # Key on coordinates plus a whitespace-insensitive, lowercased name.
        norm_name = re.sub(r"\s", "", poi["name"]).lower()
        site_key = "{},{},{}".format(poi["lat"], poi["lng"], norm_name)

        if site_key in seen_sites:
            print("Omitting duplicate site: {}".format(poi["name"]))
            continue

        if poi["quality"] < args.quality:
            continue

        if poi["type"] in ["Museum", "Modern Stone Circle etc"]:
            print("Omitting uninteresting feature: {}".format(poi["name"]))
            continue

        # Deduplicate entries
        seen_sites.add(site_key)

        name = "{} ({})".format(poi["name"], poi["type"].strip())

        gpx_file.write(
            """    <wpt lat="{}" lon="{}">
        <name>{}</name>
        <link href="{}"/>
    </wpt>
""".format(
                poi["lat"], poi["lng"], escape(name), escape(poi["url"])
            )
        )

    gpx_file.write("""</gpx>\n""")