# megalith-osm/gpx.py
import argparse
import csv
import json
import re
from html import escape
from pathlib import Path

import requests

parser = argparse.ArgumentParser(
    prog="megalithosm",
    description="Fetch data from megalith sources and produce a GPX file",
)
parser.add_argument(
    "-q",
    "--quality",
    default=5,
    type=int,
    help="Include sites of this quality or higher",
)
parser.add_argument(
    "-r",
    "--refetch",
    action="store_true",
    default=False,
    help="Ignore cached data and fetch it again",
)
args = parser.parse_args()
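
# Upper bound (exclusive) for megalithic.co.uk's numeric country IDs;
# countries 1 through 5 are fetched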
country_range = 6


def cache_stone_circles():
    """Fetch the stone-circles.org.uk coordinate data into the local cache."""
    response = requests.get(
        "http://www.stone-circles.org.uk/stone/Cluster/Coords/coords.js"
    )
    if response.status_code != 200:
        raise Exception(
            "Failed to fetch data from stone-circles.org.uk: {}".format(response.text)
        )

    Path("cache").mkdir(exist_ok=True)
    with open("cache/stone-circles.js", "w") as f:
        f.write(response.text)


def cache_megalithic():
    """Fetch one CSV of sites per country from megalithic.co.uk into the cache."""

    def megalithic_url(country):
        return "https://www.megalithic.co.uk/cache/csvmap_country{}.csv".format(country)

    for country in range(1, country_range):
        # Megalithic doesn't really want people scraping, so present a
        # browser User-Agent
        response = requests.get(
            megalithic_url(country),
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
            },
        )
        if response.status_code != 200:
            raise Exception(
                "Failed to fetch data from Megalithic.UK: {}".format(response.text)
            )

        Path("cache").mkdir(exist_ok=True)
        with open("cache/megalithic-{}.csv".format(country), "w") as f:
            f.write(response.text)


if args.refetch or not Path("cache/stone-circles.js").exists():
    print("Refreshing stone-circles.org.uk cache")
    cache_stone_circles()

if args.refetch or not all(
    Path("cache/megalithic-{}.csv".format(c)).exists() for c in range(1, country_range)
):
    print("Refreshing megalithic.co.uk cache")
    cache_megalithic()

print("Post-processing data")
data = []
for country in range(1, country_range):
    with open("cache/megalithic-{}.csv".format(country)) as f:
        reader = csv.DictReader(f, delimiter="|")
        for row in reader:
            data.append(
                {
                    "lat": row["lat"],
                    "lng": row["lng"],
                    "name": row["Name"],
                    "type": row["Type"],
                    "url": "https://megalithic.co.uk/article.php?sid={}".format(
                        row["SID"]
                    ),
                    "quality": int(row["Condition"]),
                }
            )
with open("cache/stone-circles.js") as f:
content = f.read()
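
# The cached file is a JavaScript assignment along the lines of
# `coords = [[lat, lng, "<html label>"], ...];`: pull out the array literal,
# strip stray backslash escapes, and parse it as JSON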
content = re.match(r".+ = \[(\[.+]),?];", content.replace("\n", ""))
content = re.sub(r'\\(?!")', "", content.groups()[0])
arr = json.loads("[{}]".format(content))

for item in arr:
    data.append(
        {
            "lat": item[0],
            "lng": item[1],
            # item[2] is an HTML label: the name sits inside the <b> element
            # and the text after the closing tags gives the site type
            "name": re.sub(
                r"<.+?>", "", re.match(r"<b>(.+)</b>", item[2]).groups()[0]
            ),
            "type": re.sub(r".+>", "", item[2].replace("<br>", " ")),
            "url": "http://www.stone-circles.org.uk/stone/{}".format(
                re.search(r"href=([a-zA-Z.]+)", item[2]).groups()[0]
            ),
            # stone-circles.org.uk has no condition rating
            "quality": 5,
        }
    )
print("Generating GPX")
with open("Megaliths.gpx", "w") as gpx_file:
gpx_file.write(
"""<?xml version="1.0" encoding="UTF-8"?>
<gpx xmlns="http://www.topografix.com/GPX/1/1"
version="1.1"
creator="megalithosm"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.topografix.com/GPX/1/1 http://www.topografix.com/GPX/1/1/gpx.xsd">
<metadata>
<name>Megalith sites</name>
<author>
<name>Jude Southworth</name>
</author>
</metadata>"""
)

    seen_sites = set()
    for poi in data:
        # Deduplicate on coordinates plus a whitespace-stripped, lowercased name
        norm_name = re.sub(r"\s", "", poi["name"]).lower()
        site_key = "{},{},{}".format(poi["lat"], poi["lng"], norm_name)
        if site_key in seen_sites:
            print("Omitting duplicate site: {}".format(poi["name"]))
            continue

        if poi["quality"] < args.quality:
            continue

        if poi["type"] in ["Museum", "Modern Stone Circle etc"]:
            print("Omitting uninteresting feature: {}".format(poi["name"]))
            continue

        seen_sites.add(site_key)

        name = "{} ({})".format(poi["name"], poi["type"].strip())
        gpx_file.write(
            """
    <wpt lat="{}" lon="{}">
        <name>{}</name>
        <desc>{}</desc>
    </wpt>""".format(
                poi["lat"], poi["lng"], escape(name), escape(poi["url"])
            )
        )

    gpx_file.write("\n</gpx>")
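
# Example invocation (assumes Python 3 with the `requests` package installed):
#   python gpx.py --quality 4 --refetch
# writes `Megaliths.gpx` to the working directory after refreshing both caches.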