# Rewilding-specimens/image_fetch.py

import glob
import time
import urllib.request

import requests
import rdflib

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'}

def find_image(rdf_doc):
    """
    Parse rdf_doc into an rdflib graph and return the object of the first
    triple (in graph iteration order) whose predicate is dwc:associatedMedia.

    Returns None when no triple in the graph uses that predicate.
    """
    associated_media = rdflib.URIRef("http://rs.tdwg.org/dwc/terms/associatedMedia")
    specimen_graph = rdflib.Graph()
    specimen_graph.parse(rdf_doc)
    # Lazily scan the triples and stop at the first match.
    matches = (
        target
        for _, predicate, target in specimen_graph
        if predicate == associated_media
    )
    return next(matches, None)


# Walk the barcode list and download the image of the first specimen that has
# not been scraped yet into specimen_img_raw/<barcode>.<ext>.
with open("barcode_cleaned.csv") as bcfile:
    for line in bcfile:
        # The barcode is the first CSV column; strip it so a line without a
        # comma does not carry its trailing newline into the URL and file name.
        barcode = line.split(",")[0].strip()
        if not barcode:
            # Blank line: nothing to fetch.
            continue

        # skip specimens we have already scraped
        if glob.glob("specimen_img_raw/" + barcode + "*"):
            print("we already have", barcode, "skipping to...")
            continue

        rdf_url = "https://www.botanicalcollections.be/specimen/" + barcode + "/rdf"
        print("fetching", rdf_url)
        img_url = find_image(rdf_url)
        if img_url is None:
            # No dwc:associatedMedia triple in the RDF document, so there is
            # no URL to take an extension from or to download.
            print("no image found for", barcode)
            continue

        # Everything after the last "." in the URL is used as the extension.
        img_ext = str(img_url).rsplit(".", 1)[1]
        print("image url:", img_url)
        save_path = "specimen_img_raw/" + barcode + "." + img_ext
        # A timeout keeps an unresponsive server from hanging the script.
        save_resp = requests.get(
            img_url, headers=HEADERS, allow_redirects=True, timeout=60
        )
        if not save_resp.ok:
            # Do not write an error page to disk: the glob check above would
            # treat it as an already-scraped image on the next run.
            print("download failed for", barcode, "status", save_resp.status_code)
            continue

        # Context manager guarantees the file is flushed and closed.
        with open(save_path, "wb") as img_file:
            img_file.write(save_resp.content)
        # NOTE(review): the loop stops after the first successful download;
        # this looks like a leftover from testing -- confirm before removing.
        break
initial image scraper code 2 years ago			`import glob`
			`import time`
			`import urllib.request`

			`import requests`
			`import rdflib`

			`HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'}`

			`def find_image(rdf_doc):`
			`"""`
			`Returns the first dwc:associatedMedia resource in the rdf_doc`
			`"""`
			`graph = rdflib.Graph()`
			`graph.parse(rdf_doc)`
			`mediaref = rdflib.URIRef("http://rs.tdwg.org/dwc/terms/associatedMedia")`
			`for subj, pred, obj in graph:`
			`if pred == mediaref:`
			`return obj`


			`with open("barcode_cleaned.csv") as bcfile:`
			`for line in bcfile:`
			`barcode = line.split(",")[0]`

			`# skip files we already have already scraped`
			`if glob.glob("specimen_img_raw/" + barcode + "*"):`
			`print("we already have", barcode, "skipping to...")`
			`continue`

			`rdf_url = "https://www.botanicalcollections.be/specimen/" + barcode + "/rdf"`
			`print("fetching", rdf_url)`
			`img_url = find_image(rdf_url)`
			`img_ext = str(img_url).rsplit(".", 1)[1]`
			`print("image url:", img_url)`
			`save_path = "specimen_img_raw/" + barcode + "." + img_ext`
			`save_resp = requests.get(img_url, headers=HEADERS, allow_redirects=True)`
			`open(save_path, "wb").write(save_resp.content)`
			`break`