"""Download specimen images for the barcodes listed in a CSV file.

For every line of ``belgian_colony_data_all.csv`` the first comma-separated
field is taken as a barcode.  The RDF document for that barcode is fetched
from botanicalcollections.be, the first ``dwc:associatedMedia`` resource found
in it is downloaded, and the result is stored as
``specimen_img_raw/<barcode>.<ext>``.  Barcodes that already have a file in
that directory are skipped, so the script can be re-run to resume.
"""
import glob
import os
import time
import urllib.error
import urllib.parse
import urllib.request

import rdflib
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'}

CSV_PATH = "belgian_colony_data_all.csv"
IMAGE_DIR = "specimen_img_raw"
RDF_URL_TEMPLATE = "https://www.botanicalcollections.be/specimen/{barcode}/rdf"
MEDIA_PREDICATE = rdflib.URIRef("http://rs.tdwg.org/dwc/terms/associatedMedia")
# Seconds to wait for the image server before giving up on a download.
REQUEST_TIMEOUT = 60


def find_image(rdf_doc):
    """Return the first dwc:associatedMedia resource in the rdf_doc.

    Args:
        rdf_doc: Source handed to ``rdflib.Graph.parse`` (the script passes
            the URL of a specimen's RDF document).

    Returns:
        The object of the first triple whose predicate is
        ``dwc:associatedMedia``, or ``None`` when the graph has no such
        triple.  "First" follows the graph's own iteration order.

    Raises:
        urllib.error.URLError: Propagated from ``Graph.parse`` when the
            document cannot be retrieved (``HTTPError`` is a subclass).
    """
    graph = rdflib.Graph()
    graph.parse(rdf_doc)
    return next(
        (obj for _subj, pred, obj in graph if pred == MEDIA_PREDICATE),
        None,
    )


def _already_downloaded(barcode):
    """Return True if IMAGE_DIR already holds a ``<barcode>.<ext>`` file."""
    # Match "<barcode>.*" rather than "<barcode>*" so that a barcode which is
    # a prefix of another one is not mistaken for it; escape the barcode so
    # glob metacharacters in the data are treated literally.
    pattern = os.path.join(IMAGE_DIR, glob.escape(barcode) + ".*")
    return bool(glob.glob(pattern))


def _image_extension(img_url):
    """Return the file extension (without dot) of img_url's path, or None."""
    # Look only at the URL path so query strings / fragments and dots in the
    # host name cannot leak into the file name.
    url_path = urllib.parse.urlsplit(str(img_url)).path
    return os.path.splitext(url_path)[1][1:] or None


os.makedirs(IMAGE_DIR, exist_ok=True)

with open(CSV_PATH) as bcfile:
    for line in bcfile:
        barcode = line.split(",")[0].strip()
        if not barcode:
            # Blank line or empty first column: nothing to look up.
            continue

        # skip files we have already scraped
        if _already_downloaded(barcode):
            print("we already have", barcode, "skipping to...")
            continue

        rdf_url = RDF_URL_TEMPLATE.format(barcode=barcode)
        print("fetching", rdf_url)
        try:
            img_url = find_image(rdf_url)
        except urllib.error.URLError as er:
            # URLError also covers HTTPError; one unreachable document
            # should not abort the whole run.
            print("Error on ", rdf_url, str(er))
            continue
        if img_url is None:
            print("Error. No image for barcode.", barcode)
            continue

        img_ext = _image_extension(img_url)
        if img_ext is None:
            print("Error. Cannot determine image extension for", img_url)
            continue
        print("image url:", img_url)

        save_path = os.path.join(IMAGE_DIR, barcode + "." + img_ext)
        try:
            save_resp = requests.get(
                img_url,
                headers=HEADERS,
                allow_redirects=True,
                timeout=REQUEST_TIMEOUT,
            )
            # Do not store an HTTP error page as if it were the image: the
            # "already downloaded" check would then skip it on every re-run.
            save_resp.raise_for_status()
        except requests.RequestException as er:
            print("Error on ", img_url, str(er))
            continue

        with open(save_path, "wb") as img_file:
            img_file.write(save_resp.content)

        # Pause between downloads to avoid hammering the server.
        time.sleep(1)