# image downloader
from io import BytesIO
from glob import glob

from PIL import Image
import pandas as pd
import requests


def imgage_from_row(row: pd.Series, objids: list) -> None:
    """Download and save the image referenced by one dataframe row.

    Skips rows whose ``Image`` cell is missing (NaN), is not an https
    link, or whose ``objid`` already appears in ``objids`` (i.e. the
    image was saved on a previous run).

    Args:
        row: dataframe row with ``objid`` and ``Image`` columns.
        objids: object ids whose images are already on disk.
    """
    objid = row["objid"]
    imurl = row["Image"]
    # guard clauses: NaN cells come through as floats, so the isinstance
    # check doubles as a missing-value check
    if not isinstance(imurl, str) or "https" not in imurl:
        return
    if objid in objids:
        return  # already downloaded on a previous run
    # NOTE(review): verify=False deliberately skips SSL verification
    # (see README); drop it if certificate checking works for this host.
    resp = requests.get(imurl, timeout=30, verify=False)
    # fail loudly instead of silently saving an HTML error page as .png
    resp.raise_for_status()
    img = Image.open(BytesIO(resp.content))
    img.save("images/" + str(objid) + ".png")


if __name__ == "__main__":
    df = pd.read_csv("./EgyptBritishMuseum-2021-03-05CSVUnique.csv")
    # strip the "images/" prefix and ".png" suffix to recover object ids
    imgs = [im[len("images/"):-len(".png")] for im in glob("images/*.png")]
    for _, row in df.iterrows():
        imgage_from_row(row, objids=imgs)
# basic preprocessing functions
import pandas as pd
import re
import pprint
import json

# first integer or decimal number in a string; compiled once instead of
# per call inside get_nb
NB_MATCHER = re.compile(r"\d+(?:\.\d+)?")


def get_unique_serie(df: pd.DataFrame, colname: str) -> pd.Series:
    "Return the unique values of df[colname] as a Series."
    return pd.Series(df[colname].unique())


def get_uvals(uval: str, is_sep: bool = True) -> set:
    """Split a possibly ';'-separated cell value into a set of values.

    With is_sep=True each ';'-separated token is stripped and added
    individually; with is_sep=False the whole string is added as one
    stripped value. A value without ';' is added verbatim (unstripped),
    matching the original behaviour.
    """
    uvals = set()
    if ";" in uval:
        if is_sep:
            for t in uval.split(";"):
                uvals.add(t.strip() if t else "")
        else:
            uvals.add(uval.strip() if uval else "")
    else:
        uvals.add(uval)
    return uvals


def get_unique_values(df: pd.DataFrame, colname: str, is_sep: bool = True) -> set:
    "Collect the distinct (optionally ';'-split) values of a column."
    dfo_type = get_unique_serie(df, colname)
    otypes = set()
    for do in dfo_type:
        otypes |= get_uvals(do, is_sep)
    return otypes


def get_nb(s: str) -> float:
    "Return the first number found in s as a float, or None if absent."
    ss = NB_MATCHER.search(s)
    return None if ss is None else float(ss.group(0))


def get_unit(s: str) -> str:
    """Return the measurement unit found in s, lowercased and trimmed.

    Returns None when no known unit is present. Uses strip() rather than
    the original u[1:] slicing, which wrongly turned the space-less unit
    "cm" into "m".
    """
    units = [" meter", " centimetres", "cm", "m"]
    for u in units:
        if u in s:
            return u.strip().lower()


def get_dimension_type(s: str) -> str:
    "Dimension name before the ':' (e.g. 'Height: 12 cm' -> 'height')."
    return s.split(":").pop(0).strip().lower()


def get_dimension(col_value: str) -> dict:
    """Parse a ';'-separated dimensions cell.

    Returns {dimension_name: (unit, value)} with unit/value possibly None
    when they cannot be extracted from a part.
    """
    dims = {}
    for value in col_value.split(";"):
        dims[get_dimension_type(value)] = (get_unit(value), get_nb(value))
    return dims


def parse_dimension(df: pd.DataFrame):
    """Parse every unique 'Dimensions' cell of df.

    Returns (ds, dnames, dunits): the set of distinct parsed dimension
    tuples, the set of dimension names seen, and the set of units seen.
    """
    dfcol = get_unique_serie(df, colname="Dimensions")
    ds = set()
    dnames = set()
    dunits = set()
    for colvalue in dfcol:
        dimensions = get_dimension(colvalue)
        for dname, (dunit, dvals) in dimensions.items():
            dunits.add(dunit)
            dnames.add(dname)
        # sort by dimension name so equal parses hash identically
        ds.add(tuple(sorted(dimensions.items(), key=lambda x: x[0])))
    return ds, dnames, dunits


class DimensionPart:
    "One parsed dimension: a name, a numeric value and a unit."

    def __init__(self, n: str, v: float, u: str):
        self.name = n    # dimension name, e.g. "height"
        self.value = v   # numeric value
        self.unit = u    # unit string, e.g. "cm"

    def to_dict(self):
        "Nested-dict form: {name: {unit: value}}."
        return {self.name: {self.unit: self.value}}

    def to_tuple(self):
        "Flat tuple form: (name, unit, value)."
        return (self.name, self.unit, self.value)


def get_date_value(s: str):
    """Numeric value of a date string; BC dates come back negative.

    'thC' (e.g. '5thC BC') is treated as a century, i.e. multiplied by
    100. Returns None when no number is present.
    """
    dval = get_nb(s)
    if dval is None:
        return
    supp = s.upper()
    if "THC" in supp:
        dval = 100 * dval
    # original branched on AD/other but returned dval either way
    return -dval if "BC" in supp else dval


def is_circa(s: str):
    "True when the string marks an approximate ('circa') date."
    return "circa" in s


def get_date_part(s: str, nb: int):
    "nb-th '-'-separated part of s; the whole string when there is no '-'."
    splited = s.split("-")
    return splited.pop(0) if len(splited) == 1 else splited.pop(nb)


def get_start_date(s: str):
    "Start (first) date of a 'start-end' string as an int, or None."
    dval = get_date_value(get_date_part(s, 0))
    return dval if dval is None else int(dval)
def get_end_date(s: str):
    """End (second) date of a 'start-end' string as an int.

    Prints the offending string and returns None when no number can be
    parsed — kept as best-effort diagnostics, matching the original.
    """
    dval = get_date_value(get_date_part(s, 1))
    if dval is None:
        print(s)
        print(dval)
        return None
    return int(dval)


def get_date(s: str):
    """Parse a production-date string into {'start', 'end', 'is_circa'}.

    A few irregular museum strings are special-cased; everything else is
    split on '-' and parsed numerically (BC negative).
    """
    # irregular strings the generic '-'-splitting parser cannot handle
    special = {
        "Sixth century BC": (-599, -500, False),
        "(mid-late)5thC BC": (-450, -400, False),
        "Fourth century BC (?)": (-399, -300, True),
        "Fifth century BC (?)": (-499, -400, True),
    }
    if s in special:
        start, end, circa = special[s]
        return {"start": start, "end": end, "is_circa": circa}
    return {
        "start": get_start_date(s),
        "end": get_end_date(s),
        "is_circa": is_circa(s),
    }


def get_dates(dates: pd.Series):
    "Parse every date string and return the distinct results as tuples."
    ds = set()
    for d in dates:
        ddict = get_date(d)
        # dict keys are unique, so sorting by key alone is equivalent to
        # the original (key, value) sort and avoids comparing None values
        ds.add(tuple(sorted(ddict.items(), key=lambda x: x[0])))
    return ds


def _parse_column(values: dict, parse) -> list:
    "Apply parse to each non-NaN cell value; pass NaN cells through as-is."
    return [parse(val) if not pd.isna(val) else val for val in values.values()]


def dfjsonable(df_dict):
    """Convert df.to_dict() output into a JSON-serialisable dict.

    Date, dimension and multi-valued columns (Object type / Culture /
    Technique) are parsed into dicts/lists; everything else is copied
    through unchanged.
    """
    dd = {}
    for key, values in df_dict.items():
        if "Production date" in key:
            dd[key] = _parse_column(values, get_date)
        elif "Dimensions" in key:
            dd[key] = _parse_column(values, get_dimension)
        elif "Object type" in key or "Culture" in key or "Technique" in key:
            # the original had three identical branches for these columns
            dd[key] = _parse_column(values, lambda v: list(get_uvals(v, is_sep=True)))
        else:
            dd[key] = list(values.values())
    return dd


def wjson(dd, path="./data/EgyptBritishMuseum-2021-03-05v2.json"):
    "Write dd as pretty-printed UTF-8 JSON to path."
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(dd, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    path = "./data/EgyptBritishMuseum-2021-03-05CSVUnique.csv"
    df = pd.read_csv(path)
    dd = dfjsonable(df.to_dict())
    # wjson(dd)  # be sure to change path before uncommenting