# image downloader
from io import BytesIO
from glob import glob

from PIL import Image
import pandas as pd
import requests


def imgage_from_row(row: pd.Series, objids: list) -> None:
    """Download and save the image referenced by one dataframe row.

    Skips rows whose ``Image`` cell is missing (NaN), is not an https
    link, or whose ``objid`` already appears in ``objids`` (i.e. the
    image was saved on a previous run).

    Args:
        row: dataframe row with ``objid`` and ``Image`` columns.
        objids: object ids whose images are already on disk.
    """
    objid = row["objid"]
    imurl = row["Image"]
    # guard clauses: NaN cells come through as floats, so the isinstance
    # check doubles as a missing-value check
    if not isinstance(imurl, str) or "https" not in imurl:
        return
    if objid in objids:
        return  # already downloaded on a previous run
    # NOTE(review): verify=False deliberately skips SSL verification
    # (see README); drop it if certificate checking works for this host.
    resp = requests.get(imurl, timeout=30, verify=False)
    # fail loudly instead of silently saving an HTML error page as .png
    resp.raise_for_status()
    img = Image.open(BytesIO(resp.content))
    img.save("images/" + str(objid) + ".png")


if __name__ == "__main__":
    df = pd.read_csv("./EgyptBritishMuseum-2021-03-05CSVUnique.csv")
    # strip the "images/" prefix and ".png" suffix to recover object ids
    imgs = [im[len("images/"):-len(".png")] for im in glob("images/*.png")]
    for _, row in df.iterrows():
        imgage_from_row(row, objids=imgs)
# basic preprocessing functions
import pandas as pd
import re
import pprint
import json

# first integer or decimal number in a string; compiled once instead of
# per call inside get_nb
NB_MATCHER = re.compile(r"\d+(?:\.\d+)?")


def get_unique_serie(df: pd.DataFrame, colname: str) -> pd.Series:
    "Return the unique values of df[colname] as a Series."
    return pd.Series(df[colname].unique())


def get_uvals(uval: str, is_sep: bool = True) -> set:
    """Split a possibly ';'-separated cell value into a set of values.

    With is_sep=True each ';'-separated token is stripped and added
    individually; with is_sep=False the whole string is added as one
    stripped value. A value without ';' is added verbatim (unstripped),
    matching the original behaviour.
    """
    uvals = set()
    if ";" in uval:
        if is_sep:
            for t in uval.split(";"):
                uvals.add(t.strip() if t else "")
        else:
            uvals.add(uval.strip() if uval else "")
    else:
        uvals.add(uval)
    return uvals


def get_unique_values(df: pd.DataFrame, colname: str, is_sep: bool = True) -> set:
    "Collect the distinct (optionally ';'-split) values of a column."
    dfo_type = get_unique_serie(df, colname)
    otypes = set()
    for do in dfo_type:
        otypes |= get_uvals(do, is_sep)
    return otypes


def get_nb(s: str) -> float:
    "Return the first number found in s as a float, or None if absent."
    ss = NB_MATCHER.search(s)
    return None if ss is None else float(ss.group(0))


def get_unit(s: str) -> str:
    """Return the measurement unit found in s, lowercased and trimmed.

    Returns None when no known unit is present. Uses strip() rather than
    the original u[1:] slicing, which wrongly turned the space-less unit
    "cm" into "m".
    """
    units = [" meter", " centimetres", "cm", "m"]
    for u in units:
        if u in s:
            return u.strip().lower()


def get_dimension_type(s: str) -> str:
    "Dimension name before the ':' (e.g. 'Height: 12 cm' -> 'height')."
    return s.split(":").pop(0).strip().lower()


def get_dimension(col_value: str) -> dict:
    """Parse a ';'-separated dimensions cell.

    Returns {dimension_name: (unit, value)} with unit/value possibly None
    when they cannot be extracted from a part.
    """
    dims = {}
    for value in col_value.split(";"):
        dims[get_dimension_type(value)] = (get_unit(value), get_nb(value))
    return dims


def parse_dimension(df: pd.DataFrame):
    """Parse every unique 'Dimensions' cell of df.

    Returns (ds, dnames, dunits): the set of distinct parsed dimension
    tuples, the set of dimension names seen, and the set of units seen.
    """
    dfcol = get_unique_serie(df, colname="Dimensions")
    ds = set()
    dnames = set()
    dunits = set()
    for colvalue in dfcol:
        dimensions = get_dimension(colvalue)
        for dname, (dunit, dvals) in dimensions.items():
            dunits.add(dunit)
            dnames.add(dname)
        # sort by dimension name so equal parses hash identically
        ds.add(tuple(sorted(dimensions.items(), key=lambda x: x[0])))
    return ds, dnames, dunits


class DimensionPart:
    "One parsed dimension: a name, a numeric value and a unit."

    def __init__(self, n: str, v: float, u: str):
        self.name = n    # dimension name, e.g. "height"
        self.value = v   # numeric value
        self.unit = u    # unit string, e.g. "cm"

    def to_dict(self):
        "Nested-dict form: {name: {unit: value}}."
        return {self.name: {self.unit: self.value}}

    def to_tuple(self):
        "Flat tuple form: (name, unit, value)."
        return (self.name, self.unit, self.value)


def get_date_value(s: str):
    """Numeric value of a date string; BC dates come back negative.

    'thC' (e.g. '5thC BC') is treated as a century, i.e. multiplied by
    100. Returns None when no number is present.
    """
    dval = get_nb(s)
    if dval is None:
        return
    supp = s.upper()
    if "THC" in supp:
        dval = 100 * dval
    # original branched on AD/other but returned dval either way
    return -dval if "BC" in supp else dval


def is_circa(s: str):
    "True when the string marks an approximate ('circa') date."
    return "circa" in s


def get_date_part(s: str, nb: int):
    "nb-th '-'-separated part of s; the whole string when there is no '-'."
    splited = s.split("-")
    return splited.pop(0) if len(splited) == 1 else splited.pop(nb)


def get_start_date(s: str):
    "Start (first) date of a 'start-end' string as an int, or None."
    dval = get_date_value(get_date_part(s, 0))
    return dval if dval is None else int(dval)
def get_end_date(s: str):
    """End (second) date of a 'start-end' string as an int.

    Prints the offending string and returns None when no number can be
    parsed — kept as best-effort diagnostics, matching the original.
    """
    dval = get_date_value(get_date_part(s, 1))
    if dval is None:
        print(s)
        print(dval)
        return None
    return int(dval)


def get_date(s: str):
    """Parse a production-date string into {'start', 'end', 'is_circa'}.

    A few irregular museum strings are special-cased; everything else is
    split on '-' and parsed numerically (BC negative).
    """
    # irregular strings the generic '-'-splitting parser cannot handle
    special = {
        "Sixth century BC": (-599, -500, False),
        "(mid-late)5thC BC": (-450, -400, False),
        "Fourth century BC (?)": (-399, -300, True),
        "Fifth century BC (?)": (-499, -400, True),
    }
    if s in special:
        start, end, circa = special[s]
        return {"start": start, "end": end, "is_circa": circa}
    return {
        "start": get_start_date(s),
        "end": get_end_date(s),
        "is_circa": is_circa(s),
    }


def get_dates(dates: pd.Series):
    "Parse every date string and return the distinct results as tuples."
    ds = set()
    for d in dates:
        ddict = get_date(d)
        # dict keys are unique, so sorting by key alone is equivalent to
        # the original (key, value) sort and avoids comparing None values
        ds.add(tuple(sorted(ddict.items(), key=lambda x: x[0])))
    return ds


def _parse_column(values: dict, parse) -> list:
    "Apply parse to each non-NaN cell value; pass NaN cells through as-is."
    return [parse(val) if not pd.isna(val) else val for val in values.values()]


def dfjsonable(df_dict):
    """Convert df.to_dict() output into a JSON-serialisable dict.

    Date, dimension and multi-valued columns (Object type / Culture /
    Technique) are parsed into dicts/lists; everything else is copied
    through unchanged.
    """
    dd = {}
    for key, values in df_dict.items():
        if "Production date" in key:
            dd[key] = _parse_column(values, get_date)
        elif "Dimensions" in key:
            dd[key] = _parse_column(values, get_dimension)
        elif "Object type" in key or "Culture" in key or "Technique" in key:
            # the original had three identical branches for these columns
            dd[key] = _parse_column(values, lambda v: list(get_uvals(v, is_sep=True)))
        else:
            dd[key] = list(values.values())
    return dd


def wjson(dd, path="./data/EgyptBritishMuseum-2021-03-05v2.json"):
    "Write dd as pretty-printed UTF-8 JSON to path."
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(dd, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    path = "./data/EgyptBritishMuseum-2021-03-05CSVUnique.csv"
    df = pd.read_csv(path)
    dd = dfjsonable(df.to_dict())
    # wjson(dd)  # be sure to change path before uncommenting