├── .gitignore ├── AssemblingOntology.py ├── AssemblingTerms.py ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── FOSMAP.json ├── FOSMAP_700.json ├── InterimTerms.json ├── LICENSE ├── MatchedFOS.json ├── Methodology.md ├── OSDG-Ontology.json ├── OSDG-Ontology.xlsx ├── OSDG-Ontology_ver-min-1.json ├── OSDG_DATA_SOURCES.md ├── README.md ├── comparison_fos_update.xlsx ├── images ├── Methodology-visual_0511_Updated.png ├── OSDG.png └── OSDG_new.png ├── raw_data ├── 0_add │ ├── 00_add_validated │ │ ├── 0_PuigOntology │ │ │ ├── 0_ProcessedKeyTerms.json │ │ │ ├── 0_process_key_terms.py │ │ │ └── Ontology.csv │ │ └── 6_SDGIO_terms │ │ │ ├── 6_ProcessedKeyTerms.json │ │ │ ├── 6_process_key_terms.py │ │ │ └── SDG Terms by Indicator.xlsx │ ├── 01_add_generated │ │ ├── 1_FP7-4-SD_edited │ │ │ ├── 1_ProcessedKeyTerms.json │ │ │ ├── 1_process_key_terms.py │ │ │ ├── FOSMAP.json │ │ │ ├── NewWU.json │ │ │ ├── ProjectFOS.json │ │ │ ├── WU_projectSDGs.json │ │ │ └── bad_fos.py │ │ ├── 2_LinkedSDG_Concepts │ │ │ ├── 2_ProcessedKeyTerms.json │ │ │ ├── 2_process_key_terms.py │ │ │ └── LinkedSDG_Data.xlsx │ │ ├── 3_SDGPathfinder_DocumentConcepts │ │ │ ├── 3_ProcessedKeyTerms.json │ │ │ ├── 3_process_key_terms.py │ │ │ └── OECD_SDG_betas.xlsx │ │ ├── 4_SDGPathfinder_Keywords │ │ │ ├── 4_ProcessedKeyTerms.json │ │ │ ├── 4_process_key_terms.py │ │ │ └── keywords.csv │ │ ├── 5_LinkedSDG_DocumentExtracts │ │ │ ├── 5_ProcessedKeyTerms.json │ │ │ ├── 5_process_key_terms.py │ │ │ └── LinkedSDG_DocumentExtracts.xlsx │ │ ├── 7_EC_Policy_Doc_Terms │ │ │ ├── 7_ProcessedKeyTerms.json │ │ │ ├── 7_process_key_terms.py │ │ │ └── ECPolicyDocs_Ngrams REVISED.xlsx │ │ └── 9_SIRIS_Science4SDGs │ │ │ ├── 9_ProcessedKeyTerms.json │ │ │ ├── 9_process_key_terms.py │ │ │ ├── sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx │ │ │ └── sdg_vocabulary_V1.2 [zenodo].xlsx │ ├── 02_add_all_to_all │ │ ├── 10_PPMI_boost │ │ │ ├── 10_ProcessedFOS.json │ │ │ ├── 10_process_fos.py │ │ │ ├── SDG FOS updated 06 01.xlsx │ │ │ └── 
SDG FOS updated 06 12.xlsx │ │ └── 8_NABS_FOS │ │ │ ├── 8_ProcessedFOS.json │ │ │ ├── 8_process_fos.py │ │ │ └── NABS_FOS_update_2020-08-20_ed_VS.xlsx │ ├── GeneratedSdgTerms.json │ └── ValidatedSdgTerms.json ├── 1_replace │ ├── 11_TJL-24_review │ │ ├── 11_ReplaceFOS.json │ │ ├── 11_process_replace_fos.py │ │ └── osdg_fos_paper_citation_counts_REPLACE_v2_ed_VS.xlsx │ ├── 12_Review_2020-10-02 │ │ ├── 12_ReplaceFOS.json │ │ ├── 12_process_replace_fos.py │ │ └── replace-review_2020-10-02.csv │ └── ReplacedFOS.xlsx ├── 2_remove │ ├── 20_FP7-4-SD_edited │ │ ├── 20_RemoveFOS.json │ │ ├── 20_process_remove_fos.py │ │ └── bad_fos.csv │ ├── 21_8_NABS_FOS │ │ ├── 21_RemoveFOS.json │ │ ├── 21_process_remove_fos.py │ │ └── NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx │ ├── 22_TJL-24_review │ │ ├── 22_RemoveFOS.json │ │ ├── 22_process_remove_fos.py │ │ └── osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx │ ├── 23_Restructuring_review │ │ ├── 23_RemoveFOS.json │ │ ├── 23_process_remove_fos.py │ │ └── sdg-fos_restructuring-v3_to-remove.xlsx │ ├── 24_Review_2020-10-02 │ │ ├── 24_RemoveFOS.json │ │ ├── 24_process_remove_fos.py │ │ └── remove-review_2020-10-02.csv │ ├── 25_TOL-7_MostPopularSDG3FOS │ │ ├── 25_RemoveFOS.json │ │ ├── 25_process_remove_fos.py │ │ └── TOL-7_MostPopularSDG3RemoveFOS.csv │ └── RemovedFOS.xlsx └── 3_blacklist │ ├── 30_8_NABS_FOS │ ├── 30_BlacklistFOS.csv │ ├── 30_process_blacklist_fos.py │ └── NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx │ ├── AssembleBlacklist.py │ └── Blacklist.csv ├── requirements.txt ├── sampleAPICall.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .DS_Store 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | 
var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # VSCode project settings 115 | .vscode 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /AssemblingOntology.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import cpu_count 2 | from tqdm import tqdm 3 | from utils import process_fosname, levenshtein_ratio, sws, sdg_label_sort 4 | 5 | import concurrent.futures 6 | import json 7 | import os 8 | import pandas as pd 9 | 10 | 11 | def process_add_all_to_all_fos(): 12 | path = 'raw_data/0_add/02_add_all_to_all' 13 | processed_fos = dict() 14 | add_all_to_all_data_paths = [ 15 | f'{path}/{directory_name}' 16 | for directory_name in os.listdir(path) 17 | if '.' 
not in directory_name 18 | ] 19 | for directory in add_all_to_all_data_paths: 20 | try: 21 | processed_sdg_fos_fname = list(filter(lambda oname: '_ProcessedFOS.json' in oname, os.listdir(directory)))[0] 22 | except IndexError: 23 | print('Sdg FOS are not processed in {directory}') 24 | continue 25 | with open(f'{directory}/{processed_sdg_fos_fname}', 'r') as file_: 26 | processed_sdg_fos = json.load(file_) 27 | for sdg_label, fos in processed_sdg_fos.items(): 28 | if sdg_label not in processed_fos.keys(): 29 | processed_fos[sdg_label] = set() 30 | processed_fos[sdg_label].update(map(lambda x: (str(x[0]), x[1]), fos)) 31 | 32 | return processed_fos 33 | 34 | 35 | def process_replace_fos(): 36 | replace_fos = [] 37 | 38 | path = 'raw_data/1_replace' 39 | add_replace_data_paths = sorted([ 40 | f'{path}/{directory_name}' 41 | for directory_name in os.listdir(path) 42 | if '.' not in directory_name 43 | ], 44 | key=lambda x: int(x.split('/')[-1].split('_')[0])) 45 | 46 | for directory in add_replace_data_paths: 47 | try: 48 | processed_replace_fos_fname = list(filter(lambda oname: '_ReplaceFOS.json' in oname, os.listdir(directory)))[0] 49 | except IndexError: 50 | print('Sdg replace FOS are not processed in {directory}') 51 | continue 52 | with open(f'{directory}/{processed_replace_fos_fname}', 'r') as file_: 53 | processed_replace_fos = json.load(file_) 54 | for fos_id, moves in processed_replace_fos.items(): 55 | for move in moves: 56 | replace_fos.append((str(fos_id), move)) 57 | 58 | return replace_fos 59 | 60 | 61 | def process_remove_fos(): 62 | remove_fos = dict() 63 | 64 | path = 'raw_data/2_remove' 65 | add_remove_data_paths = [ 66 | f'{path}/{directory_name}' 67 | for directory_name in os.listdir(path) 68 | if '.' 
not in directory_name 69 | ] 70 | for directory in add_remove_data_paths: 71 | try: 72 | processed_remove_fos_fname = list(filter(lambda oname: '_RemoveFOS.json' in oname, os.listdir(directory)))[0] 73 | except IndexError: 74 | print('Sdg remove FOS are not processed in {directory}') 75 | continue 76 | with open(f'{directory}/{processed_remove_fos_fname}', 'r') as file_: 77 | processed_remove_fos = json.load(file_) 78 | 79 | for sdg_label, fos_ids in processed_remove_fos.items(): 80 | if sdg_label not in remove_fos.keys(): 81 | remove_fos[sdg_label] = set() 82 | remove_fos[sdg_label].update(map(lambda fos_id: str(fos_id), fos_ids)) 83 | 84 | return remove_fos 85 | 86 | 87 | with open("InterimTerms.json", "r") as file_: 88 | sdg_terms = json.loads(file_.read()) 89 | 90 | with open('FOSMAP_700.json', 'r') as file_: 91 | fos_map_700 = json.load(file_) 92 | 93 | with open("FOSMAP.json", "r") as file_: 94 | fos_map = json.loads(file_.read()) 95 | fos_to_match = [(fos_id, process_fosname(fos_name)) for fos_id, fos_name in fos_map.items()] 96 | 97 | 98 | """ 99 | Matching with Fields of Study from MS Academic (v10-10-2019) 100 | Match criteria: 101 | all tokens from a concept must be present in FOS name 102 | levenstein similarity between concept and FOS name must be > 0.85 103 | """ 104 | sdg_matched_fos = dict() 105 | 106 | 107 | def _match_terms_to_fos(sdg_label, terms, fos_to_match, sws, use_pbar, total): 108 | sdg_matched_fos = dict() 109 | if use_pbar: 110 | step = total // len(terms) 111 | total = step * len(terms) 112 | p_bar = tqdm(terms, desc=f'Processing {sdg_label}', total=total, leave=True) 113 | for term, sources in terms: 114 | matched_fos = [] 115 | term_parts = list(filter(lambda w: w not in sws, term.split())) 116 | for fos_id, fos_name in fos_to_match: 117 | if all(p in fos_name for p in term_parts) and levenshtein_ratio(term, fos_name) > 0.85: 118 | matched_fos.append([str(fos_id), fos_name]) 119 | 120 | matched_fos = sorted(matched_fos, key=lambda x: 
x[1]) 121 | matched_fos_ids, matched_fos_names = list(map(lambda x: x[0], matched_fos)), list(map(lambda x: x[1], matched_fos)) 122 | sdg_matched_fos[term] = { 123 | "sources": sorted(sources), 124 | "matched_FOS_ids": matched_fos_ids, 125 | "matched_FOS_names": matched_fos_names 126 | } 127 | 128 | if use_pbar: 129 | p_bar.update(step) 130 | if use_pbar: 131 | p_bar.close() 132 | 133 | return sdg_label, sdg_matched_fos 134 | 135 | 136 | n_workers = cpu_count() - 1 137 | for sdg_label, terms in sdg_terms.items(): 138 | terms = list(terms.items()) 139 | term_batches = [] 140 | bs = (len(terms) + n_workers - 1) // n_workers 141 | for i in range(n_workers): 142 | batch = terms[i*bs:(i+1)*bs] 143 | if batch: 144 | term_batches.append(batch) 145 | with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor: 146 | futures = [] 147 | for i, batch in enumerate(term_batches): 148 | use_pbar = i == (len(term_batches) - 2) 149 | futures.append(executor.submit( 150 | _match_terms_to_fos, 151 | sdg_label, batch, fos_to_match[:], sws, 152 | use_pbar=use_pbar, total=len(terms) 153 | )) 154 | 155 | for future in concurrent.futures.as_completed(futures): 156 | sdg_label, matched_fos = future.result() 157 | if sdg_label not in sdg_matched_fos.keys(): 158 | sdg_matched_fos[sdg_label] = dict() 159 | sdg_matched_fos[sdg_label].update(matched_fos) 160 | 161 | sdg_labels = sorted(sdg_matched_fos.keys(), key=sdg_label_sort) 162 | sdg_matched_fos = { 163 | sdg_label: { 164 | fos: sdg_matched_fos[sdg_label][fos] for fos in sorted(sdg_matched_fos[sdg_label].keys()) 165 | } for sdg_label in sdg_labels 166 | } 167 | with open("MatchedFOS.json", "w") as file_: 168 | json.dump(sdg_matched_fos, file_) 169 | 170 | 171 | sdg_fos = dict() 172 | for sdg_label, sdg_term_data in sdg_matched_fos.items(): 173 | foses = set() 174 | for term_data in list(sdg_term_data.values()): 175 | foses.update(term_data['matched_FOS_ids']) 176 | sdg_fos[sdg_label] = foses 177 | 178 | print('\n\n\t--- 
Percentage of matched FOS ---') 179 | for sdg_label, sdg_term_data in sdg_matched_fos.items(): 180 | c = sum(not term_data["matched_FOS_ids"] for term_data in sdg_term_data.values()) 181 | print(f'\t{sdg_label} - {100 - int(c * 100 / len(sdg_term_data))}%') 182 | 183 | 184 | """ 185 | Adding 0_add/02_all_to_all FOS 186 | """ 187 | processed_all_to_all_fos = process_add_all_to_all_fos() 188 | for sdg_label, foses in processed_all_to_all_fos.items(): 189 | print(f'{sdg_label} - {len(foses)}') 190 | fos_ids = list(map(lambda fos: fos[0], foses)) 191 | if sdg_label not in sdg_fos.keys(): 192 | sdg_fos[sdg_label] = set() 193 | sdg_fos[sdg_label].update(fos_ids) 194 | 195 | 196 | """ 197 | Replacing 1_replace/ FOS 198 | """ 199 | data_replaced_fos = {'fos_id': [], 'fos_name': [], 'from_sdg': [], 'to_sdg': []} 200 | processed_replace_fos = process_replace_fos() 201 | for fos_id, move in processed_replace_fos: 202 | fos_name = fos_map_700.get(fos_id, '') 203 | from_sdg, to_sdg = move 204 | try: 205 | sdg_fos[from_sdg].remove(fos_id) 206 | except KeyError: 207 | from_sdg = '' 208 | sdg_fos[to_sdg].add(fos_id) 209 | 210 | data_replaced_fos['fos_id'].append(fos_id) 211 | data_replaced_fos['fos_name'].append(fos_name) 212 | data_replaced_fos['from_sdg'].append(from_sdg) 213 | data_replaced_fos['to_sdg'].append(to_sdg) 214 | 215 | 216 | df_replaced = pd.DataFrame(data_replaced_fos) 217 | df_replaced.to_excel('raw_data/1_replace/ReplacedFOS.xlsx', index=False) 218 | 219 | """ 220 | Removing 2_remove/ FOS 221 | """ 222 | data_removed_fos = {'sdg_label': [], 'fos_id': [], 'fos_name': []} 223 | removed_fos = dict() 224 | processed_remove_fos = process_remove_fos() 225 | for sdg_label, fos_to_remove in processed_remove_fos.items(): 226 | if sdg_label not in removed_fos.keys(): 227 | removed_fos[sdg_label] = set() 228 | 229 | if sdg_label in sdg_fos.keys(): 230 | removed_fos[sdg_label].update(sdg_fos[sdg_label].intersection(fos_to_remove)) 231 | sdg_fos[sdg_label] = 
sdg_fos[sdg_label].difference(fos_to_remove) 232 | else: 233 | removed_fos[sdg_label] = [] 234 | 235 | for sdg_label, rm_fos_ids in removed_fos.items(): 236 | for fos_id in rm_fos_ids: 237 | fos_name = fos_map_700.get(str(fos_id)) 238 | if not fos_name: 239 | fos_name = '' 240 | data_removed_fos['sdg_label'].append(sdg_label) 241 | data_removed_fos['fos_id'].append(fos_id) 242 | data_removed_fos['fos_name'].append(fos_name) 243 | 244 | df_removed = pd.DataFrame(data_removed_fos).sort_values(['sdg_label', 'fos_name']) 245 | df_removed.to_excel('raw_data/2_remove/RemovedFOS.xlsx', index=False) 246 | 247 | """ 248 | Writing to file 249 | """ 250 | for sdg_label, fos_ids in sdg_fos.items(): 251 | sdg_fos[sdg_label] = sorted(fos_ids) 252 | 253 | print("\n\t--- Final FOS Count ---") 254 | for sdg_label, foses in sdg_fos.items(): 255 | print(f'\t{sdg_label} - {len(foses)}') 256 | 257 | with open('OSDG-Ontology.json', 'r') as file_: 258 | sdg_fos_old = json.load(file_) 259 | 260 | with open('OSDG-Ontology_ver-min-1.json', 'w') as file_: 261 | json.dump(sdg_fos_old, file_) 262 | 263 | with open("OSDG-Ontology.json", "w") as file_: 264 | json.dump(sdg_fos, file_) 265 | 266 | # Representative OSDG-Ontology 267 | data_ontology = {'SDG label': [], 'FOS-ID': [], 'FOS-Name': [], 'Link to MAG': []} 268 | for sdg_label, fos_ids in sdg_fos.items(): 269 | sdg_nr = int(sdg_label.split('_')[1]) 270 | for fos_id in fos_ids: 271 | fos_name = fos_map_700.get(fos_id, None) 272 | mag_link = f'https://academic.microsoft.com/topic/{fos_id}' 273 | data_ontology['SDG label'].append(sdg_nr) 274 | data_ontology['FOS-ID'].append(fos_id) 275 | data_ontology['FOS-Name'].append(fos_name) 276 | data_ontology['Link to MAG'].append(mag_link) 277 | 278 | df_ontology = pd.DataFrame(data_ontology).sort_values(['SDG label', 'FOS-Name', 'FOS-ID']) 279 | df_ontology['SDG label'] = df_ontology['SDG label'].apply(lambda sdg_nr: f'SDG_{sdg_nr}') 280 | 281 | df_ontology.to_excel('OSDG-Ontology.xlsx', index=False) 
282 | 283 | 284 | """ 285 | Comparing to the last SdgFOS.json version 286 | """ 287 | with open('raw_data/0_add/02_add_all_to_all/8_NABS_FOS/8_ProcessedFOS.json', 'r') as file_: 288 | nabs = json.load(file_) 289 | 290 | with open('raw_data/0_add/02_add_all_to_all/10_PPMI_boost/10_ProcessedFOS.json', 'r') as file_: 291 | boost = json.load(file_) 292 | 293 | data = { 294 | 'sdg': [], 295 | 'add_or_remove': [], 296 | 'fos_id': [], 'fos_name': [], 297 | 'sources': [], 'isinReplaced': [], 'isinRemoved':[] 298 | } 299 | for sdg_label in sorted(set(list(sdg_fos.keys()) + list(sdg_fos_old.keys())), key=sdg_label_sort): 300 | old_foses = sdg_fos_old.get(sdg_label, []) 301 | new_foses = sdg_fos.get(sdg_label, []) 302 | 303 | added_foses = list(set(new_foses).difference(old_foses)) 304 | removed_foses = list(set(old_foses).difference(new_foses)) 305 | 306 | # Added 307 | for fos_id in added_foses: 308 | fos_name = fos_map_700[fos_id] 309 | 310 | sources = set() 311 | for mterm, mterm_data in sdg_matched_fos[sdg_label].items(): 312 | if fos_id in mterm_data['matched_FOS_ids']: 313 | sources.update(mterm_data['sources']) 314 | 315 | # 8 Nabs & 10 boost aka ATA 316 | nabs_fos_ids = list(map(lambda fos: fos[0], nabs.get(sdg_label, []))) 317 | boost_fos_ids = list(map(lambda fos: fos[0], boost.get(sdg_label, []))) 318 | if fos_id in nabs_fos_ids: 319 | sources.add('8_NABS_FOS') 320 | if fos_id in boost_fos_ids: 321 | sources.add('10_PPMI_fos') 322 | 323 | # Replaced 324 | isin_replaced = fos_id in df_replaced[df_replaced.to_sdg == sdg_label].fos_id.astype(str).tolist() 325 | 326 | data['sdg'].append(sdg_label) 327 | data['add_or_remove'].append('add') 328 | data['fos_id'].append(fos_id) 329 | data['fos_name'].append(fos_name) 330 | data['sources'].append(list(sources) if list(sources) else None) 331 | data['isinReplaced'].append(isin_replaced) 332 | data['isinRemoved'].append(False) 333 | 334 | # Removed 335 | for fos_id in removed_foses: 336 | fos_name = fos_map_700[fos_id] 337 | 
338 | sources = set() 339 | for mterm, mterm_data in sdg_matched_fos[sdg_label].items(): 340 | if fos_id in mterm_data['matched_FOS_ids']: 341 | sources.update(mterm_data['sources']) 342 | 343 | # 8 Nabs & 10 boost aka ATA 344 | nabs_fos_ids = list(map(lambda fos: fos[0], nabs.get(sdg_label, []))) 345 | boost_fos_ids = list(map(lambda fos: fos[0], boost.get(sdg_label, []))) 346 | if fos_id in nabs_fos_ids: 347 | sources.add('8_NABS_FOS') 348 | if fos_id in boost_fos_ids: 349 | sources.add('10_PPMI_fos') 350 | 351 | # Replaced 352 | isin_replaced = fos_id in df_replaced[df_replaced.from_sdg == sdg_label].fos_id.astype(str).tolist() 353 | isin_removed = fos_id in df_removed[df_removed.sdg_label == sdg_label].fos_id.astype(str).tolist() 354 | 355 | data['sdg'].append(sdg_label) 356 | data['add_or_remove'].append('removed') 357 | data['fos_id'].append(fos_id) 358 | data['fos_name'].append(fos_name) 359 | data['sources'].append(list(sources) if list(sources) else None) 360 | data['isinReplaced'].append(isin_replaced) 361 | data['isinRemoved'].append(isin_removed) 362 | 363 | df_comparison = pd.DataFrame(data).sort_values(['add_or_remove', 'isinReplaced', 'isinRemoved', 'sdg']) 364 | df_comparison.to_excel('comparison_fos_update.xlsx', index=False) 365 | 366 | -------------------------------------------------------------------------------- /AssemblingTerms.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from utils import sdg_label_sort 3 | 4 | import json 5 | import os 6 | 7 | 8 | INTER_ADD_PATH = 'raw_data/0_add' 9 | 10 | add_validated_data_paths = [ 11 | f'{INTER_ADD_PATH}/00_add_validated/{directory_name}' 12 | for directory_name in os.listdir(f'{INTER_ADD_PATH}/00_add_validated') 13 | if '.' 
not in directory_name 14 | ] 15 | 16 | add_generated_data_paths = [ 17 | f'{INTER_ADD_PATH}/01_add_generated/{directory_name}' 18 | for directory_name in os.listdir(f'{INTER_ADD_PATH}/01_add_generated') 19 | if '.' not in directory_name 20 | ] 21 | 22 | add_all_to_all_data_paths = [ 23 | f'{INTER_ADD_PATH}/02_add_all_to_all/{directory_name}' 24 | for directory_name in os.listdir(f'{INTER_ADD_PATH}/02_add_all_to_all') 25 | if '.' not in directory_name 26 | ] 27 | 28 | 29 | # Gather *_ProcessedKeyTerms ----- 30 | sdg_terms_add_validated, sdg_terms_add_generated = dict(), dict() 31 | term_sources = dict() 32 | 33 | # Validated 34 | for directory in add_validated_data_paths: 35 | try: 36 | processed_sdg_terms_fname = list(filter(lambda oname: '_ProcessedKeyTerms.json' in oname, os.listdir(directory)))[0] 37 | with open(f'{directory}/{processed_sdg_terms_fname}', 'r') as file_: 38 | processed_sdg_terms = json.load(file_) 39 | processed_sdg_terms = {sdg_label: processed_sdg_terms[sdg_label] for sdg_label in sorted(processed_sdg_terms.keys())} 40 | except IndexError: 41 | print(f'Sdg Terms are not processed in {directory}') 42 | continue 43 | 44 | for sdg_label, terms in processed_sdg_terms.items(): 45 | if sdg_label not in sdg_terms_add_validated.keys(): 46 | sdg_terms_add_validated[sdg_label] = set() 47 | sdg_terms_add_validated[sdg_label].update(terms) 48 | 49 | # Update term sources 50 | if sdg_label not in term_sources.keys(): 51 | term_sources[sdg_label] = OrderedDict() 52 | for term in sdg_terms_add_validated[sdg_label]: 53 | if term not in term_sources[sdg_label].keys(): 54 | term_sources[sdg_label][term] = [] 55 | term_sources[sdg_label][term].append(directory.split('/')[-1]) 56 | 57 | # All to all # TODO leave it for matching? if not, it goes into assembling sdg_fos_script. 
Must be checked for conflicts when assembling generated 58 | for directory in add_all_to_all_data_paths: 59 | try: 60 | processed_sdg_fos_fname = list(filter(lambda oname: '_ProcessedFOS.json' in oname, os.listdir(directory)))[0] 61 | with open(f'{directory}/{processed_sdg_fos_fname}', 'r') as file_: 62 | processed_sdg_fos = json.load(file_) 63 | processed_sdg_fos = {sdg_label: processed_sdg_fos[sdg_label] for sdg_label in sorted(processed_sdg_fos.keys())} 64 | except IndexError: 65 | print(f'Sdg FOS are not processed in {directory}') 66 | continue 67 | 68 | for sdg_label, foses in processed_sdg_fos.items(): 69 | terms = list(map(lambda x: x[1], foses)) # TODO All to all has ids and might move to Assemblign SdgFos script 70 | if sdg_label not in sdg_terms_add_validated.keys(): 71 | sdg_terms_add_validated[sdg_label] = set() 72 | sdg_terms_add_validated[sdg_label].update(terms) 73 | 74 | # Update term sources 75 | if sdg_label not in term_sources.keys(): 76 | term_sources[sdg_label] = OrderedDict() 77 | for term in sdg_terms_add_validated[sdg_label]: 78 | if term in terms: 79 | if term not in term_sources[sdg_label].keys(): 80 | term_sources[sdg_label][term] = [] 81 | term_sources[sdg_label][term].append(directory.split('/')[-1]) 82 | 83 | sdg_terms_add_validated = { 84 | sdg_label: sorted(list(sdg_terms_add_validated[sdg_label])) 85 | for sdg_label in sorted(sdg_terms_add_validated.keys(), key=sdg_label_sort) 86 | } 87 | 88 | with open(f'{INTER_ADD_PATH}/ValidatedSdgTerms.json', 'w') as file_: 89 | json.dump(sdg_terms_add_validated, file_) 90 | 91 | 92 | # Generated 93 | gen_term_sources = dict() 94 | 95 | for directory in add_generated_data_paths: 96 | try: 97 | processed_sdg_terms_fname = list(filter(lambda oname: '_ProcessedKeyTerms.json' in oname, os.listdir(directory)))[0] 98 | with open(f'{directory}/{processed_sdg_terms_fname}', 'r') as file_: 99 | processed_sdg_terms = json.load(file_) 100 | processed_sdg_terms = {sdg_label: processed_sdg_terms[sdg_label] 
for sdg_label in sorted(processed_sdg_terms.keys())} 101 | except IndexError: 102 | print(f'Sdg Terms are not processed in {directory}') 103 | continue 104 | 105 | for sdg_label, terms in processed_sdg_terms.items(): 106 | if sdg_label not in sdg_terms_add_generated.keys(): 107 | sdg_terms_add_generated[sdg_label] = set() 108 | sdg_terms_add_generated[sdg_label].update(terms) 109 | 110 | # Update gen term sources 111 | for term in sdg_terms_add_generated[sdg_label]: 112 | if term not in term_sources[sdg_label].keys(): 113 | term_sources[sdg_label][term] = [] 114 | term_sources[sdg_label][term].append(directory.split('/')[-1]) 115 | 116 | term_dist = OrderedDict() 117 | for terms in sdg_terms_add_generated.values(): 118 | for term in terms: 119 | if term not in term_dist.keys(): 120 | term_dist[term] = 1 121 | else: 122 | term_dist[term] += 1 123 | 124 | multi_sdg_terms = [term for term, freq in term_dist.items() if freq > 1] # TODO add to file to keep track 125 | 126 | for sdg_label, terms in sdg_terms_add_generated.items(): 127 | terms = terms.difference(multi_sdg_terms) 128 | for v_sdg_label, v_terms in sdg_terms_add_validated.items(): 129 | if v_sdg_label != sdg_label: 130 | terms = terms.difference(v_terms) 131 | sdg_terms_add_generated[sdg_label] = terms 132 | 133 | # Update fos source for both validated and generated 134 | if sdg_label in gen_term_sources.keys(): 135 | for term, sources in gen_term_sources[sdg_label].items(): 136 | if term in sdg_terms_add_generated[sdg_label]: 137 | if term not in term_sources[sdg_label].keys(): 138 | term_sources[sdg_label][term] = [] 139 | term_sources[sdg_label][term] += sources 140 | 141 | sdg_terms_add_generated = { 142 | sdg_label: sorted(list(sdg_terms_add_generated[sdg_label])) 143 | for sdg_label in sorted(sdg_terms_add_generated.keys(), key=sdg_label_sort) 144 | } 145 | 146 | with open(f'{INTER_ADD_PATH}/GeneratedSdgTerms.json', 'w') as file_: 147 | json.dump(sdg_terms_add_generated, file_) 148 | 149 | # Combined 
Validated and Generated Sdg Terms 150 | sdg_ontology_combined = OrderedDict() 151 | 152 | ata_sources = [path.split('/')[-1] for path in add_all_to_all_data_paths] 153 | sdg_labels = sorted(set(list(sdg_terms_add_validated.keys()) + list(sdg_terms_add_generated.keys())), key=sdg_label_sort) 154 | for sdg_label in sdg_labels: 155 | sdg_ontology_combined[sdg_label] = OrderedDict() 156 | validated_terms = sdg_terms_add_validated[sdg_label] if sdg_label in sdg_terms_add_validated.keys() else [] 157 | generated_terms = sdg_terms_add_generated[sdg_label] if sdg_label in sdg_terms_add_generated.keys() else [] 158 | 159 | for term in sorted(list(set(validated_terms + generated_terms))): 160 | t_sources = sorted(term_sources[sdg_label][term], key=sdg_label_sort) 161 | if all(src in ata_sources for src in t_sources): 162 | continue 163 | if term not in sdg_ontology_combined[sdg_label].keys(): 164 | sdg_ontology_combined[sdg_label][term] = dict() 165 | sdg_ontology_combined[sdg_label][term] = t_sources 166 | 167 | with open("InterimTerms.json", "w") as file_: 168 | file_.write(json.dumps(sdg_ontology_combined)) 169 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at osdg@technote.ai. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute to OSDG? 2 | 3 | The OSDG project welcomes contributions from all users and communities. We have identified several areas, which could benefit the project the most: 4 | 5 | 1) Suggest new data sources - If you have a data source or a classifier to recognize one or more SDG, you can suggest it to be added to the OSDG data sources. 6 | 2) Suggest new/better rules for data source cleaning / integration. 7 | 3) Suggest improvements to the procedure for matching the items in the combined ontology to the Fields of Study in Microsoft Academic. 
8 | 9 | This can be done in various ways: 10 | a) by posting an issue on OSDG GitHub; 11 | b) by contacting the team at [osdg@technote.ai](mailto:osdg@technote.ai); 12 | c) by forking the project repository, integrating the new data and then creating a pull request (read more about [pull requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)). 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. 
You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 
108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 
145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /Methodology.md: -------------------------------------------------------------------------------- 1 | ## Methodology 2 | OSDG aims to: 3 | - integrate various existing attempts to classify research according to Sustainable Development Goals, 4 | - make this process open, transparent and user-friendly. 5 | 6 | OSDG integrates the existing research into a comprehensive approach, and does so in a way that evades the shortcomings of former individual approaches and duplication of research efforts. 7 | 8 |

9 | OSDG_Logo 10 |

11 | 12 | ## About the project 13 | In short, OSDG builds an **integrated ontology** from the feature sets identified in previous research, and then matches the ontology items to the topics from [Microsoft Academic](https://academic.microsoft.com/home). 14 | OSDG takes relevant text features (such as ontology items, features from machine-learning models or extracted keywords) from the previous research, cleans them and merges them into a comprehensive, constantly-growing OSDG ontology. The ontology items are mapped to the ever-growing list of topics/Fields of Study in the Microsoft Academic Graph (MAG). 15 | By doing this, we: 16 | - expand the ontology – acquire more key terms associated with the relevant MAG Topics, natively called Fields of Study (FOS); 17 | - capture more nuanced relationships between individual terms and latent concepts. 18 | 19 | ## How does OSDG work? 20 | OSDG processes user queries in the following steps: 21 | 1) It tags the user query with FOS’es from Microsoft Academic Graph (MAG); 22 | 2) It cross-references the FOS’es assigned to the user query with the OSDG Ontology and determines which SDGs (if any) are relevant for the query; 23 | 3) The relevance of an SDG to a query is interpreted as being “Strong” or “Moderate” depending on a threshold that is adjusted individually for each SDG (by testing the tool on a set of 16 000 scientific publication abstracts). 24 | 25 | Head to the Search page to put our methodology to practical use. If you see something that requires improvement or you would like to contact our data team, please state your enquiry using our contact form. 26 | ## References and inspiration 27 | 28 | The list of data sources used in the current version of the OSDG Tool is available [here](https://github.com/TechNote-ai/osdg/blob/master/OSDG_DATA_SOURCES.md). OSDG leverages the data from [Microsoft Academic](https://academic.microsoft.com/home): 29 | 30 | 1) Sinha, A., Shen, Z., Song, Y., Ma, H., Eide, D., Hsu, B.-J.
& Wang, K. (2015). An Overview of Microsoft Academic Service (MAS) and Applications. Proceedings of the 24th International Conference on World Wide Web (p./pp. 243--246), Republic and Canton of Geneva, Switzerland: International World Wide Web Conferences Steering Committee. ISBN: 978-1-4503-3473-0. doi:10.1145/2740908.2742839. 31 | 2) Wang, K., Shen, Z., Huang, C., Wu, C., Eide, D., Dong, Y., Qian, J., Kanakia, A., Chen, A.C., & Rogahn, R. (2019). A Review of Microsoft Academic Services for Science of Science Studies. Frontiers in Big Data, 2. doi:10.3389/FDATA.2019.00045 32 | -------------------------------------------------------------------------------- /OSDG-Ontology.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/OSDG-Ontology.xlsx -------------------------------------------------------------------------------- /OSDG_DATA_SOURCES.md: -------------------------------------------------------------------------------- 1 | # SDG Data Sources 2 | 3 | ## Expert validated data sources 4 | 5 | | Index | Description | Folder Name | Link | 6 | | :------: | :------ | :------: | ------: | 7 | | 0. | SDG Ontology compiled by Dr Nuria B. Puig and E. Mauleon| 0_PuigOntology | [Dataset](https://figshare.com/articles/SDG_ontology/11106113/1) | 8 | | 6. | Terms by Indicator from SDGIO Ontology | 6_SDGIO_terms | [Link to SDGIO GitHub ](https://github.com/SDG-InterfaceOntology/sdgio) | 9 | ## 10 | ## Generated data sources 11 | 12 | | Index | Description | Folder Name | Link | 13 | | :------: | :------ | :------: | ------: | 14 | | 1. | Mapping from "FP7-4-SD" Project (edited VS and LP) | 1_FP7-4-SD_edited | [Link to Project website](https://www.fp7-4-sd.eu/) | 15 | | 2. | Concepts extracted by the UN LinkedSDG tool from academic publications | 2_LinkedSDG_Concepts | [Link to LinkedSDG Tool](http://linkedsdg.apps.officialstatistics.org/#/) | 16 | | 3.
| Concepts extracted from SDG Pathfinder documents extracted via ML | 3_SDGPathfinder_DocumentConcepts | [Document Collection](https://sdg-pathfinder.org/) ; [Modelling Description](https://ppmi.lt/) | 17 | | 4. | Keywords from SDG Pathfinder indicated by the SDG Pathfinder tool itself| 4_SDGPathfinder_Keywords| [SDG Pathfinder](https://sdg-pathfinder.org/) | 18 | | 5. | Concepts extracted by the UN LinkedSDG tool from Administrative Documents | 5_LinkedSDG_DocumentExtracts | [Link to LinkedSDG Tool](http://linkedsdg.apps.officialstatistics.org/#/) | 19 | | 7. | Concepts linked to SDGs from EC Policy Documents | 7_EC_Policy_Doc_Terms | Skrynnyk & Stanciauskas ( 2020 upcoming ) | 20 | | 9. | Keywords from "Science4SDGs" project | 9_SIRIS_Science4SDGs | [Link to "Science4SDGs" project](http://science4sdgs.sirisacademic.com/) | 21 | ## 22 | 23 | ## ATA (all-to-all) data sources 24 | 25 | | Index | Description | Folder Name | Link | 26 | | :------: | :------ | :------: | ------: | 27 | | 8. | FOS'es Linked to NABs Areas | 8_NABS_FOS | [Link to Eurostat](https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=LST_NOM_DTL&StrNom=CEPA_1994&StrLanguageCode=EN&IntPcKey=4431590&StrLayoutCode=HIERARCHIC) | 28 | | 10. | A boost of SDG relevant FOS'es compiled by PPMI researchers | 10_PPMI_boost | [PPMI](https://ppmi.lt)| 29 | # 30 | 31 | *** 32 | 33 | 34 | **** 35 | # Raw Data structure 36 | * `raw_data/` 37 | * `0_add/` 38 | * `00_add_validated/`\ 39 | **Expert validated term labels**\ 40 | **→** each data source must produce: 41 | *`*_ProcessedKeyTerms.json`* 42 | ```python 43 | { 44 | 'SDG_1': ['term_1', 'term_2', ...], 45 | 'SDG_2': ['term_3', 'term_4', ...], 46 | ... 47 | } 48 | ``` 49 | * `01_add_generated/`\ 50 | **Expert validated term labels**\ 51 | **→** each data source must produce: 52 | *`*_ProcessedKeyTerms.json`* 53 | ```python 54 | { 55 | 'SDG_1': ['term_1', 'term_2', ...], 56 | 'SDG_2': ['term_3', 'term_4', ...], 57 | ...
58 | } 59 | ``` 60 | * `02_add_all_to_all/`\ 61 | **Expert validated FOS labels**\ 62 | **→** each data source must produce: 63 | *`*_ProcessedFOS.json`* 64 | ```python 65 | { 66 | 'SDG_1': [['fos_id_1', 'fos_name_1'], ['fos_id_2', 'fos_name_2'], ...], 67 | 'SDG_2': [['fos_id_3', 'fos_name_3'], ['fos_id_4', 'fos_name_4'], ...], 68 | ... 69 | } 70 | ``` 71 | * `1_replace/`\ 72 | **Mapping for FOS SDG label reassignment from `SDG_a` to `SDG_b`**\ 73 | **→** each data source must produce: 74 | *`*_ReplaceFOS.json`* 75 | ```python 76 | { 77 | 'fos_id_1': [['SDG_1', 'SDG_2'], ...], 78 | 'fos_id_2': [['SDG_3', 'SDG_4'], ...], 79 | ... 80 | } 81 | ``` 82 | * `2_remove/`\ 83 | **FOS to remove from sdg assigned FOS lists**\ 84 | **→** each data source must produce: 85 | *`*_RemoveFOS.json`* 86 | ```python 87 | { 88 | 'SDG_1': ['fos_id_1', 'fos_id_2', ...], 89 | 'SDG_2': ['fos_id_1', 'fos_id_3', ...], 90 | ... 91 | } 92 | ``` 93 | * `Blacklist`\ 94 | **Irrelevant FOS**\ 95 | **→** each data source must produce: 96 | *`*_Blacklist.csv`* 97 | | fos_id | fos_name | 98 | | :------ | :-------- | 99 | | fos_id_1 | fos_name_1 | 100 | | fos_id_2 | fos_name_2 | 101 | | ... | ...| 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | OSDG_Logo 2 | 3 | ### OSDG has moved to a new repository (https://github.com/osdg-ai/osdg-mapping). 
4 | **All the updates will be made to the new repo only!** 5 | 6 | -------------------------------------------------------------------------------- /comparison_fos_update.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/comparison_fos_update.xlsx -------------------------------------------------------------------------------- /images/Methodology-visual_0511_Updated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/images/Methodology-visual_0511_Updated.png -------------------------------------------------------------------------------- /images/OSDG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/images/OSDG.png -------------------------------------------------------------------------------- /images/OSDG_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/images/OSDG_new.png -------------------------------------------------------------------------------- /raw_data/0_add/00_add_validated/0_PuigOntology/0_process_key_terms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 27 17:39:53 2020 5 | 6 | @author: lukas 7 | """ 8 | import json 9 | 10 | 11 | data = {} 12 | 13 | file = open("Ontology.csv" , "r" , encoding = "latin1") 14 | for line in file : 15 | parts = line[:-1].split(";") 16 | if len(parts) == 1: 17 | break 18 | else: 19 | if parts[1] != "clasification" : 20 | if parts[1] in data: 21 | data[ parts[1] ].append( parts[0] ) 22 | else: 
23 | data[ parts[1] ] = [ parts[0] ] 24 | file.close() 25 | 26 | #%% 27 | replacables_symbols = ["&" , "-" , '"' , " "] 28 | replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"] 29 | def pre_proc( list_o_strings ): 30 | """ 31 | Keeps only the keywords longer than 4 characters ; 32 | Strips non Alphanumeric chars ; 33 | Removes basic interluding words ( "and" , "of" , etc. ) ; 34 | Deduplicates 35 | """ 36 | 37 | processed = [] 38 | alpha = "abcdefghijklmnopqrstuvwxyz0123456789 " 39 | for item in list_o_strings : 40 | item = item.lower() 41 | 42 | for c in replacables_symbols: 43 | item = item.replace( c , " " ) 44 | item_p = item.split() 45 | item = " ".join(i for i in item_p if i not in replacables_words) 46 | 47 | if all( c in alpha for c in item ) : 48 | if item.startswith( " " ) : 49 | item = item[ 1: ] 50 | if item.endswith( " " ) : 51 | item = item[:-1] 52 | if len(item) > 4 : 53 | if item not in processed: 54 | processed.append( item ) 55 | return processed 56 | 57 | #%% 58 | data_proc = {} 59 | 60 | for key , value in data.items() : 61 | key2 = key.replace("SDG" , "SDG_") 62 | data_proc[ key2 ] = pre_proc( value ) 63 | 64 | 65 | #%% 66 | js = json.dumps( data_proc ) 67 | file = open("0_ProcessedKeyTerms.json" , "w") 68 | file.write( js ) 69 | file.close() 70 | -------------------------------------------------------------------------------- /raw_data/0_add/00_add_validated/0_PuigOntology/Ontology.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/00_add_validated/0_PuigOntology/Ontology.csv -------------------------------------------------------------------------------- /raw_data/0_add/00_add_validated/6_SDGIO_terms/6_ProcessedKeyTerms.json: -------------------------------------------------------------------------------- 1 | {"SDG_1": ["access to basic services", "disaster", "poverty 
reduction programme", "national action plans that support actions that eradicate poverty sustainably use natural resources"], "SDG_2": ["undernourishment", "stunting", "malnutrition wasting overweight", "volume production", "total factor productivity", "sustainable agricultural practices", "agricultural households", "eco friendly fertilizers", "ex situ crop collections enrichment index", "local crops breeds wild relatives", "import export tariffs", "agricultural export subsidies"], "SDG_3": ["maternal deaths", "birth", "mortality rate", "neonatel", "hiv infections", "tuberculosis", "malaria", "hepatitis b infections", "neglected tropical diseases", "cardiovascular disease", "suicide", "substance use disorders", "harmful use alcohol", "road traffic", "family planning", "adolescent birth rate", "tracer interventions", "health expenditure", "household pollution", "hazardous chemicals", "tobacco use", "access to affordable medicines vaccines", "official development assistance", "health worker", "13 core capacities"], "SDG_4": ["education children young people", "developmentally on track", "organized learning", "parity indices", "fixed level proficiency in functional skills", "environmental science geoscience", "official development assistance flows scholarships", "teachers"], "SDG_5": ["ever partnered", "married in a union", "unpaid domestic care work", "seats in national parliaments local governments", "managerial positions", "informed decisions", "laws regulations", "ownership secure rights", "legal framework customary law", "mobile telephone"], "SDG_6": ["safely managed drinking water services", "sanitation services", "wastewater safely treated", "good ambient water quality", "water use efficiency", "water resources used", "water related ecosystems extent", "official development assistance water sanitation related", "local communities participation in water sanitation managemnt"], "SDG_7": ["acces to electricity", "clean fuels technology", "renewable energy share 
final energy consumption", "energy intensity primary energy gdp", "100 billion commitment", "net domestic energy use"], "SDG_8": ["annual growth rate real gdp", "employed person", "informal employment non agricultural employment", "resource productivity", "hourly earnings", "unemployment rate", "child labour", "fatal occupational injury non fatal occupational injury", "international labour organization conventions", "tourism direct gdp tourism industries", "commercial bank branches atm", "aid trade", "social protection employment progammes"], "SDG_9": ["rural population all season road", "freight volume passenger volume", "manufacturing value added", "manufacturing employment", "small scale industries", "loan credit", "co2 emission", "research development", "researchers", "official international support", "mobile network"], "SDG_10": ["household expenditure", "median income", "report that personally felt discriminated against harassed", "labour share", "members in international organizations voting rights in international organizations", "recruitment cost borne by employee", "international migration policy index", "victims human trafficking", "tariff lines", "resource flows development", "remittance costs"], "SDG_11": ["slum informal settlement inadequate housing", "access to public transport convenient", "land consumption population growth rate", "urban planning management", "urban solid waste", "fine particulate matter", "open space public use", "physical harassment sexual harassment women subjected to", "urban regional development plans implementing", "risk reduction resilience strategies implementing"], "SDG_12": ["international multilateral environmental agreements on hazardous other chemicals waste", "treatment waste", "national recycling rate", "sustainability reports", "sustainable public procurement policies action plans implementing", "sustainable development lifestyle topics", "green patent applications", "residual flows", "fossil fuel subsidies", 
"effective climate change related planning management"], "SDG_14": ["nitrogen use efficiency composite", "coastal marine development", "fish stocks", "protected areas", "negative fishery subsidies", "fisheries", "research in marine technology", "regional seas protocols provisions"], "SDG_15": ["forest area", "forest cover", "net permanent forest loss", "degraded land", "important sites mountain biodiversity", "mountain green cover index", "red list index", "access benefit sharing clearinghouse", "rli species in trade", "illegal trade in wildlife wildlife products", "invasive alien species", "biodiversity ecosystem services values", "sustainable use biodiversity ecosystems conservation", "forestry official development assistance"], "SDG_16": ["victim intentional homicide", "conflict related deaths", "feel safe walking alone", "physical punishment", "human trafficking", "victims violence", "unsentenced detainees", "illicit financial flows", "small arms light weapons", "contact with a public official", "primary government expenditures", "satisfactory experience with public services", "positions in public institutions", "national development plans poverty reduction strategies", "births registered with a civil authority", "physical sexual crime reported by victim"], "SDG_17": ["total governement revenue", "domestic taxes", "net official development assistance", "volume remittances", "debt service", "sustainable development objectives safeguard", "access to patent information", "fixed internet broadband subscriptions", "environmentally sound technologies", "internet use by individual", "sustainable development in three dimensions", "global exports", "tariff", "official development assistance loan agreements", "public private civil society partnerships", "national statistical legislation", "statistical capacity", "inclusive wealth"]} -------------------------------------------------------------------------------- 
/raw_data/0_add/00_add_validated/6_SDGIO_terms/6_process_key_terms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 15 15:47:41 2020 5 | 6 | @author: lukas 7 | """ 8 | 9 | 10 | import pandas as pd 11 | import json 12 | from tqdm import tqdm 13 | 14 | replacables_symbols = ["&" , "-" , '"' , " "] 15 | replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"] 16 | def pre_proc( list_o_strings ): 17 | """ 18 | Keeps only the keywords longer than 4 characters ; 19 | Strips non Alphanumeric chars ; 20 | Removes basic interluding words ( "and" , "of" , etc. ) ; 21 | Deduplicates 22 | """ 23 | 24 | processed = [] 25 | alpha = "abcdefghijklmnopqrstuvwxyz0123456789 " 26 | for item in list_o_strings : 27 | item = item.lower() 28 | 29 | for c in replacables_symbols: 30 | item = item.replace( c , " " ) 31 | item_p = item.split() 32 | item = " ".join(i for i in item_p if i not in replacables_words) 33 | 34 | if all( c in alpha for c in item ) : 35 | if item.startswith( " " ) : 36 | item = item[ 1: ] 37 | if item.endswith( " " ) : 38 | item = item[:-1] 39 | if len(item) > 4 : 40 | if item not in processed: 41 | processed.append( item ) 42 | return processed 43 | 44 | dfl = pd.read_excel("SDG Terms by Indicator.xlsx").to_dict( orient = "records") 45 | 46 | 47 | number_map = {"1" : "SDG_1" , 48 | "2" : "SDG_2", 49 | "3" : "SDG_3", 50 | "4" : "SDG_4", 51 | "5" : "SDG_5", 52 | "6" : "SDG_6", 53 | "7" : "SDG_7", 54 | "8" : "SDG_8", 55 | "9" : "SDG_9", 56 | "10" : "SDG_10", 57 | "11" : "SDG_11", 58 | "12" : "SDG_12", 59 | "13" : "SDG_12", 60 | "14" : "SDG_14", 61 | "15" : "SDG_15", 62 | "16" : "SDG_16", 63 | "17" : "SDG_17", 64 | } 65 | 66 | sdg_words_raw = {} 67 | 68 | for row in tqdm(dfl) : 69 | number = row['Indicator \nNumber'].split(".")[0] 70 | sdg = number_map[ number ] 71 | if sdg not in sdg_words_raw.keys() : 72 | sdg_words_raw[ 
sdg ] = [] 73 | sdg_words_raw[ sdg ].append( str(row["Term(s)"])) 74 | 75 | counter = 0 76 | print("Key Words Identified before cleaning : " ) 77 | for key , value in sdg_words_raw.items() : 78 | print( key , " : ", len(value)) 79 | counter += len(value) 80 | 81 | print("Overall : ", counter) 82 | 83 | #%% 84 | for key , value in sdg_words_raw.items() : 85 | sdg_words_raw[ key ] = pre_proc( value ) 86 | 87 | #%% 88 | """ 89 | Deduplicating keywords 90 | """ 91 | word_freq_dict = {} 92 | for val in list(sdg_words_raw.values()) : 93 | for v in val : 94 | if v not in word_freq_dict : 95 | word_freq_dict[ v ] = 1 96 | else: 97 | word_freq_dict[ v ] += 1 98 | 99 | #%% 100 | sdg_words = {} 101 | for key , value in sdg_words_raw.items() : 102 | plh = [ i for i in value if word_freq_dict[i] < 2] 103 | sdg_words[ key ] = plh 104 | 105 | #%% 106 | js = json.dumps( sdg_words ) 107 | file = open( "6_ProcessedKeyTerms.json" , "w") 108 | file.write( js ) 109 | file.close() 110 | 111 | counter = 0 112 | print("Key Words Identified after cleaning: " ) 113 | for key , value in sdg_words.items() : 114 | print( key , " : ", len(value)) 115 | counter += len(value) 116 | 117 | print("Overall : ", counter) 118 | -------------------------------------------------------------------------------- /raw_data/0_add/00_add_validated/6_SDGIO_terms/SDG Terms by Indicator.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/00_add_validated/6_SDGIO_terms/SDG Terms by Indicator.xlsx -------------------------------------------------------------------------------- /raw_data/0_add/01_add_generated/1_FP7-4-SD_edited/1_process_key_terms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Apr 28 17:02:48 2020 5 | 6 | @author: lukas 7 | """ 8 | 9 | import 
json 10 | 11 | #Hand curated list of bad FOS ids 12 | from bad_fos import * 13 | #%% 14 | file = open("WU_projectSDGs.json" , "r") 15 | project_sdgs = json.loads( file.read() ) 16 | file.close() 17 | 18 | file = open("ProjectFOS.json" , "r") 19 | project_fos = json.loads( file.read() ) 20 | file.close() 21 | 22 | file = open("FOSMAP.json" , "r") 23 | fos_map = json.loads( file.read() ) 24 | file.close() 25 | 26 | 27 | #%% 28 | """ 29 | Taking Top 5 FOS'es from each project 30 | Checking if they are not EU project slang related FOS'es (bad_fos) 31 | Assigning them to SDGs and deduplicating 32 | """ 33 | sdg_fos_raw = {} 34 | for key , value in project_sdgs.items() : 35 | fos = [ i[0] for i in sorted( project_fos[ key ].items() , key = lambda kv:kv[1] , reverse = True)[0:10] if int( i[ 0 ] ) not in bad_fos ] 36 | for v in value : 37 | if v not in sdg_fos_raw.keys() : 38 | sdg_fos_raw[ v ] = set() 39 | for f in fos : 40 | sdg_fos_raw[ v ].add( f ) 41 | 42 | for key , value in sdg_fos_raw.items() : 43 | sdg_fos_raw[ key ] = list( value ) 44 | 45 | #Data on SDG 17 in this set is very poor quality ; removing it 46 | pop = sdg_fos_raw.pop("SDG_17", None) 47 | 48 | #%% 49 | """ 50 | Removing certain some FOS'es that were assigned to projects but do not align well with SDGs 51 | """ 52 | sdg_fos_raw["SDG_1"] = [ i for i in sdg_fos_raw["SDG_1"] if int(i) not in bad_sdg1_fos] 53 | sdg_fos_raw["SDG_2"] = [ i for i in sdg_fos_raw["SDG_2"] if int(i) not in bad_sdg2_fos] 54 | 55 | sdg_fos_raw["SDG_3"] = [ i for i in sdg_fos_raw["SDG_3"] if int(i) not in bad_sdg3_fos] 56 | sdg_fos_raw["SDG_4"] = [ i for i in sdg_fos_raw["SDG_4"] if int(i) not in bad_sdg4_fos] 57 | sdg_fos_raw["SDG_5"] = [ i for i in sdg_fos_raw["SDG_5"] if int(i) not in bad_sdg5_fos] 58 | sdg_fos_raw["SDG_6"] = [ i for i in sdg_fos_raw["SDG_6"] if int(i) not in bad_sdg6_fos] 59 | 60 | sdg_fos_raw["SDG_7"] = [ i for i in sdg_fos_raw["SDG_7"] if int(i) not in bad_sdg7_fos] 61 | 62 | sdg_fos_raw["SDG_8"] = [ i for i 
in sdg_fos_raw["SDG_8"] if int(i) not in bad_sdg8_fos] 63 | sdg_fos_raw["SDG_9"] = [ i for i in sdg_fos_raw["SDG_9"] if int(i) not in bad_sdg9_fos] 64 | sdg_fos_raw["SDG_10"] = [ i for i in sdg_fos_raw["SDG_10"] if int(i) not in bad_sdg10_fos] 65 | 66 | sdg_fos_raw["SDG_11"] = [ i for i in sdg_fos_raw["SDG_11"] if int(i) not in bad_sdg11_fos] 67 | sdg_fos_raw["SDG_12"] = [ i for i in sdg_fos_raw["SDG_12"] if int(i) not in bad_sdg12_fos] 68 | sdg_fos_raw["SDG_13"] = [ i for i in sdg_fos_raw["SDG_13"] if int(i) not in bad_sdg13_fos] 69 | sdg_fos_raw["SDG_14"] = [ i for i in sdg_fos_raw["SDG_14"] if int(i) not in bad_sdg14_fos] 70 | sdg_fos_raw["SDG_15"] = [ i for i in sdg_fos_raw["SDG_15"] if int(i) not in bad_sdg15_fos] 71 | sdg_fos_raw["SDG_16"] = [ i for i in sdg_fos_raw["SDG_16"] if int(i) not in bad_sdg16_fos] 72 | 73 | 74 | #%% 75 | """ 76 | Deduplicating FOS 77 | """ 78 | fos_freq_dict = {} 79 | for val in list(sdg_fos_raw.values()) : 80 | for v in val : 81 | if v not in fos_freq_dict : 82 | fos_freq_dict[ v ] = 1 83 | else: 84 | fos_freq_dict[ v ] += 1 85 | 86 | #%% 87 | sdg_fos = {} 88 | for key , value in sdg_fos_raw.items() : 89 | plh = [ i for i in value if fos_freq_dict[i] < 2] 90 | sdg_fos[ key ] = plh 91 | 92 | 93 | #js = json.dumps( sdg_fos ) 94 | #file = open("NewWU.json" , "w") 95 | #file.write( js ) 96 | #file.close() 97 | 98 | #%% 99 | sdg_fos_s = {} 100 | for key , value in sdg_fos_raw.items() : 101 | plh = [ fos_map[ i ].lower() for i in value if fos_freq_dict[i] < 2 and i in fos_map.keys() ] 102 | sdg_fos_s[ key ] = plh 103 | 104 | 105 | #%% 106 | js = json.dumps( sdg_fos_s ) 107 | file = open("1_ProcessedKeyTerms.json" , "w") 108 | file.write( js ) 109 | file.close() 110 | 111 | #%% 112 | """ 113 | for key , value in sdg_fos.items() : 114 | file = open( key+".txt" , "w") 115 | for v in value : 116 | line = v + "\t" + fos_map[ v ] +"\n" 117 | file.write( line ) 118 | file.close() 119 | """ 
-------------------------------------------------------------------------------- /raw_data/0_add/01_add_generated/2_LinkedSDG_Concepts/2_ProcessedKeyTerms.json: -------------------------------------------------------------------------------- 1 | {"SDG_12": ["hazardous waste", "cleaner production", "sustainable production"], "SDG_8": ["occupational accident", "trade financing", "financial services", "youth employment", "trade promotion", "labour productivity", "decent work", "occupational safety", "occupational hazards", "salaires", "occupational accidents"], "SDG_3": ["non communicable diseases", "diseases", "mortality", "death", "public health", "health personnel", "physicians", "reproductive health", "family planning", "traffic accidents", "suicide", "maternal child health", "narcotic drugs", "tobacco", "youth health", "communicable diseases", "mothers", "child mortality", "maternal mortality", "tuberculosis", "food hygiene", "child health", "traffic safety", "malaria", "toxic substance", "water related diseases", "international health regulations", "childbirth", "infant mortality", "sex education", "medical research", "health hazards", "infants", "tropical disease", "mental health", "smoking", "vaccination", "alcoholism", "road traffic", "deliveries", "mortalities", "toxicity", "maladies infectieuses", "delivery", "infectious diseases", "deaths", "alcohol abuse", "mortality rates", "tropical diseases", "dioxins", "burial", "death rate", "vaccinations", "suicides", "cause death", "toxic chemicals", "community health", "toxic substances", "dioxin", "perinatal mortality", "mortality rate", "seropositivity", "food safety", "toxicities", "hiv infections", "mortalite", "terminally ill", "human mortality", "parturition", "death rates", "burials", "meres", "health the population", "confinement", "malarias", "sante publique", "funeral", "adolescent health", "enfermedades", "addiction to tobacco", "mortalidad", "medical personnel", "sante mentale", "salud publica", 
"maladies", "newborn babies", "natural death", "narcotics", "confinements", "childbirths", "road safety", "enfants", "funerals", "tobacco addiction", "salud mental", "recien nacidos", "lactantes", "muerte", "toxic discharge", "medecins"], "SDG_9": ["innovation", "industrial infrastructure", "industrialization", "research development", "technology", "technologies", "technological innovations", "scientific personnel", "engineering", "innovations", "technological innovation", "industrial innovation", "industrial infrastructures"], "SDG_5": ["girls", "participation women", "position women", "gender based violence", "harmful traditional practices", "unpaid work", "female circumcision", "women managers", "political participation", "excision", "situation women", "excisions", "sexual violence", "violence against women"], "SDG_1": ["standard living"], "SDG_11": [], "SDG_2": ["sustainable agriculture", "food shortage", "hunger", "malnutrition", "food security", "agricultural policy", "economic policy", "starvation", "food production", "food price", "agricultural development", "famine", "food insecurity", "food prices", "access to food", "food availability", "agricultural policies", "economic policies", "multifunctional agriculture", "economic choices", "food shortages", "local food production", "starvations"], "SDG_6": ["use water", "water management", "use waters"], "SDG_16": ["human rights", "civil registration", "birth reporting", "child abuse", "access to information", "corruption", "rule law", "bribery", "legal protection", "public information", "detained persons", "administration justice", "societe civile", "protection human rights", "corrupt practices", "public bodies", "public institutions", "human rights violations", "corruptions"], "SDG_4": ["educational facilities", "right to education", "computer literacy", "teacher", "scholarships", "preschool education", "educational financing", "teacher training", "teachers", "kindergarten", "kindergartens", "professors", 
"professor", "educacion", "teaching staff", "educational buildings"], "SDG_7": ["sustainable energy", "energy market", "66 energy", "renewable energy sources", "electrification", "energy resources", "energy", "energy efficiency", "environmentally sound technology", "energies", "energy sector", "fuels", "alternative energy sources", "power sector", "energy supply", "energy supplies", "fuel efficiency", "energy sectors", "energy markets", "clean technologies", "energy efficiencies", "fuel resources"], "SDG_15": ["terrestrial ecosystem", "terrestrial ecosystems", "deforestation", "mountain ecosystems", "freshwater ecosystem", "national parks reserves", "protected area", "desertification", "national park", "forest", "forests", "biological diversity", "forest ecosystems", "biodiversity", "endangered species", "protected areas", "diversidad biologica", "biodiversidad", "bosque", "species diversity", "woodland", "woodlands", "national parks", "biosphere reserves", "freshwater ecosystems", "nature reserves", "biodiversite", "salinisation", "alpine ecology", "alpine ecosystems"], "SDG_13": [], "SDG_17": ["partnership", "internet", "national budget", "economic support", "resources mobilization", "external debt", "debt servicing", "programme evaluation", "programme ownership", "partnerships", "state budget", "third world"], "SDG_14": ["oceans seas", "ocean", "marine life", "marine resources", "marine environment", "marine ecosystems", "marine environments", "oceans", "marine ecology", "oceanos", "marine fauna", "sea resources"], "SDG_10": ["social security", "income distribution", "income inequalities", "income inequality", "social protection", "social insurance"]} -------------------------------------------------------------------------------- /raw_data/0_add/01_add_generated/2_LinkedSDG_Concepts/2_process_key_terms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created 
import json

import pandas as pd

# Symbols that are replaced by a space, and whole tokens that are removed,
# while cleaning a raw key term.
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]

_ALLOWED_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 ")


def pre_proc(list_o_strings):
    """Clean raw key terms and return the unique survivors in input order.

    Each term is lower-cased, every entry of ``replacables_symbols`` is
    replaced by a space, and whitespace-separated tokens listed in
    ``replacables_words`` are removed.  A cleaned term is kept only when it

    * is longer than 4 characters,
    * consists solely of ``a-z``, ``0-9`` and spaces (a term containing any
      other character is dropped as a whole, not stripped of it), and
    * has not been produced by an earlier input term.

    Args:
        list_o_strings: iterable of ``str`` key terms.

    Returns:
        list of cleaned, de-duplicated terms.
    """
    stop_words = frozenset(replacables_words)
    seen = set()  # O(1) duplicate check instead of scanning the result list
    processed = []
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() also collapses whitespace runs and trims both ends,
        # so no separate leading/trailing-space handling is needed.
        item = " ".join(tok for tok in item.split() if tok not in stop_words)
        if len(item) <= 4 or item in seen:
            continue
        if all(ch in _ALLOWED_CHARS for ch in item):
            seen.add(item)
            processed.append(item)
    return processed


def collect_raw_terms(records):
    """Group the lower-cased ``Concept`` and ``Keyword`` cells by SDG.

    A cell counts as empty when ``str(cell) == "nan"``.  A non-empty ``Goal``
    cell ("Goal 3" -> "SDG_3") selects the SDG for that row and for the
    following rows whose ``Goal`` cell is empty.

    NOTE(review): the nesting of the original loop is not recoverable from
    the flattened source; carrying the goal forward is assumed - confirm
    against the layout of the spreadsheet.

    Args:
        records: iterable of row dicts with ``Goal``, ``Concept`` and
            ``Keyword`` keys.

    Returns:
        dict mapping ``"SDG_<n>"`` to the list of raw terms.
    """
    sdg_words_raw = {}
    sdg = None
    for row in records:
        if str(row["Goal"]) != "nan":
            sdg = row["Goal"].replace("Goal ", "SDG_")
            sdg_words_raw.setdefault(sdg, [])
        if sdg is None:
            # No goal seen yet: there is nothing to attach the terms to.
            continue
        for column in ("Concept", "Keyword"):
            if str(row[column]) != "nan":
                sdg_words_raw[sdg].append(row[column].lower())
    return sdg_words_raw


def keep_unique_terms(terms_by_sdg):
    """Drop every term that is listed under more than one SDG.

    Args:
        terms_by_sdg: dict mapping an SDG label to its list of terms.

    Returns:
        new dict with the same keys; each list keeps only the terms whose
        total number of occurrences over all lists is below 2.
    """
    word_freq_dict = {}
    for terms in terms_by_sdg.values():
        for term in terms:
            word_freq_dict[term] = word_freq_dict.get(term, 0) + 1
    return {
        sdg: [term for term in terms if word_freq_dict[term] < 2]
        for sdg, terms in terms_by_sdg.items()
    }


def main():
    """Read the spreadsheet, clean the terms and write the JSON output."""
    records = pd.read_excel("LinkedSDG_Data.xlsx").to_dict(orient="records")
    sdg_words_raw = {
        sdg: pre_proc(terms)
        for sdg, terms in collect_raw_terms(records).items()
    }
    sdg_words = keep_unique_terms(sdg_words_raw)
    # Context manager so the handle is closed even if the write fails.
    with open("2_ProcessedKeyTerms.json", "w") as file:
        file.write(json.dumps(sdg_words))


if __name__ == "__main__":
    main()
import ast
import json

import pandas as pd

# Symbols that are replaced by a space, and whole tokens that are removed,
# while cleaning a raw key term.
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]

_ALLOWED_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 ")

# Number of highest-scored entries taken from each cell.  (The original
# comment spoke of 100 unigrams / 200 ngrams; the code has always used
# these values.)
TOP_UNIGRAMS = 50
TOP_NGRAMS = 250

# Terms removed from the SDG_3 list after cross-SDG de-duplication.
bad_sdg3_terms = {
    "data type",
    "date signature",
    "date start",
    "date start date",
    "deliverable list",
    "deliverable wp",
    "demonstrator dissemination",
    "description deliverable",
    "developed new",
    "development new",
    "dissemination activities",
    "dissemination report",
    "document version",
    "ec contribution",
}


def pre_proc(list_o_strings):
    """Clean raw key terms and return the unique survivors in input order.

    Each term is lower-cased, every entry of ``replacables_symbols`` is
    replaced by a space, and whitespace-separated tokens listed in
    ``replacables_words`` are removed.  A cleaned term is kept only when it

    * is longer than 4 characters,
    * consists solely of ``a-z``, ``0-9`` and spaces (a term containing any
      other character is dropped as a whole, not stripped of it), and
    * has not been produced by an earlier input term.

    Args:
        list_o_strings: iterable of ``str`` key terms.

    Returns:
        list of cleaned, de-duplicated terms.
    """
    stop_words = frozenset(replacables_words)
    seen = set()  # O(1) duplicate check instead of scanning the result list
    processed = []
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() also collapses whitespace runs and trims both ends,
        # so no separate leading/trailing-space handling is needed.
        item = " ".join(tok for tok in item.split() if tok not in stop_words)
        if len(item) <= 4 or item in seen:
            continue
        if all(ch in _ALLOWED_CHARS for ch in item):
            seen.add(item)
            processed.append(item)
    return processed


def top_terms(cell, limit):
    """Return the terms of the ``limit`` highest-scored entries of ``cell``.

    Args:
        cell: string holding a Python literal sequence of entries whose
            element 0 is the term and element 1 the score.
        limit: maximum number of entries to take.

    Returns:
        list of terms ordered by descending score.
    """
    ranked = sorted(ast.literal_eval(cell), key=lambda kv: kv[1], reverse=True)
    return [entry[0] for entry in ranked[:limit]]


def collect_raw_terms(unigram_cells, ngram_cells):
    """Build the cleaned term list per SDG from the two spreadsheet columns.

    The i-th pair of cells (1-based) is stored under ``"SDG_<i>"``; unigrams
    come before ngrams and the combined list is passed through ``pre_proc``.
    """
    sdg_words_raw = {}
    pairs = zip(unigram_cells, ngram_cells)
    for number, (unigram_cell, ngram_cell) in enumerate(pairs, start=1):
        terms = top_terms(unigram_cell, TOP_UNIGRAMS)
        terms += top_terms(ngram_cell, TOP_NGRAMS)
        sdg_words_raw["SDG_" + str(number)] = pre_proc(terms)
    return sdg_words_raw


def keep_unique_terms(terms_by_sdg):
    """Drop every term that is listed under more than one SDG.

    Returns a new dict with the same keys; each list keeps only the terms
    whose total number of occurrences over all lists is below 2.
    """
    word_freq_dict = {}
    for terms in terms_by_sdg.values():
        for term in terms:
            word_freq_dict[term] = word_freq_dict.get(term, 0) + 1
    return {
        sdg: [term for term in terms if word_freq_dict[term] < 2]
        for sdg, terms in terms_by_sdg.items()
    }


def main():
    """Read the spreadsheet, clean the terms and write the JSON output."""
    df = pd.read_excel("OECD_SDG_betas.xlsx")
    sdg_words_raw = collect_raw_terms(
        list(df["Keywords_Positive"]), list(df["Ngrams_Positive"])
    )
    sdg_words = keep_unique_terms(sdg_words_raw)
    sdg_words["SDG_3"] = [
        term for term in sdg_words["SDG_3"] if term not in bad_sdg3_terms
    ]
    # Context manager so the handle is closed even if the write fails.
    with open("3_ProcessedKeyTerms.json", "w") as file:
        file.write(json.dumps(sdg_words))


if __name__ == "__main__":
    main()
import ast
import json

import pandas as pd

# Symbols that are replaced by a space, and whole tokens that are removed,
# while cleaning a raw key term.
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]

_ALLOWED_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 ")

# Value of the ``sdg`` column in keywords.csv -> SDG label used in the output.
sdg_dict = {
    "partnerships-for-the-goals": "SDG_17",
    "good-health": "SDG_3",
    "no-poverty": "SDG_1",
    "life-below-water": "SDG_14",
    "peace-justice-and-strong-institutions": "SDG_16",
    "decent-work-growth": "SDG_8",
    "responsible-consumption": "SDG_12",
    "climate-action": "SDG_13",
    "industry-innovation-and-infrastructure": "SDG_9",
    "gender-equality": "SDG_5",
    "affordable-energy": "SDG_7",
    "reduced-inequalities": "SDG_10",
    "zero-hunger": "SDG_2",
    "clean-water": "SDG_6",
    "sustainable-cities": "SDG_11",
    "quality-education": "SDG_4",
    "life-on-land": "SDG_15",
}


def pre_proc(list_o_strings):
    """Clean raw key terms and return the unique survivors in input order.

    Each term is lower-cased, every entry of ``replacables_symbols`` is
    replaced by a space, and whitespace-separated tokens listed in
    ``replacables_words`` are removed.  A cleaned term is kept only when it

    * is longer than 4 characters,
    * consists solely of ``a-z``, ``0-9`` and spaces (a term containing any
      other character is dropped as a whole, not stripped of it), and
    * has not been produced by an earlier input term.

    Args:
        list_o_strings: iterable of ``str`` key terms.

    Returns:
        list of cleaned, de-duplicated terms.
    """
    stop_words = frozenset(replacables_words)
    seen = set()  # O(1) duplicate check instead of scanning the result list
    processed = []
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() also collapses whitespace runs and trims both ends,
        # so no separate leading/trailing-space handling is needed.
        item = " ".join(tok for tok in item.split() if tok not in stop_words)
        if len(item) <= 4 or item in seen:
            continue
        if all(ch in _ALLOWED_CHARS for ch in item):
            seen.add(item)
            processed.append(item)
    return processed


def collect_raw_terms(records):
    """Group the lower-cased keywords of every row by SDG.

    Args:
        records: iterable of row dicts.  ``row["sdg"]`` must be a key of
            ``sdg_dict``; ``row["keys"]`` is a string holding a Python
            literal list of dicts, each with a ``"key"`` entry.

    Returns:
        dict mapping ``"SDG_<n>"`` to the list of raw keywords.

    Raises:
        KeyError: if ``row["sdg"]`` is not present in ``sdg_dict``.
    """
    sdg_words_raw = {}
    for row in records:
        bucket = sdg_words_raw.setdefault(sdg_dict[row["sdg"]], [])
        for entry in ast.literal_eval(row["keys"]):
            bucket.append(entry["key"].lower())
    return sdg_words_raw


def keep_unique_terms(terms_by_sdg):
    """Drop every term that is listed under more than one SDG.

    Returns a new dict with the same keys; each list keeps only the terms
    whose total number of occurrences over all lists is below 2.
    """
    word_freq_dict = {}
    for terms in terms_by_sdg.values():
        for term in terms:
            word_freq_dict[term] = word_freq_dict.get(term, 0) + 1
    return {
        sdg: [term for term in terms if word_freq_dict[term] < 2]
        for sdg, terms in terms_by_sdg.items()
    }


def main():
    """Read keywords.csv, clean the terms and write the JSON output."""
    records = pd.read_csv("keywords.csv").to_dict(orient="records")
    sdg_words_raw = {
        sdg: pre_proc(terms)
        for sdg, terms in collect_raw_terms(records).items()
    }
    sdg_words = keep_unique_terms(sdg_words_raw)
    # Context manager so the handle is closed even if the write fails.
    with open("4_ProcessedKeyTerms.json", "w") as file:
        file.write(json.dumps(sdg_words))


if __name__ == "__main__":
    main()
"seropositivity", "food safety", "toxicities", "hiv infections", "mortalite", "terminally ill", "human mortality", "parturition", "death rates", "burials", "meres", "health the population", "confinement", "malarias", "sante publique", "funeral", "adolescent health", "enfermedades", "addiction to tobacco", "mortalidad", "medical personnel", "sante mentale", "salud publica", "maladies", "newborn babies", "natural death", "narcotics", "confinements", "childbirths", "road safety", "enfants", "funerals", "tobacco addiction", "salud mental", "recien nacidos", "lactantes", "muerte", "toxic discharge", "medecins"], "SDG_9": ["innovation", "industrial infrastructure", "industrialization", "research development", "technology", "technologies", "technological innovations", "scientific personnel", "engineering", "innovations", "technological innovation", "industrial innovation", "industrial infrastructures"], "SDG_5": ["girls", "participation women", "position women", "gender based violence", "harmful traditional practices", "unpaid work", "female circumcision", "women managers", "political participation", "excision", "situation women", "excisions", "sexual violence", "violence against women"], "SDG_1": ["standard living"], "SDG_11": [], "SDG_2": ["sustainable agriculture", "food shortage", "hunger", "malnutrition", "food security", "agricultural policy", "economic policy", "starvation", "food production", "food price", "agricultural development", "famine", "food insecurity", "food prices", "access to food", "food availability", "agricultural policies", "economic policies", "multifunctional agriculture", "economic choices", "food shortages", "local food production", "starvations"], "SDG_6": ["use water", "water management", "use waters"], "SDG_16": ["human rights", "civil registration", "birth reporting", "child abuse", "access to information", "corruption", "rule law", "bribery", "legal protection", "public information", "detained persons", "administration justice", "societe 
civile", "protection human rights", "corrupt practices", "public bodies", "public institutions", "human rights violations", "corruptions"], "SDG_4": ["educational facilities", "right to education", "computer literacy", "teacher", "scholarships", "preschool education", "educational financing", "teacher training", "teachers", "kindergarten", "kindergartens", "professors", "professor", "educacion", "teaching staff", "educational buildings"], "SDG_7": ["sustainable energy", "energy market", "66 energy", "renewable energy sources", "electrification", "energy resources", "energy", "energy efficiency", "environmentally sound technology", "energies", "energy sector", "fuels", "alternative energy sources", "power sector", "energy supply", "energy supplies", "fuel efficiency", "energy sectors", "energy markets", "clean technologies", "energy efficiencies", "fuel resources"], "SDG_15": ["terrestrial ecosystem", "terrestrial ecosystems", "deforestation", "mountain ecosystems", "freshwater ecosystem", "national parks reserves", "protected area", "desertification", "national park", "forest", "forests", "biological diversity", "forest ecosystems", "biodiversity", "endangered species", "protected areas", "diversidad biologica", "biodiversidad", "bosque", "species diversity", "woodland", "woodlands", "national parks", "biosphere reserves", "freshwater ecosystems", "nature reserves", "biodiversite", "salinisation", "alpine ecology", "alpine ecosystems"], "SDG_13": [], "SDG_17": ["partnership", "internet", "national budget", "economic support", "resources mobilization", "external debt", "debt servicing", "programme evaluation", "programme ownership", "partnerships", "state budget", "third world"], "SDG_14": ["oceans seas", "ocean", "marine life", "marine resources", "marine environment", "marine ecosystems", "marine environments", "oceans", "marine ecology", "oceanos", "marine fauna", "sea resources"], "SDG_10": ["social security", "income distribution", "income inequalities", 
import json

import pandas as pd

# Symbols that are replaced by a space, and whole tokens that are removed,
# while cleaning a raw key term.
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]

_ALLOWED_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 ")


def pre_proc(list_o_strings):
    """Clean raw key terms and return the unique survivors in input order.

    Each term is lower-cased, every entry of ``replacables_symbols`` is
    replaced by a space, and whitespace-separated tokens listed in
    ``replacables_words`` are removed.  A cleaned term is kept only when it

    * is longer than 4 characters,
    * consists solely of ``a-z``, ``0-9`` and spaces (a term containing any
      other character is dropped as a whole, not stripped of it), and
    * has not been produced by an earlier input term.

    Args:
        list_o_strings: iterable of ``str`` key terms.

    Returns:
        list of cleaned, de-duplicated terms.
    """
    stop_words = frozenset(replacables_words)
    seen = set()  # O(1) duplicate check instead of scanning the result list
    processed = []
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() also collapses whitespace runs and trims both ends,
        # so no separate leading/trailing-space handling is needed.
        item = " ".join(tok for tok in item.split() if tok not in stop_words)
        if len(item) <= 4 or item in seen:
            continue
        if all(ch in _ALLOWED_CHARS for ch in item):
            seen.add(item)
            processed.append(item)
    return processed


def collect_raw_terms(records):
    """Group the lower-cased ``Concept`` and ``Keyword`` cells by SDG.

    A cell counts as empty when ``str(cell) == "nan"``.  A non-empty ``Goal``
    cell ("Goal 3" -> "SDG_3") selects the SDG for that row and for the
    following rows whose ``Goal`` cell is empty.

    NOTE(review): the nesting of the original loop is not recoverable from
    the flattened source; carrying the goal forward is assumed - confirm
    against the layout of the spreadsheet.

    Args:
        records: iterable of row dicts with ``Goal``, ``Concept`` and
            ``Keyword`` keys.

    Returns:
        dict mapping ``"SDG_<n>"`` to the list of raw terms.
    """
    sdg_words_raw = {}
    sdg = None
    for row in records:
        if str(row["Goal"]) != "nan":
            sdg = row["Goal"].replace("Goal ", "SDG_")
            sdg_words_raw.setdefault(sdg, [])
        if sdg is None:
            # No goal seen yet: there is nothing to attach the terms to.
            continue
        for column in ("Concept", "Keyword"):
            if str(row[column]) != "nan":
                sdg_words_raw[sdg].append(row[column].lower())
    return sdg_words_raw


def keep_unique_terms(terms_by_sdg):
    """Drop every term that is listed under more than one SDG.

    Returns a new dict with the same keys; each list keeps only the terms
    whose total number of occurrences over all lists is below 2.
    """
    word_freq_dict = {}
    for terms in terms_by_sdg.values():
        for term in terms:
            word_freq_dict[term] = word_freq_dict.get(term, 0) + 1
    return {
        sdg: [term for term in terms if word_freq_dict[term] < 2]
        for sdg, terms in terms_by_sdg.items()
    }


def main():
    """Read the spreadsheet, clean the terms and write the JSON output."""
    records = pd.read_excel("LinkedSDG_DocumentExtracts.xlsx").to_dict(
        orient="records"
    )
    sdg_words_raw = {
        sdg: pre_proc(terms)
        for sdg, terms in collect_raw_terms(records).items()
    }
    sdg_words = keep_unique_terms(sdg_words_raw)
    # Context manager so the handle is closed even if the write fails.
    with open("5_ProcessedKeyTerms.json", "w") as file:
        file.write(json.dumps(sdg_words))


if __name__ == "__main__":
    main()
"public procurement", "democratic republic congo", "tel mail", "secondary education", "job search", "vocational education training", "unite kingdom"], "SDG_5": ["violence woman", "millennium development goals", "holistic approach", "male female", "essential element", "vocational training", "criminal justice", "small medium sized", "venture capital", "lessons learned", "domestic violence", "post conflict"], "SDG_6": ["analytical method", "soil erosion", "river lake", "water sanitation", "saudi arabia", "wastewater treatment plant", "surface water", "heavy metal", "czech republic", "flood risk", "wastewater treatment", "river basin", "lung cancer", "van den", "supply sanitation", "service provider", "drink water", "urban rural", "add value", "iceland norway", "extreme weather"], "SDG_7": ["added value", "sole responsibility", "remedial action", "free charge", "table contents", "food chain", "oil gas", "wind power", "medical device"], "SDG_8": ["latin american", "energy efficiency", "joint venture", "family life", "northern ireland", "solve problem", "gender balance", "gender gap", "medium term", "migrant worker", "ministry finance", "corporate governance"], "SDG_9": ["air transport", "gender perspective", "emission trading", "electric vehicle", "capital formation", "artificial intelligence", "republic korea", "millennium development goal", "import export", "ex post", "prime minister", "high speed", "freight transport", "scientific technological", "guinea bissau", "high tech"], "SDG_10": ["ethnic minority", "annex iii", "cros border", "root cause", "past decade", "elderly people", "commodity price", "disposable income", "minimum wage"], "SDG_11": ["directorate general", "better understanding", "france germany", "urban mobility", "executive summary", "south east asia", "intellectual property right", "fuel consumption", "high spee", "cash flow", "south west"], "SDG_12": ["ministry agriculture", "win win", "mm mm", "convention biological diversity", "medium sized", 
"north atlantic", "air pollutant", "fish stock"], "SDG_14": ["marine ecosystem", "biodiversity loss", "north sea", "baseline scenario", "mid term", "bosnia herzegovina", "fishing vessel", "papua new guinea", "longer term", "fax mail"], "SDG_15": ["fossil fuel", "low carbon economy", "kyoto protocol", "easily accessible", "damage cause", "genetic resource", "north east", "central eastern europe", "non discriminatory", "internet thing", "motor vehicle", "mitigation adaptation", "search engine", "solar panel", "biodiversity ecosystem"], "SDG_16": ["van der", "personal datum", "ad hoc"], "SDG_17": ["peer review", "natural resources", "road traffic", "communicable disease", "policy makers"]} -------------------------------------------------------------------------------- /raw_data/0_add/01_add_generated/7_EC_Policy_Doc_Terms/7_process_key_terms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu May 15 15:47:41 2020 5 | 6 | @author: lukas 7 | """ 8 | 9 | import pandas as pd 10 | import json 11 | from tqdm import tqdm 12 | 13 | import ast 14 | 15 | replacables_symbols = ["&" , "-" , '"' , " "] 16 | replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"] 17 | 18 | def pre_proc( list_o_strings ): 19 | """ 20 | Keeps only the keywords longer than 4 characters ; 21 | Strips non Alphanumeric chars ; 22 | Removes basic interluding words ( "and" , "of" , etc. 
import ast
import json

import pandas as pd

# Symbols that are replaced by a space, and whole tokens that are removed,
# while cleaning a raw key term.
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]

_ALLOWED_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 ")

# "Goal_<n>" -> "SDG_<n>" for the 17 goals.  Generated instead of typed out:
# the hand-written table mapped "Goal_13" to "SDG_12", which merged goal 13
# into SDG_12 and left SDG_13 without any terms.
number_map = {"Goal_" + str(n): "SDG_" + str(n) for n in range(1, 18)}


def pre_proc(list_o_strings):
    """Clean raw key terms and return the unique survivors in input order.

    Underscores are turned into spaces, the term is lower-cased, every entry
    of ``replacables_symbols`` is replaced by a space, and
    whitespace-separated tokens listed in ``replacables_words`` are removed.
    A cleaned term is kept only when it

    * is longer than 4 characters,
    * consists solely of ``a-z``, ``0-9`` and spaces (a term containing any
      other character is dropped as a whole, not stripped of it), and
    * has not been produced by an earlier input term.

    Args:
        list_o_strings: iterable of ``str`` key terms.

    Returns:
        list of cleaned, de-duplicated terms.
    """
    stop_words = frozenset(replacables_words)
    seen = set()  # O(1) duplicate check instead of scanning the result list
    processed = []
    for item in list_o_strings:
        item = item.replace("_", " ").lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() also collapses whitespace runs and trims both ends,
        # so no separate leading/trailing-space handling is needed.
        item = " ".join(tok for tok in item.split() if tok not in stop_words)
        if len(item) <= 4 or item in seen:
            continue
        if all(ch in _ALLOWED_CHARS for ch in item):
            seen.add(item)
            processed.append(item)
    return processed


def collect_raw_terms(records):
    """Group the n-grams of every row by SDG.

    The part of ``row["Goal"]`` before the first "." is looked up in
    ``number_map``; ``row["SDG&EC_NgramsOverlap"]`` is a string holding a
    Python literal collection of n-grams.

    Rows that resolve to the same SDG are accumulated.  The original
    assigned each row's list over the freshly initialised one, so only a
    single row per SDG survived.

    Raises:
        KeyError: if the goal prefix is not present in ``number_map``.
    """
    sdg_words_raw = {}
    for row in records:
        number = row["Goal"].split(".")[0]
        sdg = number_map[number]
        ngrams = list(ast.literal_eval(row["SDG&EC_NgramsOverlap"]))
        sdg_words_raw.setdefault(sdg, []).extend(ngrams)
    return sdg_words_raw


def keep_unique_terms(terms_by_sdg):
    """Drop every term that is listed under more than one SDG.

    Returns a new dict with the same keys; each list keeps only the terms
    whose total number of occurrences over all lists is below 2.
    """
    word_freq_dict = {}
    for terms in terms_by_sdg.values():
        for term in terms:
            word_freq_dict[term] = word_freq_dict.get(term, 0) + 1
    return {
        sdg: [term for term in terms if word_freq_dict[term] < 2]
        for sdg, terms in terms_by_sdg.items()
    }


def print_summary(header, terms_by_sdg):
    """Print ``header``, the number of terms per SDG and the overall total."""
    print(header)
    counter = 0
    for key, value in terms_by_sdg.items():
        print(key, " : ", len(value))
        counter += len(value)
    print("Overall : ", counter)


def main():
    """Read the spreadsheet, clean the terms, write the JSON and report."""
    # Imported here so the helpers above stay importable without tqdm.
    from tqdm import tqdm

    dfl = pd.read_excel("ECPolicyDocs_Ngrams REVISED.xlsx").to_dict(
        orient="records"
    )
    sdg_words_raw = collect_raw_terms(tqdm(dfl))
    print_summary("Key Words Identified before cleaning : ", sdg_words_raw)

    sdg_words_raw = {
        sdg: pre_proc(terms) for sdg, terms in sdg_words_raw.items()
    }
    sdg_words = keep_unique_terms(sdg_words_raw)

    # Context manager so the handle is closed even if the write fails.
    with open("7_ProcessedKeyTerms.json", "w") as file:
        file.write(json.dumps(sdg_words))

    print_summary("Key Words Identified after cleaning: ", sdg_words)


if __name__ == "__main__":
    main()
) ; 23 | Deduplicates 24 | """ 25 | 26 | processed = [] 27 | alpha = "abcdefghijklmnopqrstuvwxyz0123456789 " 28 | for item in list_o_strings : 29 | item = item.replace("_" , " ") 30 | item = item.lower() 31 | 32 | for c in replacables_symbols: 33 | item = item.replace( c , " " ) 34 | item_p = item.split() 35 | item = " ".join(i for i in item_p if i not in replacables_words) 36 | 37 | if all( c in alpha for c in item ) : 38 | if item.startswith( " " ) : 39 | item = item[ 1: ] 40 | if item.endswith( " " ) : 41 | item = item[:-1] 42 | if len(item) > 4 : 43 | if item not in processed: 44 | processed.append( item ) 45 | return processed 46 | 47 | dfl = pd.read_excel("sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx").to_dict( orient = "records") 48 | 49 | 50 | number_map = {"SDG 1" : "SDG_1" , 51 | "SDG 2" : "SDG_2", 52 | "SDG 3" : "SDG_3", 53 | "SDG 4" : "SDG_4", 54 | "SDG 5" : "SDG_5", 55 | "SDG 6" : "SDG_6", 56 | "SDG 7" : "SDG_7", 57 | "SDG 8" : "SDG_8", 58 | "SDG 9" : "SDG_9", 59 | "SDG 10" : "SDG_10", 60 | "SDG 11" : "SDG_11", 61 | "SDG 12" : "SDG_12", 62 | "SDG 13" : "SDG_12", 63 | "SDG 14" : "SDG_14", 64 | "SDG 15" : "SDG_15", 65 | "SDG 16" : "SDG_16", 66 | "SDG 17" : "SDG_17", 67 | } 68 | 69 | sdg_words_raw = {} 70 | 71 | for row in tqdm(dfl) : 72 | number = row['SDG'] 73 | sdg = number_map[ number ] 74 | if sdg not in sdg_words_raw.keys() : 75 | sdg_words_raw[ sdg ] = [] 76 | sdg_words_raw[ sdg ].append( row["keyword"]) 77 | extras = str(row["extra"]).split("|") 78 | sdg_words_raw[sdg]+= extras 79 | 80 | counter = 0 81 | print("Key Words Identified before cleaning : " ) 82 | for key , value in sdg_words_raw.items() : 83 | print( key , " : ", len(value)) 84 | counter += len(value) 85 | 86 | print("Overall : ", counter) 87 | 88 | #%% 89 | for key , value in sdg_words_raw.items() : 90 | sdg_words_raw[ key ] = pre_proc( value ) 91 | 92 | #%% 93 | """ 94 | Deduplicating keywords 95 | """ 96 | word_freq_dict = {} 97 | for val in list(sdg_words_raw.values()) : 98 | 
for v in val : 99 | if v not in word_freq_dict : 100 | word_freq_dict[ v ] = 1 101 | else: 102 | word_freq_dict[ v ] += 1 103 | 104 | #%% 105 | sdg_words = {} 106 | for key , value in sdg_words_raw.items() : 107 | plh = [ i for i in value if word_freq_dict[i] < 2] 108 | sdg_words[ key ] = plh 109 | 110 | #%% 111 | js = json.dumps( sdg_words ) 112 | file = open( "9_ProcessedKeyTerms.json" , "w") 113 | file.write( js ) 114 | file.close() 115 | 116 | counter = 0 117 | print("Key Words Identified after cleaning: " ) 118 | for key , value in sdg_words.items() : 119 | print( key , " : ", len(value)) 120 | counter += len(value) 121 | 122 | print("Overall : ", counter) 123 | -------------------------------------------------------------------------------- /raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx -------------------------------------------------------------------------------- /raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo].xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo].xlsx -------------------------------------------------------------------------------- /raw_data/0_add/02_add_all_to_all/10_PPMI_boost/10_process_fos.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | data_1 = pd.read_excel("SDG FOS updated 06 01.xlsx").to_dict(orient="records") 6 | data_2 = pd.read_excel("SDG FOS updated 06 
import pandas as pd
import json


# Workbooks merged into 10_ProcessedFOS.json, in processing order.
SOURCE_WORKBOOKS = ("SDG FOS updated 06 01.xlsx", "SDG FOS updated 06 12.xlsx")


def collect_fos(row_sets):
    """
    Group (FOS number, FOS name) pairs by SDG label.

    *row_sets* is an iterable of row collections; every row is a mapping with
    the keys 'SDG number', 'FOS number' and 'FOS name'.  Rows whose
    'SDG number' is missing are skipped.  The FOS number is stored as a
    string, the SDG label is "SDG_<integer>".
    """
    sdg_words = {}
    for rows in row_sets:
        for row in rows:
            sdg_number = row['SDG number']
            # pd.isna covers NaN as well as None / pd.NA; comparing str(x)
            # with "nan" let the latter two through to int() and crashed.
            if pd.isna(sdg_number):
                continue
            sdg = f"SDG_{int(sdg_number)}"
            sdg_words.setdefault(sdg, []).append((str(row['FOS number']), row["FOS name"]))
    return sdg_words


def sort_fos_by_name(sdg_words):
    """Return a copy of *sdg_words* (same key order) with each list sorted by FOS name."""
    return {
        sdg: sorted(entries, key=lambda entry: entry[1])
        for sdg, entries in sdg_words.items()
    }


def report_counts(title, sdg_words):
    """Print *title*, the number of entries per SDG and the overall total."""
    print(title)
    counter = 0
    for key, value in sdg_words.items():
        print(key, " : ", len(value))
        counter += len(value)
    print("Overall : ", counter)


def main():
    """Read both workbooks and write 10_ProcessedFOS.json."""
    row_sets = [
        pd.read_excel(workbook).to_dict(orient="records")
        for workbook in SOURCE_WORKBOOKS
    ]

    sdg_words = collect_fos(row_sets)
    report_counts("Key Words Identified before cleaning : ", sdg_words)

    sdg_words = sort_fos_by_name(sdg_words)
    with open("10_ProcessedFOS.json", "w") as file_:
        json.dump(sdg_words, file_)

    # No entries are removed between the two reports; the second one is kept
    # so the console output matches the other processing scripts.
    report_counts("Key Words Identified after cleaning: ", sdg_words)


if __name__ == "__main__":
    main()
from tqdm import tqdm

import json
import pandas as pd


FNAME_PROCESSED_KEY_TERMS = "8_ProcessedFOS.json"


if __name__ == '__main__':
    # Only three columns of the reviewed sheet are used; exact duplicate
    # rows are dropped straight away.
    columns = ['FOS NAME', 'FOS NUMBER', 'SDG']
    fos_data = pd.read_excel('NABS_FOS_update_2020-08-20_ed_VS.xlsx')[columns].drop_duplicates()

    # A FOS number with at least one 'NOT RELEVANT' row is excluded entirely,
    # including any of its rows that carry an SDG.
    marked_not_relevant = fos_data['SDG'] == 'NOT RELEVANT'
    ignore_fos = fos_data[marked_not_relevant]['FOS NUMBER'].unique()
    relevant_rows = fos_data[~fos_data['FOS NUMBER'].isin(ignore_fos)]

    # SDG label -> list of (FOS id as string, FOS name)
    sdg_fos = dict()
    for fos_name, fos_id, sdg_nr in tqdm(relevant_rows.values):
        sdg_fos.setdefault(f'SDG_{sdg_nr}', []).append((str(fos_id), fos_name))

    # Sort every list by FOS name; the labels are visited in numeric order.
    for sdg_label in sorted(sdg_fos.keys(), key=lambda label: int(label.split('_')[-1])):
        entries = sdg_fos[sdg_label]
        sdg_fos[sdg_label] = sorted(entries, key=lambda entry: entry[1])

    with open(FNAME_PROCESSED_KEY_TERMS, 'w') as file_:
        json.dump(sdg_fos, file_)
"SDG_7"]], "551662922": [["SDG_12", "SDG_8"]], "152494472": [["SDG_6", "SDG_2"]], "198072978": [["SDG_6", "SDG_2"]], "156634047": [["SDG_6", "SDG_2"]], "39464130": [["SDG_6", "SDG_2"], ["SDG_13", "SDG_2"]], "109332788": [["SDG_11", "SDG_10"]], "21790881": [["SDG_6", "SDG_14"], ["SDG_12", "SDG_14"]], "160934017": [["SDG_6", "SDG_2"]], "141650431": [["SDG_6", "SDG_2"]], "109162521": [["SDG_6", "SDG_2"]], "2779819667": [["SDG_11", "SDG_12"]], "7083945": [["SDG_11", "SDG_12"]], "20529654": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "85675897": [["SDG_2", "SDG_2"], ["SDG_6", "SDG_2"], ["SDG_13", "SDG_2"], ["SDG_15", "SDG_2"]], "205726622": [["SDG_6", "SDG_2"]], "120991184": [["SDG_6", "SDG_2"]], "7959160": [["SDG_6", "SDG_2"]], "175963888": [["SDG_6", "SDG_2"]], "162902727": [["SDG_6", "SDG_2"]], "53421856": [["SDG_6", "SDG_2"]], "81751973": [["SDG_12", "SDG_8"]], "121923324": [["SDG_6", "SDG_2"]], "120217122": [["SDG_15", "SDG_2"]], "29510844": [["SDG_6", "SDG_2"]], "70957220": [["SDG_6", "SDG_2"]], "2776266027": [["SDG_6", "SDG_7"], ["SDG_11", "SDG_7"]], "141185391": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "154414509": [["SDG_6", "SDG_2"]], "2780189059": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "126408429": [["SDG_2", "SDG_2"], ["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "172365310": [["SDG_6", "SDG_2"]], "2779449393": [["SDG_11", "SDG_16"]], "560292": [["SDG_12", "SDG_8"]], "71864017": [["SDG_6", "SDG_2"]], "24649204": [["SDG_12", "SDG_8"]], "38070178": [["SDG_12", "SDG_8"]], "78285338": [["SDG_12", "SDG_8"]], "2777382958": [["SDG_12", "SDG_8"]], "33411773": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "5589519": [["SDG_6", "SDG_2"]], "3742959": [["SDG_6", "SDG_2"]], "53706860": [["SDG_6", "SDG_2"]], "172817999": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "2777638134": [["SDG_11", "SDG_4"]], "58395597": [["SDG_6", "SDG_2"]], "3963096": [["SDG_12", "SDG_8"]], "2777276756": [["SDG_12", "SDG_8"]], "156086215": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "14171219": 
import json
import pandas as pd
import re


def sdg_labels(text):
    """
    Return an "SDG_<n>" label for every run of digits in *text*, in order.

    *text* is converted with ``str`` first, so a numeric cell such as ``2``
    is handled like the text "2".
    """
    return [f'SDG_{sdg_nr}' for sdg_nr in re.findall(r'\d+', str(text))]


def build_replace_map(rows):
    """
    Build the FOS id -> [[from_sdg, to_sdg], ...] replacement table.

    *rows* yields (fos_id, replace_from, to_sdg) triples.  Every SDG number
    found in *replace_from* is paired with the first SDG number found in
    *to_sdg*.  Keys are ``str(fos_id)``, the same convention
    12_process_replace_fos.py uses.  A fos_id whose *replace_from* contains
    no number still gets an (empty) entry.

    Raises ValueError when *to_sdg* contains no SDG number.
    """
    replace_fos = dict()
    for fos_id, replace_from, to_sdg in rows:
        targets = sdg_labels(to_sdg)
        if not targets:
            raise ValueError(f'no target SDG number for FOS {fos_id!r}: {to_sdg!r}')
        to_label = targets[0]

        pairs = replace_fos.setdefault(str(fos_id), [])
        pairs.extend([from_label, to_label] for from_label in sdg_labels(replace_from))
    return replace_fos


def main():
    """Read the review workbook and write 11_ReplaceFOS.json."""
    df = pd.read_excel('osdg_fos_paper_citation_counts_REPLACE_v2_ed_VS.xlsx')
    replace_fos = build_replace_map(df[['fos_id', 'sdgs', 'replace_to']].values)

    with open('11_ReplaceFOS.json', 'w') as file_:
        json.dump(replace_fos, file_)


if __name__ == '__main__':
    main()
"114426456": [["SDG_2", "SDG_15"]], "125596622": [["SDG_2", "SDG_15"]], "55312793": [["SDG_2", "SDG_15"]], "205726622": [["SDG_2", "SDG_15"]], "2910302653": [["SDG_2", "SDG_15"]], "53145804": [["SDG_2", "SDG_15"]], "139669111": [["SDG_2", "SDG_15"]], "162902727": [["SDG_2", "SDG_15"]], "70957220": [["SDG_2", "SDG_15"]]} -------------------------------------------------------------------------------- /raw_data/1_replace/12_Review_2020-10-02/12_process_replace_fos.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | replace_fos = dict() 6 | 7 | df = pd.read_csv('replace-review_2020-10-02.csv') 8 | for fos_id, _, from_sdg, to_sdg in df.values: 9 | fos_id = str(fos_id) 10 | if fos_id not in replace_fos.keys(): 11 | replace_fos[fos_id] = [] 12 | replace_fos[fos_id].append([from_sdg, to_sdg]) 13 | 14 | with open('12_ReplaceFOS.json', 'w') as file_: 15 | json.dump(replace_fos, file_) 16 | -------------------------------------------------------------------------------- /raw_data/1_replace/12_Review_2020-10-02/replace-review_2020-10-02.csv: -------------------------------------------------------------------------------- 1 | fos_id,fos_name,from_sdg,to_sdg 2 | 53421856,Soil biology,SDG_2,SDG_15 3 | 160934017,Soil type,SDG_2,SDG_15 4 | 29510844,Soil chemistry,SDG_2,SDG_15 5 | 7959160,Soil biodiversity,SDG_2,SDG_15 6 | 198072978,Soil pH,SDG_2,SDG_15 7 | 152494472,Soil classification,SDG_2,SDG_15 8 | 120991184,Soil structure,SDG_2,SDG_15 9 | 14171219,Agricultural soil science,SDG_2,SDG_15 10 | 156634047,Soil horizon,SDG_2,SDG_15 11 | 159390177,Soil science,SDG_2,SDG_15 12 | 50516716,Soil test,SDG_2,SDG_15 13 | 100474770,Soil physics,SDG_2,SDG_15 14 | 53706860,Soil morphology,SDG_2,SDG_15 15 | 58395597,Red soil,SDG_2,SDG_15 16 | 5589519,Soil series,SDG_2,SDG_15 17 | 175963888,Soil texture,SDG_2,SDG_15 18 | 2909722689,Soil Pollutants,SDG_2,SDG_15 19 | 172365310,Soil microbiology,SDG_2,SDG_15 20 | 
3742959,Soil survey,SDG_2,SDG_15 21 | 65580899,Soil contamination,SDG_2,SDG_15 22 | 63696750,USDA soil taxonomy,SDG_2,SDG_15 23 | 182745123,Soil gradation,SDG_2,SDG_15 24 | 71864017,Soil map,SDG_2,SDG_15 25 | 174200844,Unified Soil Classification System,SDG_2,SDG_15 26 | 160212601,World Reference Base for Soil Resources,SDG_2,SDG_15 27 | 152100882,Soil color,SDG_2,SDG_15 28 | 104471815,Digital soil mapping,SDG_2,SDG_15 29 | 39464130,Soil carbon,SDG_2,SDG_15 30 | 114426456,Soil thermal properties,SDG_2,SDG_15 31 | 125596622,Soil resilience,SDG_2,SDG_15 32 | 55312793,National Cooperative Soil Survey,SDG_2,SDG_15 33 | 205726622,Soil mechanics,SDG_2,SDG_15 34 | 2910302653,SOIL EXPOSURE,SDG_2,SDG_15 35 | 53145804,Soil food web,SDG_2,SDG_15 36 | 139669111,Understory,SDG_2,SDG_15 37 | 162902727,Soil conditioner,SDG_2,SDG_15 38 | 70957220,Soil compaction,SDG_2,SDG_15 39 | -------------------------------------------------------------------------------- /raw_data/1_replace/ReplacedFOS.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/1_replace/ReplacedFOS.xlsx -------------------------------------------------------------------------------- /raw_data/2_remove/20_FP7-4-SD_edited/20_RemoveFOS.json: -------------------------------------------------------------------------------- 1 | {"SDG_1": ["198428699", "170659323", "46578552", "2780196416", "47344431", "2780650499", "2780877353", "2780774518", "193395930", "2781201427", "2778149918", "129963666"], "SDG_2": ["2910099511", "2908895846", "2909158028", "47806933", "2910592767", "2779491563", "2775988993", "18419278", "3480121", "2908684217", "2777626052", "2776423422", "138256199", "65607591", "2779910751", "2777970171", "53007507", "74359761", "2908707527", "2776574205", "2910521719", "2776928777", "2776941976", "2779910956", "2777403496", "2908614360", "2909333666", "2909520665", 
"2780687331", "58808276", "2909745607", "115220002", "148257392", "2778020220", "106165879", "2780114722", "2910394173", "2910354950", "2780118905", "138020889", "2779814899", "100521375", "37662734", "97483426", "25260931", "2909271431", "2908574822", "51672120", "164065428", "94919774", "2778080818", "38749836", "2781415353", "2909072068", "2909807485", "2775937386", "2777974810", "78161392", "2910479360", "138399698", "120562766", "185671874", "2776744078", "2776709667", "50919411", "65257409", "2781418482", "2777621572", "173837035", "2910092765", "37228920", "170959889", "2909553954", "98536072", "135300049", "21249469", "2777220787", "2780471819", "2779634897", "549605437", "42525527", "2775840374", "2909076198", "27254500", "122735190", "2908952963", "101434241", "130947863", "2910985744", "2776968180", "190638079", "2776020993", "149172842", "2780017871", "2775834976", "120912362", "2910220031", "192233252", "102744134", "107211472", "2779160288", "2910444920", "2909153497", "44283249", "540751848", "2910258602", "2910321295", "175965649", "2779704485", "79850504", "2777958824", "2778365744", "2778172956", "134864226", "4733338", "60635243", "195454712", "78945660", "2781445593", "2777481183", "2778600265", "148196450", "2910992960", "2910222286", "2777926330", "2779091883", "2910342552", "2776482837", "2780051701", "2781446643", "2780015948", "2777103877", "2778599437", "11804174", "2777264270", "2780170040", "2776903312", "175895763", "2779287493", "39257715", "2909270706", "63412515", "2909934379", "2776319399", "2779459076", "2776008721", "2910734704", "2780030891", "2776665880", "2777963300", "60544836", "2911159031", "2909039982", "64417066", "2777849792", "2910859984", "2776609134", "2781314072", "2909612347", "2779168029", "168402607", "2778793514", "2779652696", "117534273", "183688256", "2779316952", "2908972231", "2910200824", "2910994361", "2909856885", "2776809771", "2780535588", "131923401", "2910940979", "2910538944", "2777271545", 
"2776338311", "2776773308", "2781281389", "2778438103", "2781375701", "2778184291", "2776294106", "111280770", "2909571003", "2778274352", "2777714445", "148846489", "70295763", "2780608908", "147716585", "26355699", "2779218938", "2777647554", "2777250853", "2776317494", "2777469154", "169490005", "9111530", "2777584449", "101512455", "2776947765", "206625514", "45051096", "2777330291", "2781396848", "162947575", "176035894", "48306297", "125900194", "124219066", "2778175407", "2780684046", "2780643479", "2778532622", "152747807", "201437064", "53702515", "2780805685", "2776905826", "196467688", "2776266440", "174253337", "190930322", "202964095", "167887339", "11039648", "141983198", "87841596", "153876917", "2776922509", "2910336849", "2781289450", "2776077682", "2777060948", "80323366", "2779046117", "2779862049", "2779741023", "2776222705", "2780597670", "2777140777", "2779740938"], "SDG_3": ["2910288937", "25166345", "2909111439", "167908162", "2776888527", "2779363792", "198738867", "111684460", "133462117", "91632574", "73751289", "148449293", "2910560156", "2910427492", "2911021130", "2910860471", "30439317", "108074857", "63540848", "2909518570", "2778444522", "2775940519", "2779372377", "2779870758", "2779728303", "2779308462", "2777814067", "2781145028", "37098654"], "SDG_4": ["2780623789", "2909064599", "2910043827", "132758656", "2908885425", "146804397", "24845683", "58346731", "2910324923", "2780550299", "2910181414", "2779686014", "2908678694", "117893075", "74279204", "78015137", "2776675903", "59364581", "61189997", "2780852648", "172905872", "2910043429", "173481278", "2909755642"], "SDG_5": ["2776596443", "541189924", "2908821035", "2909253651", "2778307344", "2779621813", "2781437166"], "SDG_6": ["2779732133", "59269818", "78302586", "544153396", "39442485", "2781128188", "11999413", "108597893", "2781026758", "2778323849"], "SDG_7": ["2911104624", "2776122723", "2775918509", "2909376016", "148718273", "49848784", "2776581130", "138417311", 
"2776611462", "25915539", "141842801", "112505250", "2778835581", "2910577901", "2909450372", "2908874825", "136649699", "584957", "19766214", "2777382002", "33840335", "147441545", "2779941319", "74824818", "115957382", "2910439062", "2778321654", "2780339557", "2908683193", "199873434", "2780331013", "126172416", "2781309322", "118635694", "2910681606", "2775974325", "202446494", "2909542240", "131747538", "108615695", "2777708149", "118732332", "2909269005", "2780452421", "33134510", "2776909254", "97508593", "151771877", "193809577", "192668324", "2908591035", "2908610585", "123380192", "138171918", "183912175", "2778944020", "99611785", "169574100", "42067281", "166151169", "45872418", "197301865", "2910564024", "2910306918", "2781056475", "2778348927", "125171110", "195534400", "2776365744", "2777045768", "3839877", "14642617", "2778431730", "83160514", "2780611706", "2777071705", "85909142", "79675319", "53914812", "2778776584", "2778334255", "2777890241", "78246475", "47645306", "145460709", "159851900", "88743934", "169961344", "2781333068", "57177791", "2910445384", "2910822426", "2780839634", "2781030502", "2776588390", "63969886", "2780251136", "170133592", "123977732", "15569618", "3283095", "2780611830", "132868160", "2776917865", "505695854", "2779700286", "179036041", "127288500", "51926234", "2778927675", "59329165", "162168397", "106189395", "2778383842", "179733262", "2776892096", "26324664", "2908784896", "191186522", "2909205303", "19966478", "2780688951", "2908749873", "2778569793", "113740612", "81877898", "2776810965", "159795486", "2909187471", "2909741741", "14447218", "199364081", "121629672", "95930237", "148651041", "2777155145", "130207615", "17098449", "42812", "12701381", "54017597", "93552971", "35995877", "126789939", "2780942248", "31771446", "54932901", "201999631", "2776069950", "2780638000", "2909937733", "136155141", "49304495", "166194698", "93953391", "42021957", "108848220", "188116033", "120398109", "2777126586", 
"62467634", "58896106", "56985126", "21552470", "28472234", "161028810", "2778772182", "2911016986", "192299074", "37530146", "74222875", "2779110910", "17371274", "22116519", "53645450", "82979123", "2776228582", "46787917", "83204339", "107863493", "2777027713", "29621489", "188087704", "2779503484", "2778958889", "60439489", "2777373712", "112578098", "75003639", "2777328224", "151948712", "29652920", "173182743", "29310469", "145597803", "2777134600", "98943031", "154864947", "2780949067", "167310744", "162681261", "52121051", "101188967", "2908581237", "103753734", "2780778756", "2777742759", "2779117831"], "SDG_8": ["2776444593", "2909413384", "27591710", "2778896325", "2911011203", "2910222570", "2910326028", "2780775167", "2911164255"], "SDG_9": ["137099501", "2909452073", "2778097690", "2775925408", "529335014"], "SDG_10": ["102268210", "105152847", "169900460", "61641136", "2910665876", "16976872", "2780776881", "2778078003"], "SDG_11": ["2776902872", "2776673659", "2908674967", "24856439", "183283035", "2778839380", "2779323829", "100675267", "2776689096", "2776508417", "23221634", "2778977993", "2778920248", "2779661781", "75461684", "2777614519", "111603439", "49221354", "2855170", "151890184", "2781255199", "80309976", "2777161741", "2911048674", "2779627320", "29279314", "2775976938", "2909020933", "44263959", "2779636881", "121684219", "550222582", "120208923", "81302111", "2780302256", "2777131152", "2777111354", "53232910", "100368936", "171730128", "80583463", "5072461", "200046510", "81961946", "536930464", "2777488192", "6506403", "120938966", "2778132726", "126082660", "2777440324", "173870130", "18030348", "80368990", "54855816", "2776816662", "125470083", "2778717691", "110604844", "2775886207", "2910286708", "2911120092", "120352889", "122224866", "97250363", "2780940931", "23680986", "2777262768", "2780575044", "2778269189", "128226362", "201743585", "73340581", "7149132", "2778790543", "2776389138", "53160558", "2778497495", "2911224752", 
"2908570632", "2781461753", "2781099003", "81667532", "50415386", "120588126", "207821765", "2776974013", "2225880", "38439746", "550607084", "2781281093", "107157880", "172438305", "2917558", "39511330", "150506046", "107119854", "69423932", "37350624", "206019424", "2779652681", "2779529265", "2780015235", "2778414658", "50637493", "2780665216", "189360488", "2777328387", "10245270", "2779473934", "103648661", "16189245", "167752473", "11360483", "102792161", "132373408", "2779313563", "2777041775", "2780761308", "2779962852", "2779313700", "20756127", "2910820772", "2777877904", "2779720300", "68640439", "2777735972", "131979681", "2778330474", "2909681832", "2909395910", "2781293718", "21457203", "2908664457", "2908818157", "7856111", "2780066083", "2780423321", "2777988118", "2778821660", "193450905", "2776219102", "2776160632", "148699463", "2775873933", "2775937711", "2776408593", "197553423", "2779279276", "2776764004", "2779462066", "2780444441", "2777048483", "2779722824", "98200471", "14390630", "46737286", "2910792664", "85148207", "2910477109", "200749887", "2910310371", "2777346527", "2775932640", "2781257993", "2909546771", "2911038400", "72355985", "40350719", "2776485071", "168443057", "2780775721", "142442999", "43227947", "2777447984", "193759585", "2776432661", "194229684", "57341113", "2909072158", "52226264", "2909398177", "2777817495", "13743948", "107779570", "137990359"], "SDG_12": ["23138022", "2778253041", "2776596069", "2779338949", "193596192", "84859931", "2777091700", "2780596747", "2777247137", "20820323", "162853370", "72104268", "2776558947", "525650276", "2779738550", "169093310", "63257944", "36067731", "188468808", "148027575", "62232509", "120302269", "101230327"], "SDG_14": ["78275445", "79158427", "2776665970", "173656711", "19889080", "2780309369", "2908618603", "530175646", "2909005227", "29275276", "202824567", "2909048777", "79334102", "51865526", "45942800", "2777721721", "2779086188", "2777894483", "2780042314", 
"2780583818", "2910866688", "205649164", "16405173", "73525677", "150012506", "2776582039", "102315692", "8182607", "155484110", "204259536", "103500101", "2909697453", "2779919027", "2775922648", "200401390", "37202355", "191506330", "52146309", "171276312", "2780660560", "153440673", "2778206238", "2781330656", "166423231", "2778199754", "2910510794", "2353230", "190703929", "50311922", "58341921", "201490090", "185809878", "164120249", "51450119"], "SDG_15": ["2778049214"], "SDG_16": ["527821871", "2780786045", "33791563", "2781107206", "143425610", "2776982550", "157686319", "9201690", "15758519", "2778906372", "69828861", "104177525", "117353447", "22674136", "2776112939", "2909746666", "174943157", "189809214", "35550292", "105585729", "191393472", "2779254040", "100102862", "31829608", "106544461", "2779686019", "123045823", "43067198", "203165030", "2908548367", "47607710", "2780513914", "2909318246", "2909123673", "2910269103", "2779186577", "9514381", "2779847632", "201762086", "2776614250", "91760546", "120144228", "106737062", "2779566273", "202796686", "2777010668", "2909947951", "2911000069", "2909328758", "146870623", "85014361", "2780351192", "2780262311", "2779387731", "102375830", "2778804986", "65067816", "2781205572", "2908766468", "2779401785", "164172150", "2779359390", "166003498", "2776987467", "2778159086", "164663123", "2910200502", "2777810591", "171906077", "2909832105", "181149355", "33884865", "178489894", "84525096", "2910472664", "134174499", "2780320074", "2910273717", "2778166725", "2780270224", "2909434199", "2909423120", "2778484313", "2778654863", "89198739", "2781351580", "101959639", "971699", "4698774", "2776498708", "77019957", "94915269", "194072897", "82922719", "2781357168", "2779813694", "2776622343", "169796023", "2542834", "167225187", "104383817", "2779129001", "2777257180", "140006998", "74363100", "2778456923", "2910395371", "2779311591", "2908611806", "98940541", "204016326", "185429906", "2910001868", 
"2909498615", "110921888", "2910704000", "2909609750", "2780049918", "2909902876", "2909804582", "2781195161", "2909314849", "2778436418", "2909600298", "154238967", "2780721665", "76144217", "2778618615", "2199051", "2910471639", "2777490532", "2776459999", "116251930", "130731218", "44750222", "2911010606", "200797679", "164995936", "202292293", "168406668", "48295401", "56906370", "74556096", "2911193946", "2780732888", "140505726", "2909075684", "111498074", "2778605688", "2908834839", "63854197", "2910259063", "2777475166", "2779916870", "75114861", "188649462", "12780434", "68307924", "2781138619", "2775899829", "186835682", "2910173640", "2777240490", "72320291", "2778286736", "69258756", "2776833093", "105409693", "2776905153", "2781115736", "118867912", "2776942576", "33222762", "555379026", "2778290591", "2779881993", "37672646", "8397983", "2779270055", "2909874202", "556297831", "60008888", "33326189", "2778983686", "2779184870", "91435432", "177821555", "2781105336", "2779677046", "12365522", "112138406", "40046163", "138207750", "31901060", "2746353", "2778921735", "2910953355", "2776824162", "2776987546", "2779872411", "162571340", "200185824", "130684572", "21442874", "106289968", "137975842", "27426343", "124568556", "544833334", "2779608074", "200909587", "2778186200", "2909624168", "2908573047", "174127684", "2909263554", "2308441", "2778532584", "86037889", "174176344", "100203831", "2779395397", "196491621", "2777299998", "160776313", "2776911219", "61871575", "2778759178", "128805008", "144090359", "111964698", "44083865", "50091055", "144486260", "167275870", "2775928558", "151989614", "2779079919", "2779793503", "2909933650", "2780098792", "67174900", "2780052528", "2780967490", "2778898898", "2908793332", "2780005421", "110739175", "83516960", "2910491271", "14982408", "50747538", "2780358027", "152568617", "89136471", "184186437", "73649233", "84952885", "50776230", "147027905", "78299736", "134801348", "53076038", "2779714858", 
"2909071857", "2780656832", "2780542009", "137405303", "16759151", "203094294", "98893333", "123326733", "123583881", "18362487", "35788789", "183617614", "2779073994", "138569888", "2779792404", "2778517334", "2777008152", "183680338", "151120012", "2780342482", "115910719", "153692070", "134066672", "145097563", "35637245", "2776623338", "185822510", "2776983043", "2776486069", "155051475", "2780668467", "2777286522", "8643368", "2777657240", "2780003111", "190771501", "2781196315", "26834231", "2777482191", "196156399", "73712438", "71743495", "186293655", "2908556616", "67666897", "167693441", "175968658", "2910490378", "142944206", "158531012", "5395021", "22241219", "120892966", "191795016", "27826464", "103232671", "129724132", "75398719", "2778989422", "10929652", "154038757", "2779989747", "164226766", "1026927", "118248890", "144709373", "176258234", "2778908344", "83975546", "2908584300", "2777842450", "2780986262", "158154518", "2908935257", "2910048382", "27357055", "56281022", "2781330901", "2780164666", "2775892892", "2777200438", "2909309735", "107584723", "67226441", "45567728", "2777549818", "2779619698", "45737032", "2777646408", "2779751349", "114445506", "18396474", "13652956", "131275738", "2776711565", "2779280868", "150018143", "179302884", "2780723106", "148704626", "83616695", "45326173", "49289754", "114938261", "74370796", "145804949", "2779300802", "2910486168", "2911024786", "2776898426", "2909377819", "115314053", "196690852", "60136833", "2777671340", "45355965", "2777629068", "25566979", "149091818", "2776831955", "124219066", "167900197", "2779288016", "107027933", "2777611316", "2776904728", "2778764671", "2778579508", "146667757", "102213258", "2778216119", "203133693", "2780648150", "2776540713", "38635669", "13459763", "180727682", "37771279", "2776040635", "154800190", "2777480472", "2780968727", "2776831232", "206149592", "202775310", "147346212", "2780164529", "2778565663", "51945325", "22760457", "173836518", "2776889888", 
"165751822", "2776548393", "181169782", "151211776", "106030495", "2910110944", "2780507753", "182964821", "2778150766", "95713431", "8020162", "85946185", "127613066", "59241245", "97200028", "75773760", "94643802", "180932941", "124086997", "2910431462", "2779352166", "2779886121", "36914074"]} -------------------------------------------------------------------------------- /raw_data/2_remove/20_FP7-4-SD_edited/20_process_remove_fos.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | df = pd.read_csv('bad_fos.csv') 5 | df = df.drop_duplicates(['sdg', 'fos_id']) 6 | 7 | remove_fos = dict() 8 | for sdg_label, fos_id, _ in df.values: 9 | if sdg_label not in remove_fos.keys(): 10 | remove_fos[sdg_label] = [] 11 | remove_fos[sdg_label].append(str(fos_id)) 12 | 13 | with open('20_RemoveFOS.json', 'w') as file_: 14 | json.dump(remove_fos, file_) 15 | -------------------------------------------------------------------------------- /raw_data/2_remove/21_8_NABS_FOS/21_process_remove_fos.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | df = pd.read_excel('NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx') 5 | not_relevant_fos = df['fos_number'].unique().tolist() 6 | remove_fos = { 7 | f'SDG_{sdg_nr}': list(map(lambda fos_id: str(fos_id), not_relevant_fos)) 8 | for sdg_nr in range(1, 18) 9 | } 10 | 11 | with open('21_RemoveFOS.json', 'w') as file_: 12 | json.dump(remove_fos, file_) 13 | -------------------------------------------------------------------------------- /raw_data/2_remove/21_8_NABS_FOS/NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/21_8_NABS_FOS/NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx 
-------------------------------------------------------------------------------- /raw_data/2_remove/22_TJL-24_review/22_RemoveFOS.json: -------------------------------------------------------------------------------- 1 | {"SDG_3": ["71924100", "86803240", "141071460", "142724271", "203014093", "1862650", "177713679", "159047783", "54355233", "159110408", "126838900", "98274493", "2779134260", "121608353", "160735492", "115260700", "526734887", "44228677", "199360897", "133462117", "2779281246", "510538283", "56318395", "205545832", "206836424", "100243477", "32546565", "2779139147", "23131810", "58874564", "56995899", "196697905", "186187911", "529173508", "544657597", "508106653", "2777566558", "505241676", "67649825", "147080431", "15952604", "192144188", "2776714187", "19351080", "551968917", "82789193", "2776780178", "536738050", "22467394", "142757262", "2776459890", "2780503075", "141379421", "42781572", "134659438", "150594956", "174618031", "17235551", "2776866176", "104819515", "147224300", "183469790", "2776321320", "111852164", "2777935641", "111998727", "106847996", "82381507", "2778372188", "65545243", "2776980637", "104122410", "2781460079", "169274487", "2776093513", "14185376", "2779402116", "158592959", "179179568", "2775968528", "181907467", "2780901251", "2776960227", "519991488", "2775933685", "37413474", "89008666", "2776933761", "2776056205", "2780771206", "62826618", "44221107", "70352696", "155911762", "2779466056", "57477423", "135448224", "68710425", "44403221", "147717901", "196777733", "11105738", "39424602", "2778617687", "50952357", "540938839", "2779138802", "141795571", "2776875633", "2781005124", "2781402376", "192039680", "141239990", "2780554537", "152236973", "151286553", "128717455", "2779809887", "10165471", "125938697", "205380661", "2779898584", "122881758", "2780312654", "2778605646", "193641492", "2779909984", "117009084", "116856471", "125198404", "163688568", "2780718992", "2781040256", "2779825147", "151054161", 
"2780762185", "78780964", "2780754355", "2779033964", "37752577", "20860254", "2779227060", "28225019", "2776970089", "185298936", "196467688", "108905452", "69357855", "118316555", "128644962", "148325268", "140764562", "199561411", "125450847", "2777365067", "2777413408", "25070020", "57805442", "72404758", "82740854", "162466561", "2779905828", "104335537", "199529486", "2776471321", "19163912", "123688308", "2776971686", "2777317252", "2781249807", "2778267616", "145417883", "26573533", "2777215511", "2780407094", "20129857", "101812284", "58916441", "2781044819", "187642187", "2776165558", "2776196091", "16895185", "2780944729", "5041914", "2779310008", "555789112", "81758059", "2781083359", "2779201015", "83100098", "197712280", "69505689", "2777913276", "82484044", "2776591724", "166936260", "2777780933", "91632574", "191364105", "2781426373", "40442364", "2776753347", "2778974597", "52173716", "2777299493", "2780263730", "2777226368", "2777289228", "112098571", "2780330291", "28722885", "34929307", "99762115", "20901353", "156983192", "2908586218", "150670458", "2777967479", "187696735", "94176051", "2908751799", "88606150", "2780822005", "115174607", "2910010793", "91790935", "2777072894", "2908926047", "127634017", "2776695260", "33010914", "73751289", "2780935168", "2909563789", "2778454149", "2778280450", "2778646529", "2908924136", "201033657", "108318186", "2780553527", "176672177", "2910782172", "172710988", "2777936119", "2909859419", "189812789", "36080966", "75458452", "2776451152", "2780931059", "2780391353", "2776955481", "2776395653", "2909397458", "2779974081", "2775908279", "179852193", "2781461121", "2777179688", "2781206205", "24493144", "2908803427", "135935922", "2777191628", "2779561794", "2780186313", "2778580320", "2778279454", "2777429807", "2779401766", "203731517", "2780757305", "2778004377", "2778070212", "2781333626", "105099762", "508295664", "39154926", "2777601251", "2776586755", "2777006632", "18986850", "2778248277", 
"2778936159", "44980441", "2779051267", "84792229", "2909767253", "14471711", "123741691", "185734982", "2778696743", "2909875802", "141388940", "2776346358", "2910755469", "2777076221", "1060249"], "SDG_9": ["127413603"], "SDG_11": ["15708023", "24890656", "2908647359", "166957645", "2549261", "198394728", "99454951", "149923435", "20625102", "129727815", "2780165032", "115901376", "536315585", "179454799", "2780781376", "202372285", "64413873", "43126263", "76775654", "2775896111", "163428354", "62908951", "22590252", "49876356", "2778073708", "105636585", "42045870", "116081451", "49999975", "2780743171", "556340858", "84250820", "108257041", "1813318", "176165272", "2776280689", "159032367", "2781145028", "130076159", "2778165684", "2781007418", "4590074", "2776928176", "141371185", "83854009", "2777068528", "2776870768", "2780278329", "2779436609", "78390623", "2775838644", "2777790407", "19994219", "2779356876", "2781316319", "57097009", "2781112155", "35647692", "201052633", "122173349", "86811826", "2780210451", "141321718", "2779711381", "2776676706", "2777617796", "2778842010", "2781119000", "2778205265", "26623033", "2909633619", "39014021", "144199811", "204431084"], "SDG_4": ["33923547", "509550671", "150394285", "103208741", "147077947", "55958113", "184356942", "108583219", "542530943", "2779018934", "2777189325", "2776526686", "134290984", "175801342", "154482161", "86637286", "522453465", "2780035574", "521786372", "507981020", "2777075199", "23588892", "138296749", "2781051278", "188116033", "2777603413", "197676734", "2776934989", "148324565", "2780732545", "165364887", "2778325511", "2776622967", "2779106483", "164403151", "2777244724", "8077954", "2779063172", "2779961193", "2909116566", "2781349114", "2909931160", "2777841733", "85597727", "2778197446", "204814006", "2910150694", "164449429"], "SDG_6": ["39432304", "201289731", "138921699", "87717796", "107826830", "41625074", "111368507", "91375879", "54286561", "523546767", "521259446", 
"2776256026", "2776053758", "2778357586", "522964758", "125907379", "108469399", "82576440", "130797344", "2779547435", "131046424", "188287460", "158836135", "499616599", "36574619", "143020374", "2781287369", "93983250", "52201283", "40241539", "2778572946", "16989226", "2776364969", "198428699", "15098985", "2778182573", "108797546", "108628306", "51832835", "547231352", "68359772", "153102810", "205537798", "130950616", "154261466", "2779282177", "2778148510"], "SDG_10": ["162324750", "119857082", "45355965", "2779119184", "121017731", "51067260", "557691694", "25810664", "166052673", "159176650", "163836022", "162725370", "91093795", "540791928", "2780535194", "109051061", "160333310", "161407221", "84945661", "177309310", "2776354556", "2779625216", "2779201187", "188116033", "191953296", "2776845425", "128963836", "116019233", "97713585", "2909801347", "2908766468", "2781313914", "2909025839", "4162061", "2909439219", "179709323", "2909492420", "100607858", "2779997400", "2776572088", "150432175", "2910289302", "2910645313"], "SDG_2": ["153911025", "77088390", "2779234561", "87976508", "150194340", "105639569", "2779483572", "118643609", "173145845", "549605437", "8673954", "26291073", "140413371", "84699730", "134215735", "30455989", "132964779", "183135511", "2780523633", "35158069", "155739000", "2776841711", "120009192", "2777782036", "2780246931", "2779764123", "123336316", "162889289", "203017698", "93066458", "2778452349", "2780106736", "137555145", "59582021", "54815482", "112939947", "47924181", "105152847", "2778944361", "134068817", "9927688", "2779501324", "91770344", "163588314", "2780238508", "177658893", "98722961", "2777129469", "165237769", "130693829", "2779485152", "2780174665", "8313540", "24144980", "2780907584", "2777858656", "2776111823", "2777617796", "2778733383", "2780745107", "2778359420", "2778200843", "2777416314", "2910375186", "2776672683", "2780871851", "157717039", "23837897", "2776176627", "107888415", "2777794352", 
"2778554304", "2775868463", "2779287364", "2779425982", "91447561", "199724614", "2778896754", "2909399481", "2910933275", "2909524676", "123244313", "2777953396", "33824837", "2909753820", "2779979797", "2910653396", "2778329027", "2911118914", "2781218492", "2908542670", "2909529903", "2909506248", "96105989", "2777438402", "181607587", "2775991992", "2910584990", "2776708618", "2909152114", "2778003962", "2776007641", "2910283248", "2779529612", "2779316989", "2911208417", "2909807485"], "SDG_13": ["18903297", "2780471494", "153294291", "204530211", "23795335", "537208039", "94061648", "521259446", "58874564", "2780805685", "2777822432", "112964050", "147534773", "123403432", "151406439", "127454912", "2778918656", "173651095", "106199856", "143299363", "136020623", "2775840915", "2779900269", "6964187", "20564796", "114148465", "38262639", "545622115", "128849468", "206145494", "25022447", "93785673", "2780211030", "2777605225"], "SDG_12": ["31972630", "206139338", "181199279", "2779851234", "28328180", "146778888", "52121051", "88182573", "201903717", "67203356", "108713360", "2776985865", "44877443", "32198211", "2776943663", "105306849", "2777612826", "45292766", "167740415", "69991583", "2781400479", "2779570065", "160565873", "182566", "199310239", "2779301550", "82753439", "502701156", "143020374", "2779726014", "49326732", "204217086", "2778035492", "15098985", "62960913", "189123395", "503285160", "117185709", "2780518120", "2776002898", "204983608", "2776908094", "2777566824", "92244383", "170828538", "173366509", "76893819", "183682340", "21338462", "2779167034", "2778734332", "41826821", "29140674", "47187476", "99578197", "2780848231", "54276265", "2779293432", "123703457", "196781063", "2779539549", "88959737", "169824061", "39177556", "28613373", "6907630", "104002121", "186673887", "127045886", "80646779", "171988757", "2777637287", "58640550", "7591567", "192045728", "94866938", "2780210451", "2723826", "164495641", "2775953691", "25796384", 
"2779299574", "2776936074", "2780569836", "2910251023", "2777121799", "116197896", "46312889", "150839157", "190362163", "201958364", "2909963963", "2778126675", "2775893736", "5035944", "2910127915", "30543370", "159821036", "2778804209"], "SDG_1": ["36289849", "549605437", "2781426361", "2781061807", "2778452349", "2778054917", "2776672683", "2775868463", "78302928", "2909852078"], "SDG_15": ["144027150", "176933379", "185933670", "78458016", "2776042228", "145097563", "130217890", "68189081", "153823671", "81860439", "126343540", "157021035", "56685638", "36727532", "51244244", "2777904157", "97854310", "2778157034", "16397148", "2778049214", "149207113", "202552767", "139669111", "23119410", "66782513", "55347375", "513535597", "43003075", "2779142801", "521815418", "162012527", "72286879", "173979980", "2779152076", "2778148510"], "SDG_8": ["187212893", "147583825", "158886217", "68189081", "176289848", "2778431023", "2779011557", "111226992", "175700187", "78597825", "506796395", "2776125615", "2779986911", "206713868", "2778556080", "93236110", "2780618658", "2778021871", "2908822358", "2779783368", "46312889", "2775893736", "2776498708", "34099160", "41708089", "2776880170", "49906088", "105578763", "2778381653", "2780836627", "2775876557"], "SDG_17": ["171250308", "56739046", "520434653", "191935318", "66204764", "2910001868", "134560507", "159317903", "2777953023", "530175646", "164767435", "47344431", "190960625", "2778300220", "186229450", "2777113093", "28718268", "87616379", "15845906", "2776060655", "138368954", "2776604539", "199491958", "44171179", "2779015535", "70455891", "2776553905", "99743013", "2777481183", "134632028", "2778711553", "552089266", "2778449271", "206103860", "2910910449", "2780575108", "2780903623", "129275984", "2776577793", "68307924", "2778459265", "2777836882", "160354207", "2781328080", "190539079", "158041659", "2780124536", "2909744077", "2776561884", "198891747", "2780479094"], "SDG_14": ["544153396", "159750122", 
"18918823", "119128265", "76177295", "88862950", "22070199", "197248824", "553184892", "115961737", "2776415932", "88160329", "514928085", "502230775", "152382732", "83419821", "2776023875", "72958200", "23531484", "2779429622", "85721925", "156380964", "68874143", "132543647", "46576788", "111874474", "2777403171", "72634772", "153279818", "49427245", "139369640", "192536144", "143517461", "509746633", "3641667", "82988372", "152613627", "150418976", "2779310246", "2777590139", "2779522410", "2908811810", "14918906", "2909168245", "2776538778", "2908904675", "2780756971", "2776265578", "2911123808", "39077098", "2909168288"], "SDG_7": ["501529594", "544956773", "108225325", "105923489", "90509273", "68801617", "107645774", "75684735", "20788544", "55037315", "151406439", "2777622855", "38677869", "31395832", "2780066083", "2778869765", "78244369", "2779019381", "155373166", "68476402", "509746633", "137851953", "137886200", "33039251", "2780936489", "2777466363", "200630231", "2776970089", "17648541", "80845027", "2778675665", "165998758", "2778330180", "135436540", "2781249646", "37965861", "2778539042", "37415627", "2779252636", "83227832", "86714428", "88417058", "2776058518", "2779200991", "126172416", "38940224", "188818383", "2781309322", "2777586272", "143559376", "9132272", "2776782565", "2779877863", "2776740001", "2779867701", "2779895041"], "SDG_16": ["104267543", "44249647", "171289174", "524765639", "509933004", "46295352", "2777367657", "74501621", "131046424", "112299071", "129603779", "64848388", "2778215748", "2775935494", "2779363069", "162466561", "538473155", "71156930", "710854", "2777963317", "41150092", "2778029865", "156460124", "2778724510", "2780919918", "2910956745", "207035908", "2775982628", "2776363604", "2909494222", "2776686254", "2779889875", "2778736898"], "SDG_5": ["70036468", "55447825", "102587632", "77352025", "46578552", "17632256", "2777973936", "162077342", "104151175", "2780233487", "119693030", "2777667586", "21279758", 
"37512671", "994546", "2777877159", "48057960", "122251271", "2779881493", "2777941463", "2776689383", "102003337", "119588120", "2778071103", "2777177043", "170806853", "2775906418", "2779865128", "2775850206", "140816417", "2780438625", "2775880612", "2776430950"]} -------------------------------------------------------------------------------- /raw_data/2_remove/22_TJL-24_review/22_process_remove_fos.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import re 4 | 5 | df = pd.read_excel('osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx') 6 | 7 | remove_fos = dict() 8 | for sdg_to_remove, fos_id in df[['remove', 'fos_id']].values: 9 | sdg_to_remove = map(lambda sdg_nr: f'SDG_{sdg_nr}', re.findall(r'\d+', sdg_to_remove)) 10 | for sdg_label in sdg_to_remove: 11 | if sdg_label not in remove_fos.keys(): 12 | remove_fos[sdg_label] = [] 13 | remove_fos[sdg_label].append(str(fos_id)) 14 | 15 | with open('22_RemoveFOS.json', 'w') as file_: 16 | json.dump(remove_fos, file_) 17 | -------------------------------------------------------------------------------- /raw_data/2_remove/22_TJL-24_review/osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/22_TJL-24_review/osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx -------------------------------------------------------------------------------- /raw_data/2_remove/23_Restructuring_review/23_RemoveFOS.json: -------------------------------------------------------------------------------- 1 | {"SDG_2": ["2777480484", "2994333706"], "SDG_15": ["192241223", "176943803", "113754120", "2994352824", "3020462461", "2992165118", "120806208", "64551749", "194187813", "64015301", "2777480484", "2778364563", "2983333560", "74250896", "109902934"], "SDG_11": 
["2779548549", "64004221", "2776756561", "79420006", "2994396486", "2776489436", "19096712", "46585869", "2986229148", "2779286702", "2777152325", "2779028214", "2776902267", "76155785", "7131667", "18533594", "2909931525", "184386139", "2909614546", "111943024", "29760336", "2779725038", "2909978109", "5455396", "113145756", "7991579", "2776288101", "86532276", "49304495", "2776941537", "2908570603", "86085837", "2779220109", "2910447950", "45012715", "52069626", "171276312", "2781190202", "103060789", "46135064", "27157697", "3017795126", "2778289769", "2780512908", "2776323365", "74211669", "2780805606", "103189561", "3020114046", "2910432382", "2776576667", "2780273121", "34349720", "2777362114", "110069353", "71839028", "162044005", "2775889553", "96926464", "192126672"], "SDG_14": ["2908583363", "2910560996", "2911073633", "20992447", "2910628358", "1189109488"], "SDG_4": ["2777626052"], "SDG_16": ["133462117"], "SDG_3": ["2778853725", "2908832293", "2908520703", "2908999294"], "SDG_9": ["2779424974"], "SDG_7": ["151174772", "1034443"]} -------------------------------------------------------------------------------- /raw_data/2_remove/23_Restructuring_review/23_process_remove_fos.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | 5 | fname = '23_RemoveFOS.json' 6 | 7 | remove_fos = dict() 8 | 9 | df = pd.read_excel('sdg-fos_restructuring-v3_to-remove.xlsx') 10 | 11 | for _, vals in df.iterrows(): 12 | sdg, fos_id = vals['sdg'], str(vals['fos_id']) 13 | if sdg not in remove_fos.keys(): 14 | remove_fos[sdg] = set() 15 | remove_fos[sdg].add(fos_id) 16 | 17 | for sdg, foses in remove_fos.items(): 18 | remove_fos[sdg] = list(foses) 19 | 20 | 21 | with open(fname, 'w') as file_: 22 | json.dump(remove_fos, file_) 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- 
/raw_data/2_remove/23_Restructuring_review/sdg-fos_restructuring-v3_to-remove.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/23_Restructuring_review/sdg-fos_restructuring-v3_to-remove.xlsx -------------------------------------------------------------------------------- /raw_data/2_remove/24_Review_2020-10-02/24_RemoveFOS.json: -------------------------------------------------------------------------------- 1 | {"SDG_2": ["38774213", "156086215", "126408429", "182124840", "33411773", "172817999", "141185391", "20529654", "42731165", "59804570", "116370137", "126589399", "2780528068", "15147509", "31568149", "2776562576", "24518262", "23519681", "53002841", "197320908", "33283694", "2780189059", "133382796", "2619416", "201401522", "2776107028", "2778625682", "107394435", "2780816530", "28631016", "108216600", "192392207", "2776500793", "2779004245", "97137747", "2775966360", "34070608", "2780086105", "64229544", "2777707638", "91354502", "2776285232", "2776554196", "147103442", "87621631", "119249163", "155987862", "2775841215", "2777106113", "155015343", "2776492830", "89295123", "63651461", "121850381", "198979508", "154702282", "154575652", "150436541", "152491559", "2775999090", "153427425", "2776596991", "123917164", "2777399377", "32120771", "93944068", "62158283", "555313981", "2779128174", "2780946806", "2777380357", "118694661", "60989497", "25382069", "2781208722", "2777472530", "54625482", "2780696901", "59898753", "39571515", "126914827", "173795300", "2776978901", "2776278397", "2777387638", "532801124", "139518226", "2778361644", "2776801807", "2777132354", "2776054349"]} -------------------------------------------------------------------------------- /raw_data/2_remove/24_Review_2020-10-02/24_process_remove_fos.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import json 3 | 4 | 5 | remove_fos = dict() 6 | 7 | df = pd.read_csv('remove-review_2020-10-02.csv') 8 | for fos_id, _, rm_sdg in df.values: 9 | fos_id = str(fos_id) 10 | if rm_sdg not in remove_fos.keys(): 11 | remove_fos[rm_sdg] = [] 12 | remove_fos[rm_sdg].append(fos_id) 13 | 14 | with open('24_RemoveFOS.json', 'w') as file_: 15 | json.dump(remove_fos, file_) 16 | 17 | -------------------------------------------------------------------------------- /raw_data/2_remove/24_Review_2020-10-02/remove-review_2020-10-02.csv: -------------------------------------------------------------------------------- 1 | fos_id,fos_name,from_sdg 2 | 38774213,Soil fertility,SDG_2 3 | 156086215,Soil ecology,SDG_2 4 | 126408429,Soil health,SDG_2 5 | 182124840,Soil organic matter,SDG_2 6 | 33411773,Bulk soil,SDG_2 7 | 172817999,Soil functions,SDG_2 8 | 141185391,Soil retrogression and degradation,SDG_2 9 | 20529654,Topsoil,SDG_2 10 | 42731165,Environmental soil science,SDG_2 11 | 59804570,Soil governance,SDG_2 12 | 116370137,No-till farming,SDG_2 13 | 126589399,Umbrella species,SDG_2 14 | 2780528068,Plant nutrition,SDG_2 15 | 15147509,Conservation reliant species,SDG_2 16 | 31568149,Near-threatened species,SDG_2 17 | 2776562576,Plant strategies,SDG_2 18 | 24518262,Threatened species,SDG_2 19 | 23519681,Montane ecology,SDG_2 20 | 53002841,Plant community,SDG_2 21 | 197320908,Pioneer species,SDG_2 22 | 33283694,Deciduous,SDG_2 23 | 2780189059,Soil stabilization,SDG_2 24 | 133382796,Secondary forest,SDG_2 25 | 2619416,Rainforest,SDG_2 26 | 201401522,Plant cover,SDG_2 27 | 2776107028,Forest dynamics,SDG_2 28 | 2778625682,High forest,SDG_2 29 | 107394435,Quadrat,SDG_2 30 | 2780816530,Forest protection,SDG_2 31 | 28631016,Forest management,SDG_2 32 | 108216600,Tropical and subtropical dry broadleaf forests,SDG_2 33 | 192392207,Clearcutting,SDG_2 34 | 2776500793,Beech,SDG_2 35 | 2779004245,Tilth,SDG_2 36 | 97137747,Forestry,SDG_2 37 | 2775966360,Silviculture,SDG_2 38 | 
34070608,Cover crop,SDG_2 39 | 2780086105,Forest product,SDG_2 40 | 64229544,Habitat destruction,SDG_2 41 | 2777707638,Vascular plant,SDG_2 42 | 91354502,Basal area,SDG_2 43 | 2776285232,Tropical forest,SDG_2 44 | 2776554196,Evergreen forest,SDG_2 45 | 147103442,Forest inventory,SDG_2 46 | 87621631,Taiga,SDG_2 47 | 119249163,Felling,SDG_2 48 | 155987862,Selection cutting,SDG_2 49 | 2775841215,Sustainable forest management,SDG_2 50 | 2777106113,Crop simulation model,SDG_2 51 | 155015343,Plant breeding,SDG_2 52 | 2776492830,Dipterocarpaceae,SDG_2 53 | 89295123,Forest pathology,SDG_2 54 | 63651461,Tropical agriculture,SDG_2 55 | 121850381,Certified wood,SDG_2 56 | 198979508,Forest fragmentation,SDG_2 57 | 154702282,Temperate deciduous forest,SDG_2 58 | 154575652,Reforestation,SDG_2 59 | 150436541,Forb,SDG_2 60 | 152491559,Macroecology,SDG_2 61 | 2775999090,Joint Forest Management,SDG_2 62 | 153427425,Biodiversity hotspot,SDG_2 63 | 2776596991,Cultural methods,SDG_2 64 | 123917164,Bumper crop,SDG_2 65 | 2777399377,DSSAT,SDG_2 66 | 32120771,Ecosystem engineer,SDG_2 67 | 93944068,Phytogeography,SDG_2 68 | 62158283,Species translocation,SDG_2 69 | 555313981,Tropical rainforest,SDG_2 70 | 2779128174,Scots pine,SDG_2 71 | 2780946806,Plant functional type,SDG_2 72 | 2777380357,Rainfed agriculture,SDG_2 73 | 118694661,Climax community,SDG_2 74 | 60989497,Red List Index,SDG_2 75 | 25382069,Seral community,SDG_2 76 | 2781208722,Intercropping,SDG_2 77 | 2777472530,Catch crop,SDG_2 78 | 54625482,Community forestry,SDG_2 79 | 2780696901,Conventional tillage,SDG_2 80 | 59898753,Shrubland,SDG_2 81 | 39571515,Undergrowth,SDG_2 82 | 126914827,Flagship species,SDG_2 83 | 173795300,Salvage logging,SDG_2 84 | 2776978901,Tree breeding,SDG_2 85 | 2776278397,Revegetation,SDG_2 86 | 2777387638,Forestry law,SDG_2 87 | 532801124,Crop protection,SDG_2 88 | 139518226,Sclerophyll,SDG_2 89 | 2778361644,Yield gap,SDG_2 90 | 2776801807,Pinus radiata,SDG_2 91 | 2777132354,Shelterwood cutting,SDG_2 92 
| 2776054349,Vegetation classification,SDG_2 93 | -------------------------------------------------------------------------------- /raw_data/2_remove/25_TOL-7_MostPopularSDG3FOS/25_RemoveFOS.json: -------------------------------------------------------------------------------- 1 | {"SDG_3": ["2777532764", "2909375031", "145642194", "2776556313", "2911127567", "106977388", "2908822358", "512399662", "2910036418", "509550671", "2780433410", "2780877353", "204787440", "86804380", "137992405", "2911023962", "545542383", "2780559412", "2780141013", "110894328", "188884661", "190960625", "502701156", "2777471088", "502991105", "22607594", "2777896191", "2779676829", "2776020993", "2779328685", "33623176", "2779671548", "2777161012", "2910661759", "2910448010", "2910661131", "2815619", "2780550299", "38858142", "2780541811", "14498672", "2777607137", "2779141489", "2909731318", "2779629443", "2910654967", "2776818590", "176656743", "178441611", "2781187916", "156312663", "2778369149", "2908819760", "2909715475", "165998758", "131138744", "2910237699", "2909160651", "2911013501", "140608501", "2780848588", "2777143679", "57177791", "2780646005", "2780589914", "111459926", "2908903645", "2779976542", "2778103839", "2777512617", "2910694641", "2779176400", "2780477921", "2781430560", "145798840", "121246419", "2911093041", "61620210", "2780542330", "161126747", "2777335584", "207006810", "2910950043", "31402265", "2780678043", "156168145", "2781332184", "2776370487", "2780812456", "40722700", "2910151648", "2778957590"]} -------------------------------------------------------------------------------- /raw_data/2_remove/25_TOL-7_MostPopularSDG3FOS/25_process_remove_fos.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | remove_fos = dict() 6 | 7 | df = pd.read_csv('TOL-7_MostPopularSDG3RemoveFOS.csv') 8 | for fos_id, _, rm_sdg in df.values: 9 | fos_id = str(fos_id) 10 | if rm_sdg not in 
remove_fos.keys(): 11 | remove_fos[rm_sdg] = [] 12 | remove_fos[rm_sdg].append(fos_id) 13 | 14 | with open('25_RemoveFOS.json', 'w') as file_: 15 | json.dump(remove_fos, file_) 16 | -------------------------------------------------------------------------------- /raw_data/2_remove/25_TOL-7_MostPopularSDG3FOS/TOL-7_MostPopularSDG3RemoveFOS.csv: -------------------------------------------------------------------------------- 1 | fos_id,fos_name,from_sdg 2 | 2777532764,Research center,SDG_3 3 | 2909375031,Drug Company,SDG_3 4 | 145642194,Health informatics,SDG_3 5 | 2776556313,Downtown,SDG_3 6 | 2911127567,Generic Product,SDG_3 7 | 106977388,Medical research,SDG_3 8 | 2908822358,Organizational Case Studies,SDG_3 9 | 512399662,Family medicine,SDG_3 10 | 2910036418,Patient care team,SDG_3 11 | 509550671,Medical education,SDG_3 12 | 2780433410,Digital health,SDG_3 13 | 2780877353,Health services research,SDG_3 14 | 204787440,Alternative medicine,SDG_3 15 | 86804380,Construction site safety,SDG_3 16 | 137992405,Health administration,SDG_3 17 | 2911023962,Combination Product,SDG_3 18 | 545542383,Medical emergency,SDG_3 19 | 2780559412,Good-morning,SDG_3 20 | 2780141013,Time-out,SDG_3 21 | 110894328,Biomedical technology,SDG_3 22 | 188884661,Active packaging,SDG_3 23 | 190960625,Water treatment,SDG_3 24 | 502701156,Biomedical sciences,SDG_3 25 | 2777471088,Patient advocacy,SDG_3 26 | 502991105,Clinical research,SDG_3 27 | 22607594,Enabling,SDG_3 28 | 2777896191,Patient experience,SDG_3 29 | 2779676829,Connected health,SDG_3 30 | 2776020993,Group work,SDG_3 31 | 2779328685,Patient safety,SDG_3 32 | 33623176,eMix,SDG_3 33 | 2779671548,Interurban,SDG_3 34 | 2777161012,Institutional research,SDG_3 35 | 2910661759,Treatment room,SDG_3 36 | 2910448010,Delivery location,SDG_3 37 | 2910661131,Training skills,SDG_3 38 | 2815619,Continuous training,SDG_3 39 | 2780550299,Job description,SDG_3 40 | 38858142,Aftertaste,SDG_3 41 | 2780541811,Quackery,SDG_3 42 | 14498672,Effective safety 
training,SDG_3 43 | 2777607137,Added sugar,SDG_3 44 | 2779141489,Group home,SDG_3 45 | 2909731318,Home deliveries,SDG_3 46 | 2779629443,Cross-training,SDG_3 47 | 2910654967,Fast foods,SDG_3 48 | 2776818590,Natural Product Research,SDG_3 49 | 176656743,Serving size,SDG_3 50 | 178441611,Training effect,SDG_3 51 | 2781187916,Day care,SDG_3 52 | 156312663,Steering committee,SDG_3 53 | 2778369149,Clinical data management,SDG_3 54 | 2908819760,Food selections,SDG_3 55 | 2909715475,What treatment,SDG_3 56 | 165998758,Imaging technology,SDG_3 57 | 131138744,Completed Staff Work,SDG_3 58 | 2910237699,Device Approval,SDG_3 59 | 2909160651,Delivery - action,SDG_3 60 | 2911013501,Delivery timing,SDG_3 61 | 140608501,Review article,SDG_3 62 | 2780848588,Power Balance,SDG_3 63 | 2777143679,Post and core,SDG_3 64 | 57177791,Imaging science,SDG_3 65 | 2780646005,Trafficability,SDG_3 66 | 2780589914,Ingredient,SDG_3 67 | 111459926,Walk-in,SDG_3 68 | 2908903645,Normal delivery,SDG_3 69 | 2779976542,Case report form,SDG_3 70 | 2778103839,Home management,SDG_3 71 | 2777512617,Staffing,SDG_3 72 | 2910694641,Patient name,SDG_3 73 | 2779176400,Medical food,SDG_3 74 | 2780477921,Chewiness,SDG_3 75 | 2781430560,Food pyramid,SDG_3 76 | 145798840,Process safety management,SDG_3 77 | 121246419,Unlicensed assistive personnel,SDG_3 78 | 2911093041,Her Disease,SDG_3 79 | 61620210,Flame-Sim,SDG_3 80 | 2780542330,Clinical data repository,SDG_3 81 | 161126747,Hot work,SDG_3 82 | 2777335584,N-group (finite group theory),SDG_3 83 | 207006810,Improved water source,SDG_3 84 | 2910950043,Reservoir bag,SDG_3 85 | 31402265,Potential space,SDG_3 86 | 2780678043,Group A,SDG_3 87 | 156168145,Passive fire protection,SDG_3 88 | 2781332184,Payment by Results,SDG_3 89 | 2776370487,Sitting,SDG_3 90 | 2780812456,Cooling down,SDG_3 91 | 40722700,Cluster of differentiation,SDG_3 92 | 2910151648,Negative Test Result,SDG_3 93 | 2778957590,CD19,SDG_3 94 | 
-------------------------------------------------------------------------------- /raw_data/2_remove/RemovedFOS.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/RemovedFOS.xlsx -------------------------------------------------------------------------------- /raw_data/3_blacklist/30_8_NABS_FOS/30_BlacklistFOS.csv: -------------------------------------------------------------------------------- 1 | fos_id,fos_name 2 | 2987034934,Earth crust 3 | 2776698055,Crust 4 | 79572550,Partial melting 5 | 2994012208,Upper crust 6 | 2780356177,Baltic Shield 7 | 56859440,Bouguer anomaly 8 | 2778471503,Basin and Range Province 9 | 2779422593,Soil crust 10 | 2779980370,Magma ocean 11 | 37523158,Hadean 12 | 3017803470,Crust formation 13 | 2780556036,South Pole–Aitken basin 14 | 549698073,Structure of the Earth 15 | 77928131,Tectonics 16 | 33556824,Hydrogeology 17 | 179158327,Palaeogeography 18 | 159719176,Engineering geology 19 | 2776797426,Biogeosciences 20 | 109281948,Stratigraphy 21 | 6363049,Volcanism 22 | 58097730,Subduction 23 | 119477230,Plate tectonics 24 | 2778261408,Eurasian Plate 25 | 2777994876,Pacific Plate 26 | 2777992645,North American Plate 27 | 2779867292,African Plate 28 | 2781207809,South American Plate 29 | 2776184289,Farallon Plate 30 | 180362636,Thrust tectonics 31 | 152972079,Plate reconstruction 32 | 7251660,Structural geology 33 | 50682988,Rift 34 | 201867031,Continental margin 35 | 141646446,Continental crust 36 | 136752280,Geodynamics 37 | 23295444,Shear zone 38 | 110041135,Thrust fault 39 | 97490223,Focal mechanism 40 | 166839181,Graben 41 | 44938399,Petrogenesis 42 | 150999391,Terrane 43 | 92596616,Lineament 44 | 128954607,Tectonophysics 45 | 23923706,Ophiolite 46 | 199007388,Diapir 47 | 16670881,Neotectonics 48 | 67236022,Mantle (geology) 49 | 16942324,Lithosphere 50 | 51151373,Mid-ocean ridge 51 | 
84372278,Peridotite 52 | 11872896,Fractional crystallization (geology) 53 | 44938399,Petrogenesis 54 | 23923706,Ophiolite 55 | 122959257,Seismic tomography 56 | 42796848,Xenolith 57 | 167919410,Metasomatism 58 | 127723449,Core–mantle boundary 59 | 183282558,Kimberlite 60 | 2994012208,Upper crust 61 | 140441402,Carbonatite 62 | 23148476,Seismic anisotropy 63 | 93746451,Mineral redox buffer 64 | 2780356177,Baltic Shield 65 | 22512106,Shear wave splitting 66 | 195081551,Ultramafic rock 67 | 2993808335,Seismic velocity 68 | 2780942940,Stishovite 69 | 2778882853,Phlogopite 70 | 21441200,Mineral physics 71 | 83948199,Incompatible element 72 | 2779980370,Magma ocean 73 | 2993054622,Core formation 74 | 2781390083,Pinctada fucata 75 | 37523158,Hadean 76 | 140230471,Planetary core 77 | 160804572,Silicate perovskite 78 | 2776763651,Ferropericlase 79 | 154802760,Giant impact hypothesis 80 | 2777480983,USArray 81 | 114793014,Geomorphology 82 | 16674752,Mining engineering 83 | 97842125,Rock mechanics 84 | 5166401,Tailings 85 | 41242791,Rock mass classification 86 | 2779096232,Hydraulic fracturing 87 | 175181221,Prospecting 88 | 2777201227,Overburden 89 | 2984157484,Mining industry 90 | 7028197,Gangue 91 | 2776760134,Gold mining 92 | 94236395,Stoping 93 | 184977646,Open-pit mining 94 | 179974421,Rock bolt 95 | 93011207,Geotechnical investigation 96 | 113658590,Muck 97 | 2993323123,Rock slope 98 | 207469975,Shaft mining 99 | 58625266,Lode 100 | 186096623,Ground pressure 101 | 2992974802,Geological exploration 102 | 2779742380,Gabion 103 | 2993134977,Mineral deposit 104 | 102044607,Adit 105 | 2993252152,Rock body 106 | 2991922516,Rock pressure 107 | 2911210907,Support pressure 108 | 2994289516,Geological investigation 109 | 2993102984,Mine planning 110 | 2992547679,Land mine 111 | 2993437602,Rock structure 112 | 2780043312,Hydraulic fill 113 | 2993527706,Salt mine 114 | 2992990004,Solid rock 115 | 2992406196,Waste dump 116 | 26144545,Cut and fill 117 | 2909623323,Waste Dumps 118 | 
2910921642,Mineral industries 119 | 127200247,Hydraulic mining 120 | 2992067306,Mineral potential 121 | 2778524612,Mining law 122 | 66511971,Mining geology 123 | 2781079927,Dimension stone 124 | 2778375701,Ground stone 125 | 2993492720,Gold production 126 | 2778839144,Medical geology 127 | 2992981300,Salt deposit 128 | 2909086881,Stone quarry 129 | 2779795913,Hurrying 130 | 2776629827,Panasqueira 131 | 2992407798,Iron mining 132 | 2992330363,Urban geology 133 | 2778143190,Minnesota Geological Survey 134 | 46517748,Drift mining 135 | 2777425756,Gold panning 136 | 193605714,Steam shovel 137 | 2779070535,Street gutter 138 | 46580973,Blackdamp 139 | 2781121916,Bow drill 140 | 2910477778,DUMP formation 141 | 2779880937,Whinstone 142 | 2910697619,Hearing analyzer 143 | 2911132530,Mine surveyor 144 | 8824402,Landslides vs. Rock strength 145 | 2910164855,Logging car 146 | 2909642594,Shaft (site) 147 | 2910514300,Root stones 148 | 42972112,Veterinary medicine 149 | 2776977481,Dairy cattle 150 | 2776482104,Breed 151 | 194775826,Herd 152 | 134215735,Flock 153 | 523966790,Animal welfare 154 | 2779620486,Tick 155 | 2780505807,Beef cattle 156 | 2779885849,Milking 157 | 173419221,Crossbreed 158 | 2776908094,Anthelmintic 159 | 2778877831,Cryptosporidium 160 | 2991862235,Animal health 161 | 2779557943,Canis 162 | 2908982167,Cattle Diseases 163 | 2780284631,Sire 164 | 2777499811,Ivermectin 165 | 66914385,Poultry farming 166 | 52991690,Culling 167 | 2779329348,Feedlot 168 | 2777976947,Eimeria 169 | 2781368420,Biosecurity 170 | 103797069,Domestic sheep reproduction 171 | 2776082042,Vulpes 172 | 2776222705,Wild boar 173 | 2777963300,Ovis 174 | 2777786777,Flea 175 | 2776247511,Zebu 176 | 2779552062,Roe deer 177 | 3018078696,Dwarf goats 178 | 2777146433,Badger 179 | 2776521926,Brahman 180 | 2780968714,Mange 181 | 2778226015,Capreolus 182 | 2779867394,Bubalus 183 | 2994537864,Human medicine 184 | 2777199308,Louse 185 | 2777474537,Pheasant 186 | 3017754109,Companion animal 187 | 
2991667299,Capra hircus 188 | 2777114023,Withers 189 | 2778134537,Domestic pig 190 | 2780323295,Cervus 191 | 2778002360,Rump 192 | 2779914258,Nili-Ravi 193 | 2994460426,Pig farms 194 | 2910651670,Bird Diseases 195 | 2777151259,Mallophaga 196 | 2780487972,Veterinary parasitology 197 | 2909771501,Goat Diseases 198 | 3020113513,Small ruminant 199 | 2909031412,Gallus gallus domesticus 200 | 2993139054,Water buffalo 201 | 2776960312,Hock 202 | 2911060314,Laboratory Animal Science 203 | 2780727426,Awassi 204 | 3017937595,Dog owners 205 | 168568655,Medical entomology 206 | 2908605944,Meleagris gallopavo 207 | 2777222942,Corriedale 208 | 2778136425,Struthio 209 | 2777225262,Veterinary pathology 210 | 2780460740,Jackal 211 | 2909619495,Food animal 212 | 2909895380,Guinea fowl 213 | 2910990604,Lama glama 214 | 2778856526,Cow-calf 215 | 144027150,Horticulture 216 | 137580998,Crop 217 | 197321923,Cultivar 218 | 2777108408,Sugar 219 | 21410773,Shoot 220 | 100701293,Germination 221 | 88862950,Irrigation 222 | 168741863,Sowing 223 | 2776096895,Seedling 224 | 2780719635,Flavor 225 | 8868529,Taste 226 | 2780618852,Pollen 227 | 2776373379,Chlorophyll 228 | 150668497,Dry weight 229 | 88972607,Human fertilization 230 | 22508944,PEST analysis 231 | 38304854,Manure 232 | 104727253,Biological pest control 233 | 32198211,Greenhouse 234 | 2779678110,Fungus 235 | 161221295,Plant physiology 236 | 45292766,Bark 237 | 133479454,Mycelium 238 | 2780563676,Aroma 239 | 2982966219,Plant growth 240 | 172353545,Ripening 241 | 51417038,Phenology 242 | 168197293,Pollination 243 | 2777461220,Germplasm 244 | 540442320,Pest control 245 | 2780414537,Maple 246 | 115930662,Shelf life 247 | 2778157034,Sorghum 248 | 2780739461,Compost 249 | 2776632002,Legume 250 | 2779824472,Herb 251 | 49799701,Xylem 252 | 2776242653,Pepper 253 | 75639521,Field experiment 254 | 2993531722,Zea mays 255 | 36248471,Seeding 256 | 2778761015,Solanaceae 257 | 2776451879,Infestation 258 | 55969652,photoperiodism 259 | 
2776747608,Brassica 260 | 155868670,Root system 261 | 2776474821,Mushroom 262 | 513193947,Fodder 263 | 2988529969,Cold storage 264 | 46328234,Organoleptic 265 | 2776286235,Phaseolus 266 | 2775976403,Aphid 267 | 74103781,Ornamental plant 268 | 157670687,Postharvest 269 | 2779197568,Sunflower 270 | 85582077,Paddy field 271 | 75296557,Husk 272 | 108010975,Pruning 273 | 137776501,Point of delivery 274 | 178165689,Inflorescence 275 | 83740816,Gibberellin 276 | 2993273313,Chemical control 277 | 185476388,Cotyledon 278 | 2776327621,Flesh 279 | 35496372,Phloem 280 | 2780054949,Spinach 281 | 53007507,Browning 282 | 43143990,Conidium 283 | 6557445,Agronomy 284 | 159750122,Soil water 285 | 2779371384,Biomass 286 | 142796444,Nutrient 287 | 137580998,Crop 288 | 510538283,Phosphorus 289 | 48743137,Organic matter 290 | 197321923,Cultivar 291 | 21410773,Shoot 292 | 88862950,Irrigation 293 | 168741863,Sowing 294 | 161176658,Pesticide 295 | 2780138947,Dry matter 296 | 101000010,Canopy 297 | 46757340,Poaceae 298 | 150668497,Dry weight 299 | 2779587293,Straw 300 | 128758860,Woody plant 301 | 2777612826,Insect 302 | 2777904157,Grazing 303 | 137660486,Growing season 304 | 50660011,Tropics 305 | 150772632,Arid 306 | 2779370140,Forage 307 | 48189365,Hybrid 308 | 81461190,Temperate climate 309 | 32198211,Greenhouse 310 | 2775891814,Weed 311 | 2775835988,Grassland 312 | 2779429622,Litter 313 | 53657456,Peat 314 | 2778053677,Pasture 315 | 2982966219,Plant growth 316 | 141282968,Plant ecology 317 | 132215390,Abiotic component 318 | 540442320,Pest control 319 | 2778157034,Sorghum 320 | 2780739461,Compost 321 | 2776632002,Legume 322 | 24461792,Perennial plant 323 | 75639521,Field experiment 324 | 118518473,Agriculture 325 | 549605437,Food security 326 | 128383755,Agricultural productivity 327 | 3987366,Livelihood 328 | 16397148,Tillage 329 | 139496715,Deforestation 330 | 2988676352,Rural development 331 | 502990516,Agricultural land 332 | 156005406,Subsistence agriculture 333 | 559400886,Land 
management 334 | 122690726,"Land use, land-use change and forestry" 335 | 85675897,Soil management 336 | 123963621,Integrated pest management 337 | 2779220025,Peasant 338 | 13558536,Cropping 339 | 137607661,Land tenure 340 | 2776475172,Soil quality 341 | 183135511,Natural resource management 342 | 157140304,Agrarian society 343 | 71762439,Arable land 344 | 175760724,Crop rotation 345 | 189797535,Drought tolerance 346 | 109162521,Soil conservation 347 | 2989409935,Crop production 348 | 54924851,Sustainable agriculture 349 | 61968832,Animal husbandry 350 | 118817206,Organic farming 351 | 2778852317,Agricultural policy 352 | 183889291,Crop residue 353 | 1670747,Agribusiness 354 | 202050865,Hectare 355 | 2992211155,Grain yield 356 | 2992730755,Agricultural development 357 | 51832835,Environmental management system 358 | 47136581,Agricultural machinery 359 | 112077630,Irrigation management 360 | 37923429,Intensive farming 361 | 120217122,Precision agriculture 362 | 113052830,Land degradation 363 | 17616946,Pastoralism 364 | 2777178263,Land reform 365 | 105462344,Nutrient management 366 | 57664001,Agroecosystem 367 | 2778452349,Rural poverty 368 | 192039558,Biofertilizer 369 | 2777481183,Market access 370 | 507981020,Agricultural education 371 | 207581243,Agrochemical 372 | 156663261,Agroecology 373 | 2993199473,Plant biochemistry 374 | 64476972,Sustainable Agriculture Innovation Network 375 | 112939947,Green Revolution 376 | 129225989,Cash crop 377 | 2778402112,Agricultural extension 378 | 2780117336,Farm income 379 | 2778691696,Dairy farming 380 | 2775898560,Common Agricultural Policy 381 | 141005173,Shifting cultivation 382 | 2993003885,Land area 383 | 27206212,Theology 384 | 4445939,Islam 385 | 17235551,Self 386 | 2775858120,Memoria 387 | 128361363,Symbol 388 | 74256435,Flood myth 389 | 2777617010,Mainstream 390 | 2778692574,Faith 391 | 150152722,Judaism 392 | 521751864,Christian ministry 393 | 161487207,Derecho 394 | 2776211767,Doctrine 395 | 182744844,Metaphysics 
396 | 551968917,Christianity 397 | 2780415144,SAINT 398 | 2778738651,Novelty 399 | 2776050585,Scrutiny 400 | 2779103253,Duty 401 | 2778983918,Wife 402 | 18296254,Skepticism 403 | 133979268,Vision 404 | 143128703,Middle Ages 405 | 2777239683,Virtue 406 | 2780422510,Humanity 407 | 2777122596,Praxis 408 | 2780822299,Soul 409 | 75699723,Buddhism 410 | 102523778,Form of the Good 411 | 2778052875,Bildung 412 | 2780310893,Passion 413 | 2777438998,Tribunal 414 | 152212766,The Republic 415 | 10180917,Conscience 416 | 530479602,Opera 417 | 2777582232,CONTEST 418 | 2781354396,Enthusiasm 419 | 543192267,Magic (paranormal) 420 | 2779438500,Honor 421 | 2778182169,Jako 422 | 111021475,Protestantism 423 | 159789966,Lingua franca 424 | 50379869,Hermeneutics 425 | 173853756,Dialog box 426 | 164105321,Catalan 427 | 32506930,Hegelianism 428 | 129454956,Field research 429 | 2779829227,Vitality 430 | 2776932993,Ethos 431 | 2779728303,Pride 432 | 2780710533,Governo 433 | 169081014,Mysticism 434 | 113522999,Fall of man 435 | 9992130,Pessimism 436 | 2775944640,Utopia 437 | 130979935,Ansatz 438 | 2776684731,Garcia 439 | 91304198,Hebrew 440 | 194105502,Biblical studies 441 | 2777222677,Worship 442 | 2781384534,Gospel 443 | 2776527531,Persian 444 | 2776405206,Revelation 445 | 46610780,Hinduism 446 | 2776134716,Sacrifice 447 | 27362006,Gestalt psychology 448 | 2777477151,Prayer 449 | 2780580889,Panorama 450 | 128536511,History of religions 451 | 2779021329,Destiny 452 | 58348228,Auteur theory 453 | 2776305542,Problema 454 | 2776911728,Courage 455 | 2776727279,Heaven 456 | 92047909,Hyperbolic function 457 | 2781179785,Valencia 458 | 83559648,Croatian 459 | 2778896172,Manifesto 460 | 2781287369,Stuttgart 461 | 534701709,Old Testament 462 | 180903884,Rationalism 463 | 32772713,Charisma 464 | 2992637229,Michel foucault 465 | 2778802261,Orthodoxy 466 | 61783943,Luck 467 | 39511330,Logo 468 | 186857363,Siege 469 | 10869588,Church history 470 | 155785087,Natural law 471 | 2437467,Perfection 472 | 
512654426,Public domain 473 | 73440236,Psyche 474 | 76960060,Umwelt 475 | 164663123,Cosmos 476 | 2776347870,Passions 477 | 9299846,Secularization 478 | 22029948,Dice 479 | 2777776507,Lexico 480 | 558299567,Mass media 481 | 155030161,Mass communication 482 | 48185193,Media 483 | 167275870,Media system dependency theory 484 | 55322685,Media conglomerate 485 | 2781343547,Media Practice Model 486 | 518677369,Social media 487 | 74216064,Social computing 488 | 101293273,User-generated content 489 | 503923677,Social web 490 | 156571341,Cyberpsychology 491 | 60136833,Social media optimization 492 | 2776892586,Brand engagement 493 | 2987376390,Electronic word of mouth 494 | 2776915394,Customer engagement 495 | 2985889538,Social media marketing 496 | 16759151,Online presence management 497 | 2778838397,Uses and gratifications theory 498 | 2778729106,Social media analytics 499 | 2987325470,Social commerce 500 | 2780564743,Social CRM 501 | 2992647939,Consumer engagement 502 | 2993555337,Personal branding 503 | 178408851,Content marketing 504 | 2985692548,Crisis informatics 505 | 2777835648,Filter bubble 506 | 2993426613,Arabic sentiment analysis 507 | 2780695499,Social media mining 508 | 2993240939,Online activism 509 | 2991870026,Social media network 510 | 2780997048,Digital footprint 511 | 2988327197,Online harassment 512 | 2777257828,Virtual archaeology 513 | 2988996608,Online engagement 514 | 3018846106,Fear of missing out 515 | 2988833398,Social event detection 516 | 2993172631,Social news 517 | 2778871292,Social television 518 | 2988622424,Social multimedia 519 | 2780441040,Slacktivism 520 | 196690852,Social analytics 521 | 2988338654,News sharing 522 | 2987800000,Crisis mapping 523 | 2779113645,Like button 524 | 2984648278,Ambient awareness 525 | 2777370179,Social media measurement 526 | 2986426982,Social stream 527 | 2989393167,Personal learning network 528 | 2984029112,Social mining 529 | 2993865493,Social data analytics 530 | 2778412320,iPhoneography 531 | 
529147693,News media 532 | 167752473,News values 533 | 2011517,Broadcast journalism 534 | 42211076,Reliable Sources 535 | 201280247,Newspaper 536 | 167752473,News values 537 | 16189245,News bureau 538 | 2776585538,Agenda-setting theory 539 | 2776973623,Legal deposit 540 | 3019217387,Crime news 541 | 2779111255,History of journalism 542 | 2910250570,Newspapers as Topic 543 | 2779546711,Penny press 544 | 2776757517,Newspaper digitization 545 | 2910534252,Newsclipping 546 | 2780756850,News design 547 | 2779944825,Headlinese 548 | 81959379,Broadcasting 549 | 520681616,Digital television 550 | 9819579,Tuner 551 | 83529365,Broadcast communication network 552 | 58911810,Radio broadcasting 553 | 41062264,Digital broadcasting 554 | 943373,Atomic broadcast 555 | 77757571,Multimedia Broadcast Multicast Service 556 | 2989465874,Broadcast channels 557 | 110157686,Broadcasting (networking) 558 | 2779883265,Electronic program guide 559 | 556509198,Public broadcasting 560 | 2985624630,Broadcasting system 561 | 119452085,Commercial broadcasting 562 | 2780818791,Teletext 563 | 91285054,Broadcast television systems 564 | 2994381574,Broadcast system 565 | 2778749970,Conditional access 566 | 2779106878,Digital audio broadcasting 567 | 2987348774,Broadcast data 568 | 2994534981,Television channel 569 | 2780079832,Digital multimedia broadcasting 570 | 2994104004,Broadcast service 571 | 2776847985,Single-frequency network 572 | 2994466296,Radio program 573 | 183384803,Automatic dependent surveillance-broadcast 574 | 2011517,Broadcast journalism 575 | 2992306869,Satellite television 576 | 2779438827,Television station 577 | 2993676337,Broadcast transmission 578 | 55322685,Media conglomerate 579 | 2992481583,Broadcast time 580 | 2779213998,Mobile television 581 | 196227537,Broadcast transmitter 582 | 2987586235,Multimedia broadcasting 583 | 68163228,Radio Data System 584 | 2991920864,Community radio 585 | 2988547615,Speech summarization 586 | 2777695277,DVB-H 587 | 2982719622,Video 
"""Convert the NABS FOS spreadsheet in this directory to ``30_BlacklistFOS.csv``."""
import pandas as pd

SOURCE_XLSX = 'NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx'
TARGET_CSV = '30_BlacklistFOS.csv'

# Only the id/name pair is exported; the id column is published as `fos_id`.
workbook = pd.read_excel(SOURCE_XLSX)
blacklist = workbook[['fos_number', 'fos_name']].rename(columns={'fos_number': 'fos_id'})

blacklist.to_csv(TARGET_CSV, index=False)
"""Merge every per-source ``*_BlacklistFOS.csv`` into a single ``Blacklist.csv``.

Intended to be run as a script from the directory that holds the source
sub-directories; each sub-directory contributes at most one
``*_BlacklistFOS.csv`` with exactly the columns ``fos_id`` and ``fos_name``.
"""
import os

import pandas as pd

BLACKLIST_COLUMNS = ['fos_id', 'fos_name']
OUTPUT_COLUMNS = BLACKLIST_COLUMNS + ['source']


def assemble_blacklist(root='.'):
    """Collect the blacklist CSVs found in the sub-directories of *root*.

    Sub-directories without a ``*_BlacklistFOS.csv`` file, or whose file has
    unexpected column names, are reported on stdout and skipped.

    Args:
        root: Directory whose immediate sub-directories are scanned.

    Returns:
        A DataFrame with the columns ``fos_id``, ``fos_name`` and ``source``
        (name of the sub-directory the row came from), sorted by ``fos_id``
        and then ``source``. It is empty, with the same columns, when no
        usable file is found.
    """
    frames = []
    for directory in sorted(os.listdir(root)):
        dir_path = os.path.join(root, directory)
        # Keep the "no dot in the name" convention, but also require a real
        # directory: an extension-less file would otherwise make the
        # os.listdir() call below raise NotADirectoryError.
        if '.' in directory or not os.path.isdir(dir_path):
            continue

        # Sorted so the choice is deterministic if several files match.
        candidates = sorted(
            name for name in os.listdir(dir_path) if '_BlacklistFOS.csv' in name
        )
        if not candidates:
            print(f'Blacklist FOS are not processed in {directory}')
            continue

        df = pd.read_csv(os.path.join(dir_path, candidates[0]))
        # Explicit check rather than `assert`, which is removed under
        # `python -O` and would let malformed files through silently.
        if list(df.columns) != BLACKLIST_COLUMNS:
            print(
                f'In directory {directory}\n'
                "*_BlacklistFOS.csv column names must be ['fos_id', 'fos_name']"
            )
            continue

        df['source'] = directory
        frames.append(df)

    if not frames:
        # pd.concat([]) raises, so build the empty result explicitly.
        return pd.DataFrame(columns=OUTPUT_COLUMNS)

    # NOTE(review): rows are not de-duplicated, so a fos_id repeated inside one
    # source file appears repeatedly in the output — confirm whether
    # drop_duplicates() is wanted here.
    combined = pd.concat(frames, axis=0)
    return combined.sort_values(['fos_id', 'source'])


def main():
    """Write the assembled blacklist to ``Blacklist.csv`` in the working directory."""
    assemble_blacklist().to_csv('Blacklist.csv', index=False)


if __name__ == '__main__':
    main()
Rock strength,30_8_NABS_FOS 15 | 8868529,Taste,30_8_NABS_FOS 16 | 9299846,Secularization,30_8_NABS_FOS 17 | 9819579,Tuner,30_8_NABS_FOS 18 | 9992130,Pessimism,30_8_NABS_FOS 19 | 10180917,Conscience,30_8_NABS_FOS 20 | 10869588,Church history,30_8_NABS_FOS 21 | 11872896,Fractional crystallization (geology),30_8_NABS_FOS 22 | 13558536,Cropping,30_8_NABS_FOS 23 | 16189245,News bureau,30_8_NABS_FOS 24 | 16397148,Tillage,30_8_NABS_FOS 25 | 16670881,Neotectonics,30_8_NABS_FOS 26 | 16674752,Mining engineering,30_8_NABS_FOS 27 | 16759151,Online presence management,30_8_NABS_FOS 28 | 16942324,Lithosphere,30_8_NABS_FOS 29 | 17235551,Self,30_8_NABS_FOS 30 | 17616946,Pastoralism,30_8_NABS_FOS 31 | 18296254,Skepticism,30_8_NABS_FOS 32 | 21410773,Shoot,30_8_NABS_FOS 33 | 21410773,Shoot,30_8_NABS_FOS 34 | 21441200,Mineral physics,30_8_NABS_FOS 35 | 22029948,Dice,30_8_NABS_FOS 36 | 22508944,PEST analysis,30_8_NABS_FOS 37 | 22512106,Shear wave splitting,30_8_NABS_FOS 38 | 23148476,Seismic anisotropy,30_8_NABS_FOS 39 | 23295444,Shear zone,30_8_NABS_FOS 40 | 23923706,Ophiolite,30_8_NABS_FOS 41 | 23923706,Ophiolite,30_8_NABS_FOS 42 | 24461792,Perennial plant,30_8_NABS_FOS 43 | 26144545,Cut and fill,30_8_NABS_FOS 44 | 27206212,Theology,30_8_NABS_FOS 45 | 27362006,Gestalt psychology,30_8_NABS_FOS 46 | 32198211,Greenhouse,30_8_NABS_FOS 47 | 32198211,Greenhouse,30_8_NABS_FOS 48 | 32506930,Hegelianism,30_8_NABS_FOS 49 | 32772713,Charisma,30_8_NABS_FOS 50 | 33556824,Hydrogeology,30_8_NABS_FOS 51 | 35496372,Phloem,30_8_NABS_FOS 52 | 36248471,Seeding,30_8_NABS_FOS 53 | 37523158,Hadean,30_8_NABS_FOS 54 | 37523158,Hadean,30_8_NABS_FOS 55 | 37923429,Intensive farming,30_8_NABS_FOS 56 | 38304854,Manure,30_8_NABS_FOS 57 | 39511330,Logo,30_8_NABS_FOS 58 | 41062264,Digital broadcasting,30_8_NABS_FOS 59 | 41242791,Rock mass classification,30_8_NABS_FOS 60 | 42211076,Reliable Sources,30_8_NABS_FOS 61 | 42796848,Xenolith,30_8_NABS_FOS 62 | 42972112,Veterinary medicine,30_8_NABS_FOS 63 | 
43143990,Conidium,30_8_NABS_FOS 64 | 44938399,Petrogenesis,30_8_NABS_FOS 65 | 44938399,Petrogenesis,30_8_NABS_FOS 66 | 45292766,Bark,30_8_NABS_FOS 67 | 46328234,Organoleptic,30_8_NABS_FOS 68 | 46517748,Drift mining,30_8_NABS_FOS 69 | 46580973,Blackdamp,30_8_NABS_FOS 70 | 46610780,Hinduism,30_8_NABS_FOS 71 | 46757340,Poaceae,30_8_NABS_FOS 72 | 47136581,Agricultural machinery,30_8_NABS_FOS 73 | 48185193,Media,30_8_NABS_FOS 74 | 48189365,Hybrid,30_8_NABS_FOS 75 | 48743137,Organic matter,30_8_NABS_FOS 76 | 49799701,Xylem,30_8_NABS_FOS 77 | 50379869,Hermeneutics,30_8_NABS_FOS 78 | 50660011,Tropics,30_8_NABS_FOS 79 | 50682988,Rift,30_8_NABS_FOS 80 | 51151373,Mid-ocean ridge,30_8_NABS_FOS 81 | 51417038,Phenology,30_8_NABS_FOS 82 | 51832835,Environmental management system,30_8_NABS_FOS 83 | 52991690,Culling,30_8_NABS_FOS 84 | 53007507,Browning,30_8_NABS_FOS 85 | 53657456,Peat,30_8_NABS_FOS 86 | 54924851,Sustainable agriculture,30_8_NABS_FOS 87 | 55322685,Media conglomerate,30_8_NABS_FOS 88 | 55322685,Media conglomerate,30_8_NABS_FOS 89 | 55969652,photoperiodism,30_8_NABS_FOS 90 | 56859440,Bouguer anomaly,30_8_NABS_FOS 91 | 57664001,Agroecosystem,30_8_NABS_FOS 92 | 58097730,Subduction,30_8_NABS_FOS 93 | 58348228,Auteur theory,30_8_NABS_FOS 94 | 58625266,Lode,30_8_NABS_FOS 95 | 58911810,Radio broadcasting,30_8_NABS_FOS 96 | 60136833,Social media optimization,30_8_NABS_FOS 97 | 61783943,Luck,30_8_NABS_FOS 98 | 61968832,Animal husbandry,30_8_NABS_FOS 99 | 64476972,Sustainable Agriculture Innovation Network,30_8_NABS_FOS 100 | 66511971,Mining geology,30_8_NABS_FOS 101 | 66914385,Poultry farming,30_8_NABS_FOS 102 | 67236022,Mantle (geology),30_8_NABS_FOS 103 | 68163228,Radio Data System,30_8_NABS_FOS 104 | 71762439,Arable land,30_8_NABS_FOS 105 | 73440236,Psyche,30_8_NABS_FOS 106 | 74103781,Ornamental plant,30_8_NABS_FOS 107 | 74216064,Social computing,30_8_NABS_FOS 108 | 74256435,Flood myth,30_8_NABS_FOS 109 | 75296557,Husk,30_8_NABS_FOS 110 | 75639521,Field 
experiment,30_8_NABS_FOS 111 | 75639521,Field experiment,30_8_NABS_FOS 112 | 75699723,Buddhism,30_8_NABS_FOS 113 | 76960060,Umwelt,30_8_NABS_FOS 114 | 77757571,Multimedia Broadcast Multicast Service,30_8_NABS_FOS 115 | 77928131,Tectonics,30_8_NABS_FOS 116 | 79572550,Partial melting,30_8_NABS_FOS 117 | 81461190,Temperate climate,30_8_NABS_FOS 118 | 81959379,Broadcasting,30_8_NABS_FOS 119 | 83529365,Broadcast communication network,30_8_NABS_FOS 120 | 83559648,Croatian,30_8_NABS_FOS 121 | 83740816,Gibberellin,30_8_NABS_FOS 122 | 83948199,Incompatible element,30_8_NABS_FOS 123 | 84372278,Peridotite,30_8_NABS_FOS 124 | 85582077,Paddy field,30_8_NABS_FOS 125 | 85675897,Soil management,30_8_NABS_FOS 126 | 88862950,Irrigation,30_8_NABS_FOS 127 | 88862950,Irrigation,30_8_NABS_FOS 128 | 88972607,Human fertilization,30_8_NABS_FOS 129 | 91285054,Broadcast television systems,30_8_NABS_FOS 130 | 91304198,Hebrew,30_8_NABS_FOS 131 | 92047909,Hyperbolic function,30_8_NABS_FOS 132 | 92596616,Lineament,30_8_NABS_FOS 133 | 93011207,Geotechnical investigation,30_8_NABS_FOS 134 | 93746451,Mineral redox buffer,30_8_NABS_FOS 135 | 94236395,Stoping,30_8_NABS_FOS 136 | 97490223,Focal mechanism,30_8_NABS_FOS 137 | 97842125,Rock mechanics,30_8_NABS_FOS 138 | 100701293,Germination,30_8_NABS_FOS 139 | 101000010,Canopy,30_8_NABS_FOS 140 | 101293273,User-generated content,30_8_NABS_FOS 141 | 102044607,Adit,30_8_NABS_FOS 142 | 102523778,Form of the Good,30_8_NABS_FOS 143 | 103797069,Domestic sheep reproduction,30_8_NABS_FOS 144 | 104727253,Biological pest control,30_8_NABS_FOS 145 | 105462344,Nutrient management,30_8_NABS_FOS 146 | 108010975,Pruning,30_8_NABS_FOS 147 | 109162521,Soil conservation,30_8_NABS_FOS 148 | 109281948,Stratigraphy,30_8_NABS_FOS 149 | 110041135,Thrust fault,30_8_NABS_FOS 150 | 110157686,Broadcasting (networking),30_8_NABS_FOS 151 | 111021475,Protestantism,30_8_NABS_FOS 152 | 112077630,Irrigation management,30_8_NABS_FOS 153 | 112939947,Green Revolution,30_8_NABS_FOS 154 | 
113052830,Land degradation,30_8_NABS_FOS 155 | 113522999,Fall of man,30_8_NABS_FOS 156 | 113658590,Muck,30_8_NABS_FOS 157 | 114793014,Geomorphology,30_8_NABS_FOS 158 | 115930662,Shelf life,30_8_NABS_FOS 159 | 118518473,Agriculture,30_8_NABS_FOS 160 | 118817206,Organic farming,30_8_NABS_FOS 161 | 119452085,Commercial broadcasting,30_8_NABS_FOS 162 | 119477230,Plate tectonics,30_8_NABS_FOS 163 | 120217122,Precision agriculture,30_8_NABS_FOS 164 | 122690726,"Land use, land-use change and forestry",30_8_NABS_FOS 165 | 122959257,Seismic tomography,30_8_NABS_FOS 166 | 123963621,Integrated pest management,30_8_NABS_FOS 167 | 127200247,Hydraulic mining,30_8_NABS_FOS 168 | 127723449,Core–mantle boundary,30_8_NABS_FOS 169 | 128361363,Symbol,30_8_NABS_FOS 170 | 128383755,Agricultural productivity,30_8_NABS_FOS 171 | 128536511,History of religions,30_8_NABS_FOS 172 | 128758860,Woody plant,30_8_NABS_FOS 173 | 128954607,Tectonophysics,30_8_NABS_FOS 174 | 129225989,Cash crop,30_8_NABS_FOS 175 | 129454956,Field research,30_8_NABS_FOS 176 | 130979935,Ansatz,30_8_NABS_FOS 177 | 132215390,Abiotic component,30_8_NABS_FOS 178 | 133479454,Mycelium,30_8_NABS_FOS 179 | 133979268,Vision,30_8_NABS_FOS 180 | 134215735,Flock,30_8_NABS_FOS 181 | 136752280,Geodynamics,30_8_NABS_FOS 182 | 137580998,Crop,30_8_NABS_FOS 183 | 137580998,Crop,30_8_NABS_FOS 184 | 137607661,Land tenure,30_8_NABS_FOS 185 | 137660486,Growing season,30_8_NABS_FOS 186 | 137776501,Point of delivery,30_8_NABS_FOS 187 | 139496715,Deforestation,30_8_NABS_FOS 188 | 140230471,Planetary core,30_8_NABS_FOS 189 | 140441402,Carbonatite,30_8_NABS_FOS 190 | 141005173,Shifting cultivation,30_8_NABS_FOS 191 | 141282968,Plant ecology,30_8_NABS_FOS 192 | 141646446,Continental crust,30_8_NABS_FOS 193 | 142796444,Nutrient,30_8_NABS_FOS 194 | 143128703,Middle Ages,30_8_NABS_FOS 195 | 144027150,Horticulture,30_8_NABS_FOS 196 | 150152722,Judaism,30_8_NABS_FOS 197 | 150668497,Dry weight,30_8_NABS_FOS 198 | 150668497,Dry weight,30_8_NABS_FOS 199 
| 150772632,Arid,30_8_NABS_FOS 200 | 150999391,Terrane,30_8_NABS_FOS 201 | 152212766,The Republic,30_8_NABS_FOS 202 | 152972079,Plate reconstruction,30_8_NABS_FOS 203 | 154802760,Giant impact hypothesis,30_8_NABS_FOS 204 | 155030161,Mass communication,30_8_NABS_FOS 205 | 155785087,Natural law,30_8_NABS_FOS 206 | 155868670,Root system,30_8_NABS_FOS 207 | 156005406,Subsistence agriculture,30_8_NABS_FOS 208 | 156571341,Cyberpsychology,30_8_NABS_FOS 209 | 156663261,Agroecology,30_8_NABS_FOS 210 | 157140304,Agrarian society,30_8_NABS_FOS 211 | 157670687,Postharvest,30_8_NABS_FOS 212 | 159505674,Broadcasting of sports events,30_8_NABS_FOS 213 | 159719176,Engineering geology,30_8_NABS_FOS 214 | 159750122,Soil water,30_8_NABS_FOS 215 | 159789966,Lingua franca,30_8_NABS_FOS 216 | 160804572,Silicate perovskite,30_8_NABS_FOS 217 | 161176658,Pesticide,30_8_NABS_FOS 218 | 161221295,Plant physiology,30_8_NABS_FOS 219 | 161487207,Derecho,30_8_NABS_FOS 220 | 164105321,Catalan,30_8_NABS_FOS 221 | 164663123,Cosmos,30_8_NABS_FOS 222 | 166839181,Graben,30_8_NABS_FOS 223 | 167275870,Media system dependency theory,30_8_NABS_FOS 224 | 167752473,News values,30_8_NABS_FOS 225 | 167752473,News values,30_8_NABS_FOS 226 | 167919410,Metasomatism,30_8_NABS_FOS 227 | 168197293,Pollination,30_8_NABS_FOS 228 | 168568655,Medical entomology,30_8_NABS_FOS 229 | 168741863,Sowing,30_8_NABS_FOS 230 | 168741863,Sowing,30_8_NABS_FOS 231 | 169081014,Mysticism,30_8_NABS_FOS 232 | 172353545,Ripening,30_8_NABS_FOS 233 | 173419221,Crossbreed,30_8_NABS_FOS 234 | 173853756,Dialog box,30_8_NABS_FOS 235 | 175181221,Prospecting,30_8_NABS_FOS 236 | 175760724,Crop rotation,30_8_NABS_FOS 237 | 178165689,Inflorescence,30_8_NABS_FOS 238 | 178408851,Content marketing,30_8_NABS_FOS 239 | 179158327,Palaeogeography,30_8_NABS_FOS 240 | 179974421,Rock bolt,30_8_NABS_FOS 241 | 180362636,Thrust tectonics,30_8_NABS_FOS 242 | 180903884,Rationalism,30_8_NABS_FOS 243 | 182744844,Metaphysics,30_8_NABS_FOS 244 | 183135511,Natural 
resource management,30_8_NABS_FOS 245 | 183282558,Kimberlite,30_8_NABS_FOS 246 | 183384803,Automatic dependent surveillance-broadcast,30_8_NABS_FOS 247 | 183889291,Crop residue,30_8_NABS_FOS 248 | 184977646,Open-pit mining,30_8_NABS_FOS 249 | 185476388,Cotyledon,30_8_NABS_FOS 250 | 186096623,Ground pressure,30_8_NABS_FOS 251 | 186857363,Siege,30_8_NABS_FOS 252 | 189797535,Drought tolerance,30_8_NABS_FOS 253 | 192039558,Biofertilizer,30_8_NABS_FOS 254 | 193605714,Steam shovel,30_8_NABS_FOS 255 | 194105502,Biblical studies,30_8_NABS_FOS 256 | 194775826,Herd,30_8_NABS_FOS 257 | 195081551,Ultramafic rock,30_8_NABS_FOS 258 | 196227537,Broadcast transmitter,30_8_NABS_FOS 259 | 196690852,Social analytics,30_8_NABS_FOS 260 | 197321923,Cultivar,30_8_NABS_FOS 261 | 197321923,Cultivar,30_8_NABS_FOS 262 | 199007388,Diapir,30_8_NABS_FOS 263 | 201280247,Newspaper,30_8_NABS_FOS 264 | 201867031,Continental margin,30_8_NABS_FOS 265 | 202050865,Hectare,30_8_NABS_FOS 266 | 207469975,Shaft mining,30_8_NABS_FOS 267 | 207581243,Agrochemical,30_8_NABS_FOS 268 | 502990516,Agricultural land,30_8_NABS_FOS 269 | 503923677,Social web,30_8_NABS_FOS 270 | 507981020,Agricultural education,30_8_NABS_FOS 271 | 510538283,Phosphorus,30_8_NABS_FOS 272 | 512654426,Public domain,30_8_NABS_FOS 273 | 513193947,Fodder,30_8_NABS_FOS 274 | 518677369,Social media,30_8_NABS_FOS 275 | 520681616,Digital television,30_8_NABS_FOS 276 | 521751864,Christian ministry,30_8_NABS_FOS 277 | 523966790,Animal welfare,30_8_NABS_FOS 278 | 529147693,News media,30_8_NABS_FOS 279 | 530479602,Opera,30_8_NABS_FOS 280 | 534701709,Old Testament,30_8_NABS_FOS 281 | 540442320,Pest control,30_8_NABS_FOS 282 | 540442320,Pest control,30_8_NABS_FOS 283 | 543192267,Magic (paranormal),30_8_NABS_FOS 284 | 549605437,Food security,30_8_NABS_FOS 285 | 549698073,Structure of the Earth,30_8_NABS_FOS 286 | 551968917,Christianity,30_8_NABS_FOS 287 | 556509198,Public broadcasting,30_8_NABS_FOS 288 | 558299567,Mass media,30_8_NABS_FOS 289 | 
559400886,Land management,30_8_NABS_FOS 290 | 2775835988,Grassland,30_8_NABS_FOS 291 | 2775858120,Memoria,30_8_NABS_FOS 292 | 2775891814,Weed,30_8_NABS_FOS 293 | 2775898560,Common Agricultural Policy,30_8_NABS_FOS 294 | 2775944640,Utopia,30_8_NABS_FOS 295 | 2775976403,Aphid,30_8_NABS_FOS 296 | 2775997990,Narrowcasting,30_8_NABS_FOS 297 | 2776050585,Scrutiny,30_8_NABS_FOS 298 | 2776082042,Vulpes,30_8_NABS_FOS 299 | 2776096895,Seedling,30_8_NABS_FOS 300 | 2776134716,Sacrifice,30_8_NABS_FOS 301 | 2776184289,Farallon Plate,30_8_NABS_FOS 302 | 2776211767,Doctrine,30_8_NABS_FOS 303 | 2776222705,Wild boar,30_8_NABS_FOS 304 | 2776242653,Pepper,30_8_NABS_FOS 305 | 2776247511,Zebu,30_8_NABS_FOS 306 | 2776286235,Phaseolus,30_8_NABS_FOS 307 | 2776305542,Problema,30_8_NABS_FOS 308 | 2776313748,Broadcast range,30_8_NABS_FOS 309 | 2776327621,Flesh,30_8_NABS_FOS 310 | 2776347870,Passions,30_8_NABS_FOS 311 | 2776373379,Chlorophyll,30_8_NABS_FOS 312 | 2776405206,Revelation,30_8_NABS_FOS 313 | 2776451879,Infestation,30_8_NABS_FOS 314 | 2776474821,Mushroom,30_8_NABS_FOS 315 | 2776475172,Soil quality,30_8_NABS_FOS 316 | 2776482104,Breed,30_8_NABS_FOS 317 | 2776521926,Brahman,30_8_NABS_FOS 318 | 2776527531,Persian,30_8_NABS_FOS 319 | 2776585538,Agenda-setting theory,30_8_NABS_FOS 320 | 2776629827,Panasqueira,30_8_NABS_FOS 321 | 2776632002,Legume,30_8_NABS_FOS 322 | 2776632002,Legume,30_8_NABS_FOS 323 | 2776684731,Garcia,30_8_NABS_FOS 324 | 2776698055,Crust,30_8_NABS_FOS 325 | 2776727279,Heaven,30_8_NABS_FOS 326 | 2776747608,Brassica,30_8_NABS_FOS 327 | 2776757517,Newspaper digitization,30_8_NABS_FOS 328 | 2776760134,Gold mining,30_8_NABS_FOS 329 | 2776763651,Ferropericlase,30_8_NABS_FOS 330 | 2776797426,Biogeosciences,30_8_NABS_FOS 331 | 2776847985,Single-frequency network,30_8_NABS_FOS 332 | 2776892586,Brand engagement,30_8_NABS_FOS 333 | 2776908094,Anthelmintic,30_8_NABS_FOS 334 | 2776911728,Courage,30_8_NABS_FOS 335 | 2776915394,Customer engagement,30_8_NABS_FOS 336 | 
2776932993,Ethos,30_8_NABS_FOS 337 | 2776960312,Hock,30_8_NABS_FOS 338 | 2776973623,Legal deposit,30_8_NABS_FOS 339 | 2776977481,Dairy cattle,30_8_NABS_FOS 340 | 2777108408,Sugar,30_8_NABS_FOS 341 | 2777114023,Withers,30_8_NABS_FOS 342 | 2777122596,Praxis,30_8_NABS_FOS 343 | 2777146433,Badger,30_8_NABS_FOS 344 | 2777151259,Mallophaga,30_8_NABS_FOS 345 | 2777178263,Land reform,30_8_NABS_FOS 346 | 2777199308,Louse,30_8_NABS_FOS 347 | 2777201227,Overburden,30_8_NABS_FOS 348 | 2777222677,Worship,30_8_NABS_FOS 349 | 2777222942,Corriedale,30_8_NABS_FOS 350 | 2777225262,Veterinary pathology,30_8_NABS_FOS 351 | 2777239683,Virtue,30_8_NABS_FOS 352 | 2777257828,Virtual archaeology,30_8_NABS_FOS 353 | 2777370179,Social media measurement,30_8_NABS_FOS 354 | 2777425756,Gold panning,30_8_NABS_FOS 355 | 2777438998,Tribunal,30_8_NABS_FOS 356 | 2777452754,TV-Anytime,30_8_NABS_FOS 357 | 2777461220,Germplasm,30_8_NABS_FOS 358 | 2777474537,Pheasant,30_8_NABS_FOS 359 | 2777477151,Prayer,30_8_NABS_FOS 360 | 2777480983,USArray,30_8_NABS_FOS 361 | 2777481183,Market access,30_8_NABS_FOS 362 | 2777499811,Ivermectin,30_8_NABS_FOS 363 | 2777514068,Broadcast band,30_8_NABS_FOS 364 | 2777582232,CONTEST,30_8_NABS_FOS 365 | 2777612826,Insect,30_8_NABS_FOS 366 | 2777617010,Mainstream,30_8_NABS_FOS 367 | 2777695277,DVB-H,30_8_NABS_FOS 368 | 2777776507,Lexico,30_8_NABS_FOS 369 | 2777786777,Flea,30_8_NABS_FOS 370 | 2777835648,Filter bubble,30_8_NABS_FOS 371 | 2777904157,Grazing,30_8_NABS_FOS 372 | 2777963300,Ovis,30_8_NABS_FOS 373 | 2777976947,Eimeria,30_8_NABS_FOS 374 | 2777992645,North American Plate,30_8_NABS_FOS 375 | 2777994876,Pacific Plate,30_8_NABS_FOS 376 | 2778002360,Rump,30_8_NABS_FOS 377 | 2778052875,Bildung,30_8_NABS_FOS 378 | 2778053677,Pasture,30_8_NABS_FOS 379 | 2778134537,Domestic pig,30_8_NABS_FOS 380 | 2778136425,Struthio,30_8_NABS_FOS 381 | 2778143190,Minnesota Geological Survey,30_8_NABS_FOS 382 | 2778157034,Sorghum,30_8_NABS_FOS 383 | 2778157034,Sorghum,30_8_NABS_FOS 384 | 
2778182169,Jako,30_8_NABS_FOS 385 | 2778226015,Capreolus,30_8_NABS_FOS 386 | 2778261408,Eurasian Plate,30_8_NABS_FOS 387 | 2778375701,Ground stone,30_8_NABS_FOS 388 | 2778402112,Agricultural extension,30_8_NABS_FOS 389 | 2778412320,iPhoneography,30_8_NABS_FOS 390 | 2778452349,Rural poverty,30_8_NABS_FOS 391 | 2778471503,Basin and Range Province,30_8_NABS_FOS 392 | 2778524612,Mining law,30_8_NABS_FOS 393 | 2778553611,1seg,30_8_NABS_FOS 394 | 2778691696,Dairy farming,30_8_NABS_FOS 395 | 2778692574,Faith,30_8_NABS_FOS 396 | 2778729106,Social media analytics,30_8_NABS_FOS 397 | 2778738651,Novelty,30_8_NABS_FOS 398 | 2778749970,Conditional access,30_8_NABS_FOS 399 | 2778761015,Solanaceae,30_8_NABS_FOS 400 | 2778802261,Orthodoxy,30_8_NABS_FOS 401 | 2778838397,Uses and gratifications theory,30_8_NABS_FOS 402 | 2778839144,Medical geology,30_8_NABS_FOS 403 | 2778852317,Agricultural policy,30_8_NABS_FOS 404 | 2778856526,Cow-calf,30_8_NABS_FOS 405 | 2778871292,Social television,30_8_NABS_FOS 406 | 2778877831,Cryptosporidium,30_8_NABS_FOS 407 | 2778882853,Phlogopite,30_8_NABS_FOS 408 | 2778896172,Manifesto,30_8_NABS_FOS 409 | 2778983918,Wife,30_8_NABS_FOS 410 | 2778985329,ISDB,30_8_NABS_FOS 411 | 2779021329,Destiny,30_8_NABS_FOS 412 | 2779056648,International broadcasting,30_8_NABS_FOS 413 | 2779070535,Street gutter,30_8_NABS_FOS 414 | 2779081413,AM stereo,30_8_NABS_FOS 415 | 2779096232,Hydraulic fracturing,30_8_NABS_FOS 416 | 2779103253,Duty,30_8_NABS_FOS 417 | 2779106878,Digital audio broadcasting,30_8_NABS_FOS 418 | 2779111255,History of journalism,30_8_NABS_FOS 419 | 2779113645,Like button,30_8_NABS_FOS 420 | 2779197568,Sunflower,30_8_NABS_FOS 421 | 2779213998,Mobile television,30_8_NABS_FOS 422 | 2779220025,Peasant,30_8_NABS_FOS 423 | 2779329348,Feedlot,30_8_NABS_FOS 424 | 2779370140,Forage,30_8_NABS_FOS 425 | 2779371384,Biomass,30_8_NABS_FOS 426 | 2779422593,Soil crust,30_8_NABS_FOS 427 | 2779429622,Litter,30_8_NABS_FOS 428 | 2779438500,Honor,30_8_NABS_FOS 429 | 
2779438827,Television station,30_8_NABS_FOS 430 | 2779461089,FM broadcasting,30_8_NABS_FOS 431 | 2779546711,Penny press,30_8_NABS_FOS 432 | 2779552062,Roe deer,30_8_NABS_FOS 433 | 2779557943,Canis,30_8_NABS_FOS 434 | 2779587293,Straw,30_8_NABS_FOS 435 | 2779620486,Tick,30_8_NABS_FOS 436 | 2779678110,Fungus,30_8_NABS_FOS 437 | 2779728303,Pride,30_8_NABS_FOS 438 | 2779742380,Gabion,30_8_NABS_FOS 439 | 2779742664,Broadcast address,30_8_NABS_FOS 440 | 2779795913,Hurrying,30_8_NABS_FOS 441 | 2779824472,Herb,30_8_NABS_FOS 442 | 2779829227,Vitality,30_8_NABS_FOS 443 | 2779867292,African Plate,30_8_NABS_FOS 444 | 2779867394,Bubalus,30_8_NABS_FOS 445 | 2779880937,Whinstone,30_8_NABS_FOS 446 | 2779883265,Electronic program guide,30_8_NABS_FOS 447 | 2779885849,Milking,30_8_NABS_FOS 448 | 2779914258,Nili-Ravi,30_8_NABS_FOS 449 | 2779944825,Headlinese,30_8_NABS_FOS 450 | 2779980370,Magma ocean,30_8_NABS_FOS 451 | 2779980370,Magma ocean,30_8_NABS_FOS 452 | 2780043312,Hydraulic fill,30_8_NABS_FOS 453 | 2780054949,Spinach,30_8_NABS_FOS 454 | 2780079832,Digital multimedia broadcasting,30_8_NABS_FOS 455 | 2780117336,Farm income,30_8_NABS_FOS 456 | 2780138947,Dry matter,30_8_NABS_FOS 457 | 2780284631,Sire,30_8_NABS_FOS 458 | 2780310893,Passion,30_8_NABS_FOS 459 | 2780323295,Cervus,30_8_NABS_FOS 460 | 2780356177,Baltic Shield,30_8_NABS_FOS 461 | 2780356177,Baltic Shield,30_8_NABS_FOS 462 | 2780414537,Maple,30_8_NABS_FOS 463 | 2780415144,SAINT,30_8_NABS_FOS 464 | 2780422510,Humanity,30_8_NABS_FOS 465 | 2780441040,Slacktivism,30_8_NABS_FOS 466 | 2780460740,Jackal,30_8_NABS_FOS 467 | 2780487972,Veterinary parasitology,30_8_NABS_FOS 468 | 2780505807,Beef cattle,30_8_NABS_FOS 469 | 2780556036,South Pole–Aitken basin,30_8_NABS_FOS 470 | 2780563676,Aroma,30_8_NABS_FOS 471 | 2780564743,Social CRM,30_8_NABS_FOS 472 | 2780580889,Panorama,30_8_NABS_FOS 473 | 2780618852,Pollen,30_8_NABS_FOS 474 | 2780695499,Social media mining,30_8_NABS_FOS 475 | 2780698354,Broadcast quality,30_8_NABS_FOS 476 | 
2780710533,Governo,30_8_NABS_FOS 477 | 2780719635,Flavor,30_8_NABS_FOS 478 | 2780727426,Awassi,30_8_NABS_FOS 479 | 2780739461,Compost,30_8_NABS_FOS 480 | 2780739461,Compost,30_8_NABS_FOS 481 | 2780756850,News design,30_8_NABS_FOS 482 | 2780818791,Teletext,30_8_NABS_FOS 483 | 2780822299,Soul,30_8_NABS_FOS 484 | 2780942940,Stishovite,30_8_NABS_FOS 485 | 2780968714,Mange,30_8_NABS_FOS 486 | 2780997048,Digital footprint,30_8_NABS_FOS 487 | 2781079927,Dimension stone,30_8_NABS_FOS 488 | 2781121916,Bow drill,30_8_NABS_FOS 489 | 2781179785,Valencia,30_8_NABS_FOS 490 | 2781191505,DAB ensemble,30_8_NABS_FOS 491 | 2781207809,South American Plate,30_8_NABS_FOS 492 | 2781287369,Stuttgart,30_8_NABS_FOS 493 | 2781343547,Media Practice Model,30_8_NABS_FOS 494 | 2781354396,Enthusiasm,30_8_NABS_FOS 495 | 2781368420,Biosecurity,30_8_NABS_FOS 496 | 2781384534,Gospel,30_8_NABS_FOS 497 | 2781390083,Pinctada fucata,30_8_NABS_FOS 498 | 2908605944,Meleagris gallopavo,30_8_NABS_FOS 499 | 2908982167,Cattle Diseases,30_8_NABS_FOS 500 | 2909031412,Gallus gallus domesticus,30_8_NABS_FOS 501 | 2909086881,Stone quarry,30_8_NABS_FOS 502 | 2909619495,Food animal,30_8_NABS_FOS 503 | 2909623323,Waste Dumps,30_8_NABS_FOS 504 | 2909642594,Shaft (site),30_8_NABS_FOS 505 | 2909771501,Goat Diseases,30_8_NABS_FOS 506 | 2909895380,Guinea fowl,30_8_NABS_FOS 507 | 2910164855,Logging car,30_8_NABS_FOS 508 | 2910250570,Newspapers as Topic,30_8_NABS_FOS 509 | 2910477778,DUMP formation,30_8_NABS_FOS 510 | 2910514300,Root stones,30_8_NABS_FOS 511 | 2910534252,Newsclipping,30_8_NABS_FOS 512 | 2910651670,Bird Diseases,30_8_NABS_FOS 513 | 2910697619,Hearing analyzer,30_8_NABS_FOS 514 | 2910921642,Mineral industries,30_8_NABS_FOS 515 | 2910990604,Lama glama,30_8_NABS_FOS 516 | 2911060314,Laboratory Animal Science,30_8_NABS_FOS 517 | 2911132530,Mine surveyor,30_8_NABS_FOS 518 | 2911210907,Support pressure,30_8_NABS_FOS 519 | 2982719622,Video broadcast,30_8_NABS_FOS 520 | 2982966219,Plant growth,30_8_NABS_FOS 521 | 
2982966219,Plant growth,30_8_NABS_FOS 522 | 2984029112,Social mining,30_8_NABS_FOS 523 | 2984157484,Mining industry,30_8_NABS_FOS 524 | 2984608069,Wireless broadcast,30_8_NABS_FOS 525 | 2984648278,Ambient awareness,30_8_NABS_FOS 526 | 2985624630,Broadcasting system,30_8_NABS_FOS 527 | 2985692548,Crisis informatics,30_8_NABS_FOS 528 | 2985711970,Mobile broadcast,30_8_NABS_FOS 529 | 2985889538,Social media marketing,30_8_NABS_FOS 530 | 2986347997,Wireless broadcasting,30_8_NABS_FOS 531 | 2986426982,Social stream,30_8_NABS_FOS 532 | 2986444337,Near video on demand,30_8_NABS_FOS 533 | 2987005673,Broadcast scheduling,30_8_NABS_FOS 534 | 2987034934,Earth crust,30_8_NABS_FOS 535 | 2987043902,Broadcasting algorithms,30_8_NABS_FOS 536 | 2987325470,Social commerce,30_8_NABS_FOS 537 | 2987348774,Broadcast data,30_8_NABS_FOS 538 | 2987376390,Electronic word of mouth,30_8_NABS_FOS 539 | 2987442367,Tv viewer,30_8_NABS_FOS 540 | 2987586235,Multimedia broadcasting,30_8_NABS_FOS 541 | 2987800000,Crisis mapping,30_8_NABS_FOS 542 | 2988327197,Online harassment,30_8_NABS_FOS 543 | 2988338654,News sharing,30_8_NABS_FOS 544 | 2988529969,Cold storage,30_8_NABS_FOS 545 | 2988547615,Speech summarization,30_8_NABS_FOS 546 | 2988622424,Social multimedia,30_8_NABS_FOS 547 | 2988676352,Rural development,30_8_NABS_FOS 548 | 2988833398,Social event detection,30_8_NABS_FOS 549 | 2988996608,Online engagement,30_8_NABS_FOS 550 | 2989393167,Personal learning network,30_8_NABS_FOS 551 | 2989409935,Crop production,30_8_NABS_FOS 552 | 2989465874,Broadcast channels,30_8_NABS_FOS 553 | 2991660179,Broadcast packet,30_8_NABS_FOS 554 | 2991667299,Capra hircus,30_8_NABS_FOS 555 | 2991862235,Animal health,30_8_NABS_FOS 556 | 2991870026,Social media network,30_8_NABS_FOS 557 | 2991920864,Community radio,30_8_NABS_FOS 558 | 2991922516,Rock pressure,30_8_NABS_FOS 559 | 2992067306,Mineral potential,30_8_NABS_FOS 560 | 2992202738,Mobile broadcasting,30_8_NABS_FOS 561 | 2992211155,Grain yield,30_8_NABS_FOS 562 | 
2992306869,Satellite television,30_8_NABS_FOS 563 | 2992330363,Urban geology,30_8_NABS_FOS 564 | 2992406196,Waste dump,30_8_NABS_FOS 565 | 2992407798,Iron mining,30_8_NABS_FOS 566 | 2992481583,Broadcast time,30_8_NABS_FOS 567 | 2992547679,Land mine,30_8_NABS_FOS 568 | 2992637229,Michel foucault,30_8_NABS_FOS 569 | 2992647939,Consumer engagement,30_8_NABS_FOS 570 | 2992730755,Agricultural development,30_8_NABS_FOS 571 | 2992974802,Geological exploration,30_8_NABS_FOS 572 | 2992981300,Salt deposit,30_8_NABS_FOS 573 | 2992990004,Solid rock,30_8_NABS_FOS 574 | 2993003885,Land area,30_8_NABS_FOS 575 | 2993054622,Core formation,30_8_NABS_FOS 576 | 2993102984,Mine planning,30_8_NABS_FOS 577 | 2993134977,Mineral deposit,30_8_NABS_FOS 578 | 2993139054,Water buffalo,30_8_NABS_FOS 579 | 2993172631,Social news,30_8_NABS_FOS 580 | 2993199473,Plant biochemistry,30_8_NABS_FOS 581 | 2993240939,Online activism,30_8_NABS_FOS 582 | 2993252152,Rock body,30_8_NABS_FOS 583 | 2993273313,Chemical control,30_8_NABS_FOS 584 | 2993323123,Rock slope,30_8_NABS_FOS 585 | 2993426613,Arabic sentiment analysis,30_8_NABS_FOS 586 | 2993437602,Rock structure,30_8_NABS_FOS 587 | 2993492720,Gold production,30_8_NABS_FOS 588 | 2993527706,Salt mine,30_8_NABS_FOS 589 | 2993531722,Zea mays,30_8_NABS_FOS 590 | 2993555337,Personal branding,30_8_NABS_FOS 591 | 2993676337,Broadcast transmission,30_8_NABS_FOS 592 | 2993808335,Seismic velocity,30_8_NABS_FOS 593 | 2993865493,Social data analytics,30_8_NABS_FOS 594 | 2994012208,Upper crust,30_8_NABS_FOS 595 | 2994012208,Upper crust,30_8_NABS_FOS 596 | 2994104004,Broadcast service,30_8_NABS_FOS 597 | 2994289516,Geological investigation,30_8_NABS_FOS 598 | 2994381574,Broadcast system,30_8_NABS_FOS 599 | 2994460426,Pig farms,30_8_NABS_FOS 600 | 2994466296,Radio program,30_8_NABS_FOS 601 | 2994534981,Television channel,30_8_NABS_FOS 602 | 2994537864,Human medicine,30_8_NABS_FOS 603 | 3017754109,Companion animal,30_8_NABS_FOS 604 | 3017803470,Crust 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 3 18:37:11 2020

@author: lukas-pkl

Sample call to the OSDG API.

To use the API, please download and run the docker container first.

In bash:

    docker pull technoteai/osdg
    docker run --name my-open-sdg -p 5000:5000 technoteai/osdg:latest

The container then listens on http://localhost:5000.
"""

import requests


# The text to classify is sent as the form field "query".
data = { 'query': """Using satellite data on deforestation and weather in Malawi and
linking those datasets with household survey datasets, we estimate the causal
effect of deforestation on access to clean drinking water. In the existing
literature on forest science and hydrology, the consensus is that
deforestation increases water yield. In this study, we directly examine the
causal effect of deforestation on households’ access to clean drinking water.
Results of the two-stage least-squares (2SLS) with cluster and time fixed-effect
estimations illustrate strong empirical evidence that deforestation decreases
access to clean drinking water. Falsification tests show that the possibility of
our instrumental variable picking up an unobserved time trend is very unlikely.
We find that a 1.0-percentage-point increase in deforestation decreases access
to clean drinking water by 0.93 percentage points. With this estimated impact,
deforestation in the last decade in Malawi (14%) has had the same magnitude of
effect on access to clean drinking water as that of a 9% decrease in rainfall.
""" }


# Without a timeout, requests waits forever if the container is not answering.
response = requests.post('http://localhost:5000/search', data=data, timeout=60)
# Fail loudly on a 4xx/5xx answer instead of treating an error page as a result.
response.raise_for_status()

# Raw response body as returned by the service.
result = response.text
# English stop words, including contraction fragments in three apostrophe
# styles (', ‘ and ’).
# NOTE(review): this looks like spaCy's English stop-word list - confirm
# before re-syncing it with an upstream source.
# Written as a set literal so no intermediate list is built.
sws = {
    'ourselves', 'should', 'often', 'does', 'this', 'beside', 'well',
    'among', 'throughout', 'being', 'become', 'yourselves', 'namely',
    'whom', 'nothing', 'thus', 'many', '’re', 'had', 'somewhere', 'made',
    'still', "'re", 'eight', 'of', 'yours', 'further', 'again', 'by',
    'anyhow', 'whenever', 'both', 'first', 'third', 'whither', 'all',
    'whether', 'amount', 'afterwards', 'alone', 'she', 'where', 'seemed',
    'something', 'mine', 'whatever', 'most', 'doing', 'behind',
    'thereupon', 'whole', 'hers', 'ca', 'a', 'before', 'forty', '’d',
    '‘s', 'three', 'anything', 'via', 'hereafter', 'him', 'as', 'those',
    'here', 'around', '’ve', 'much', 'some', 'whereas', 'several', 'has',
    'done', 'besides', 'am', 'hereby', '‘d', 'yet', 'make', 'none',
    'while', 'just', 'towards', 'sometimes', 'his', 'into', 'various',
    'their', 'thence', 'so', 'either', 'about', 'once', 'onto', 'thru',
    "'m", 'one', 'seems', 'between', 'say', 'mostly', 'otherwise',
    'herself', 'might', 'and', 'least', 'did', 'hence', 'any', 'do',
    'each', 'whereupon', 'becoming', 'thereby', "'ll", 'two', 'yourself',
    'these', 'through', 'four', "'s", 'last', 'on', 'along', 'could',
    "n't", 'front', 'not', 'quite', '’m', 'at', 'he', 'ten', 'very',
    'himself', 'although', 'now', 'it', 'move', 'bottom', 'within',
    'can', 'sometime', 'out', 'elsewhere', 'empty', 'such', 'after',
    'seeming', 'put', 'us', 'upon', 'please', 'used', 'except', 'n‘t',
    'ours', 'six', 'though', 'without', 'why', 'however', 'above',
    'herein', 'else', 'them', 'formerly', 'since', 'take', 'beyond',
    'whence', 'n’t', 'been', 'nor', 'wherever', 'everywhere', 'hundred',
    'but', 'latterly', 'really', 'is', 'with', 'hereupon', 'we',
    'someone', 'whereby', 'in', 'because', 'latter', 'eleven', 'serious',
    'twenty', 'name', 'may', 'itself', 'to', 'there', "'ve", 'whereafter',
    'ever', 'perhaps', 'everyone', 'sixty', 'seem', 'which', 'almost',
    'anywhere', 'the', 'wherein', 'its', 'cannot', 'keep', 'twelve',
    'moreover', 'they', 'more', 'regarding', 'next', 'you', 'your',
    'own', 'enough', 'side', 're', 'neither', 'have', 'during', 'under',
    'will', 'would', 'over', 'therein', 'became', 'beforehand', 'using',
    'part', 'my', 'that', 'themselves', '’ll', 'myself', 'somehow',
    'together', 'top', 'from', 'then', 'are', 'give', 'back', 'less',
    'always', 'never', 'becomes', 'until', "'d", 'go', 'i', 'whose',
    'below', 'former', 'our', 'be', 'even', 'due', 'fifteen', 'every',
    'than', 'rather', 'how', 'an', 'across', '‘ve', 'another', 'must',
    'noone', 'against', '’s', 'others', 'per', 'already', 'off', 'too',
    'was', 'when', 'also', 'other', 'therefore', 'see', 'up', 'indeed',
    'what', '‘re', 'down', 'nobody', 'everything', 'whoever', 'five',
    'me', 'nevertheless', 'toward', 'same', 'meanwhile', 'call', 'if',
    'anyone', 'or', 'nowhere', 'were', 'unless', 'get', 'nine', 'her',
    'for', '‘ll', 'who', 'fifty', 'few', 'only', 'anyway', 'no',
    'amongst', 'show', '‘m', 'full', 'thereafter',
}
def levenshtein_ratio(s, t):
    """Return the Levenshtein similarity ratio of two strings.

    The edit distance is computed with a cost of 1 for an insertion or a
    deletion and a cost of 2 for a substitution, which aligns the result
    with the ratio of the Python Levenshtein package. The ratio is then

        (len(s) + len(t) - distance) / (len(s) + len(t))

    so identical strings give 1.0 and strings with nothing in common
    give 0.0.

    Adapted from:
    https://www.datacamp.com/community/tutorials/fuzzy-string-python

    Args:
        s: First string.
        t: Second string.

    Returns:
        The similarity ratio as a float between 0.0 and 1.0. Two empty
        strings are treated as identical (1.0); exactly one empty string
        gives 0.0.
    """
    total_length = len(s) + len(t)
    if total_length == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return 1.0

    # previous[j] holds the distance between the characters of s consumed
    # so far and the first j characters of t. Only the previous row is ever
    # needed, so the full (len(s)+1) x (len(t)+1) matrix is not stored.
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, start=1):
        # Column 0: turning the first i characters of s into "" costs i.
        current = [i]
        for j, t_char in enumerate(t, start=1):
            cost = 0 if s_char == t_char else 2
            current.append(min(
                previous[j] + 1,            # Cost of deletions
                current[j - 1] + 1,         # Cost of insertions
                previous[j - 1] + cost))    # Cost of substitutions
        previous = current

    # The last cell is the distance between the complete strings. Reading it
    # from the row (not from leaked loop variables) also works when s or t
    # is empty.
    return (total_length - previous[-1]) / total_length
def process_fosname(string):
    """Normalize a FOS (field of study) name.

    The name is lower-cased, hyphens become spaces, every character other
    than a-z, 0-9 and space is dropped, runs of spaces are collapsed to a
    single space and leading/trailing spaces are removed.

    Args:
        string: The raw FOS name.

    Returns:
        The normalized name. It is the empty string when no character of
        the input survives the filtering.
    """
    # The full alphabet: the letter "q" has to be kept as well, otherwise
    # e.g. "Soil quality" would be normalized to "soil uality".
    good_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    lowered = string.lower().replace("-", " ")
    kept = "".join(char for char in lowered if char in good_chars)
    # split()/join collapses every run of spaces and trims both ends in one
    # pass, and is safe on an empty result (indexing [-1] / [0] was not).
    return " ".join(kept.split())


def sdg_label_sort(sdg_label):
    """Return a sort key for an SDG label.

    Args:
        sdg_label: Label string, e.g. "SDG_13".

    Returns:
        The first run of digits found in the label as an int, or the label
        itself, unchanged, when it contains no digit.
        NOTE(review): the key is therefore an int for some labels and a str
        for others; sorting a collection that mixes both kinds raises
        TypeError on Python 3 - confirm that callers never mix them.
    """
    first_number = re.search(r'\d+', sdg_label)
    if first_number is None:
        return sdg_label
    return int(first_number.group())