├── .gitignore
├── AssemblingOntology.py
├── AssemblingTerms.py
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── FOSMAP.json
├── FOSMAP_700.json
├── InterimTerms.json
├── LICENSE
├── MatchedFOS.json
├── Methodology.md
├── OSDG-Ontology.json
├── OSDG-Ontology.xlsx
├── OSDG-Ontology_ver-min-1.json
├── OSDG_DATA_SOURCES.md
├── README.md
├── comparison_fos_update.xlsx
├── images
├── Methodology-visual_0511_Updated.png
├── OSDG.png
└── OSDG_new.png
├── raw_data
├── 0_add
│ ├── 00_add_validated
│ │ ├── 0_PuigOntology
│ │ │ ├── 0_ProcessedKeyTerms.json
│ │ │ ├── 0_process_key_terms.py
│ │ │ └── Ontology.csv
│ │ └── 6_SDGIO_terms
│ │ │ ├── 6_ProcessedKeyTerms.json
│ │ │ ├── 6_process_key_terms.py
│ │ │ └── SDG Terms by Indicator.xlsx
│ ├── 01_add_generated
│ │ ├── 1_FP7-4-SD_edited
│ │ │ ├── 1_ProcessedKeyTerms.json
│ │ │ ├── 1_process_key_terms.py
│ │ │ ├── FOSMAP.json
│ │ │ ├── NewWU.json
│ │ │ ├── ProjectFOS.json
│ │ │ ├── WU_projectSDGs.json
│ │ │ └── bad_fos.py
│ │ ├── 2_LinkedSDG_Concepts
│ │ │ ├── 2_ProcessedKeyTerms.json
│ │ │ ├── 2_process_key_terms.py
│ │ │ └── LinkedSDG_Data.xlsx
│ │ ├── 3_SDGPathfinder_DocumentConcepts
│ │ │ ├── 3_ProcessedKeyTerms.json
│ │ │ ├── 3_process_key_terms.py
│ │ │ └── OECD_SDG_betas.xlsx
│ │ ├── 4_SDGPathfinder_Keywords
│ │ │ ├── 4_ProcessedKeyTerms.json
│ │ │ ├── 4_process_key_terms.py
│ │ │ └── keywords.csv
│ │ ├── 5_LinkedSDG_DocumentExtracts
│ │ │ ├── 5_ProcessedKeyTerms.json
│ │ │ ├── 5_process_key_terms.py
│ │ │ └── LinkedSDG_DocumentExtracts.xlsx
│ │ ├── 7_EC_Policy_Doc_Terms
│ │ │ ├── 7_ProcessedKeyTerms.json
│ │ │ ├── 7_process_key_terms.py
│ │ │ └── ECPolicyDocs_Ngrams REVISED.xlsx
│ │ └── 9_SIRIS_Science4SDGs
│ │ │ ├── 9_ProcessedKeyTerms.json
│ │ │ ├── 9_process_key_terms.py
│ │ │ ├── sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx
│ │ │ └── sdg_vocabulary_V1.2 [zenodo].xlsx
│ ├── 02_add_all_to_all
│ │ ├── 10_PPMI_boost
│ │ │ ├── 10_ProcessedFOS.json
│ │ │ ├── 10_process_fos.py
│ │ │ ├── SDG FOS updated 06 01.xlsx
│ │ │ └── SDG FOS updated 06 12.xlsx
│ │ └── 8_NABS_FOS
│ │ │ ├── 8_ProcessedFOS.json
│ │ │ ├── 8_process_fos.py
│ │ │ └── NABS_FOS_update_2020-08-20_ed_VS.xlsx
│ ├── GeneratedSdgTerms.json
│ └── ValidatedSdgTerms.json
├── 1_replace
│ ├── 11_TJL-24_review
│ │ ├── 11_ReplaceFOS.json
│ │ ├── 11_process_replace_fos.py
│ │ └── osdg_fos_paper_citation_counts_REPLACE_v2_ed_VS.xlsx
│ ├── 12_Review_2020-10-02
│ │ ├── 12_ReplaceFOS.json
│ │ ├── 12_process_replace_fos.py
│ │ └── replace-review_2020-10-02.csv
│ └── ReplacedFOS.xlsx
├── 2_remove
│ ├── 20_FP7-4-SD_edited
│ │ ├── 20_RemoveFOS.json
│ │ ├── 20_process_remove_fos.py
│ │ └── bad_fos.csv
│ ├── 21_8_NABS_FOS
│ │ ├── 21_RemoveFOS.json
│ │ ├── 21_process_remove_fos.py
│ │ └── NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx
│ ├── 22_TJL-24_review
│ │ ├── 22_RemoveFOS.json
│ │ ├── 22_process_remove_fos.py
│ │ └── osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx
│ ├── 23_Restructuring_review
│ │ ├── 23_RemoveFOS.json
│ │ ├── 23_process_remove_fos.py
│ │ └── sdg-fos_restructuring-v3_to-remove.xlsx
│ ├── 24_Review_2020-10-02
│ │ ├── 24_RemoveFOS.json
│ │ ├── 24_process_remove_fos.py
│ │ └── remove-review_2020-10-02.csv
│ ├── 25_TOL-7_MostPopularSDG3FOS
│ │ ├── 25_RemoveFOS.json
│ │ ├── 25_process_remove_fos.py
│ │ └── TOL-7_MostPopularSDG3RemoveFOS.csv
│ └── RemovedFOS.xlsx
└── 3_blacklist
│ ├── 30_8_NABS_FOS
│ ├── 30_BlacklistFOS.csv
│ ├── 30_process_blacklist_fos.py
│ └── NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx
│ ├── AssembleBlacklist.py
│ └── Blacklist.csv
├── requirements.txt
├── sampleAPICall.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .DS_Store
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # VSCode project settings
115 | .vscode
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
--------------------------------------------------------------------------------
/AssemblingOntology.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import cpu_count
2 | from tqdm import tqdm
3 | from utils import process_fosname, levenshtein_ratio, sws, sdg_label_sort
4 |
5 | import concurrent.futures
6 | import json
7 | import os
8 | import pandas as pd
9 |
10 |
def process_add_all_to_all_fos():
    """Collect the "all to all" Fields of Study grouped by SDG label.

    Every sub-directory of ``raw_data/0_add/02_add_all_to_all`` (entries whose
    name contains a dot are skipped) is searched for a file whose name contains
    ``_ProcessedFOS.json``. Each such file maps an SDG label to a list of
    ``[fos_id, fos_name]`` pairs.

    Returns:
        dict mapping SDG label -> set of ``(str(fos_id), fos_name)`` tuples
        merged over all source directories.
    """
    path = 'raw_data/0_add/02_add_all_to_all'
    processed_fos = dict()
    for directory_name in os.listdir(path):
        if '.' in directory_name:
            continue
        directory = f'{path}/{directory_name}'
        fname = next(
            (name for name in os.listdir(directory) if '_ProcessedFOS.json' in name),
            None,
        )
        if fname is None:
            # f-string so the offending directory is actually reported.
            print(f'Sdg FOS are not processed in {directory}')
            continue
        with open(f'{directory}/{fname}', 'r') as file_:
            processed_sdg_fos = json.load(file_)
        for sdg_label, fos in processed_sdg_fos.items():
            # IDs are normalised to str so they compare equal across sources.
            processed_fos.setdefault(sdg_label, set()).update(
                (str(item[0]), item[1]) for item in fos
            )

    return processed_fos
33 |
34 |
def process_replace_fos():
    """Collect FOS "move" instructions from ``raw_data/1_replace``.

    Sub-directories (entries whose name contains a dot are skipped) are
    processed in ascending order of the integer prefix of their name, so
    later reviews are applied after earlier ones. In each directory the file
    whose name contains ``_ReplaceFOS.json`` maps a FOS id to a list of moves.

    Returns:
        list of ``(str(fos_id), move)`` tuples in processing order.

    Raises:
        ValueError: if a sub-directory name does not start with an integer
            followed by ``_``.
    """
    path = 'raw_data/1_replace'
    directory_names = sorted(
        (name for name in os.listdir(path) if '.' not in name),
        key=lambda name: int(name.split('_')[0]),
    )

    replace_fos = []
    for directory_name in directory_names:
        directory = f'{path}/{directory_name}'
        fname = next(
            (name for name in os.listdir(directory) if '_ReplaceFOS.json' in name),
            None,
        )
        if fname is None:
            # f-string so the offending directory is actually reported.
            print(f'Sdg replace FOS are not processed in {directory}')
            continue
        with open(f'{directory}/{fname}', 'r') as file_:
            processed_replace_fos = json.load(file_)
        for fos_id, moves in processed_replace_fos.items():
            replace_fos.extend((str(fos_id), move) for move in moves)

    return replace_fos
59 |
60 |
def process_remove_fos():
    """Collect the FOS ids to remove per SDG label from ``raw_data/2_remove``.

    Every sub-directory (entries whose name contains a dot are skipped) is
    searched for a file whose name contains ``_RemoveFOS.json``; each file
    maps an SDG label to a list of FOS ids.

    Returns:
        dict mapping SDG label -> set of FOS ids (as ``str``) merged over all
        source directories.
    """
    path = 'raw_data/2_remove'
    remove_fos = dict()
    for directory_name in os.listdir(path):
        if '.' in directory_name:
            continue
        directory = f'{path}/{directory_name}'
        fname = next(
            (name for name in os.listdir(directory) if '_RemoveFOS.json' in name),
            None,
        )
        if fname is None:
            # f-string so the offending directory is actually reported.
            print(f'Sdg remove FOS are not processed in {directory}')
            continue
        with open(f'{directory}/{fname}', 'r') as file_:
            processed_remove_fos = json.load(file_)

        for sdg_label, fos_ids in processed_remove_fos.items():
            remove_fos.setdefault(sdg_label, set()).update(map(str, fos_ids))

    return remove_fos
85 |
86 |
# Terms per SDG label, as written by AssemblingTerms.py.
with open("InterimTerms.json", "r") as file_:
    sdg_terms = json.load(file_)

# FOS id -> FOS name lookup used for the human-readable reports.
with open('FOSMAP_700.json', 'r') as file_:
    fos_map_700 = json.load(file_)

# Full FOS map; names are normalised once so matching can reuse them.
with open("FOSMAP.json", "r") as file_:
    fos_map = json.load(file_)
fos_to_match = [
    (fos_id, process_fosname(fos_name))
    for fos_id, fos_name in fos_map.items()
]


# Matching with Fields of Study from MS Academic (v10-10-2019)
# Match criteria:
#   - all tokens from a concept must be present in the FOS name
#   - Levenshtein similarity between concept and FOS name must be > 0.85
sdg_matched_fos = dict()
105 |
106 |
107 | def _match_terms_to_fos(sdg_label, terms, fos_to_match, sws, use_pbar, total):
108 | sdg_matched_fos = dict()
109 | if use_pbar:
110 | step = total // len(terms)
111 | total = step * len(terms)
112 | p_bar = tqdm(terms, desc=f'Processing {sdg_label}', total=total, leave=True)
113 | for term, sources in terms:
114 | matched_fos = []
115 | term_parts = list(filter(lambda w: w not in sws, term.split()))
116 | for fos_id, fos_name in fos_to_match:
117 | if all(p in fos_name for p in term_parts) and levenshtein_ratio(term, fos_name) > 0.85:
118 | matched_fos.append([str(fos_id), fos_name])
119 |
120 | matched_fos = sorted(matched_fos, key=lambda x: x[1])
121 | matched_fos_ids, matched_fos_names = list(map(lambda x: x[0], matched_fos)), list(map(lambda x: x[1], matched_fos))
122 | sdg_matched_fos[term] = {
123 | "sources": sorted(sources),
124 | "matched_FOS_ids": matched_fos_ids,
125 | "matched_FOS_names": matched_fos_names
126 | }
127 |
128 | if use_pbar:
129 | p_bar.update(step)
130 | if use_pbar:
131 | p_bar.close()
132 |
133 | return sdg_label, sdg_matched_fos
134 |
135 |
# Leave one core free, but never drop to zero workers: on a single-core host
# cpu_count() - 1 == 0 would break both the batch-size division and the pool.
# NOTE(review): this pool is started from module level without an
# `if __name__ == "__main__":` guard; with the "spawn" start method the
# children re-import this script. Confirm the intended platform/start method.
n_workers = max(1, cpu_count() - 1)
for sdg_label, terms in sdg_terms.items():
    terms = list(terms.items())
    if not terms:
        # Nothing to match for this label; it is left out of the results.
        continue

    # Ceil-divide so that at most n_workers batches are produced.
    bs = (len(terms) + n_workers - 1) // n_workers
    term_batches = [terms[start:start + bs] for start in range(0, len(terms), bs)]

    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        for i, batch in enumerate(term_batches):
            # Only one batch (the second to last) reports progress.
            use_pbar = i == (len(term_batches) - 2)
            futures.append(executor.submit(
                _match_terms_to_fos,
                sdg_label, batch, fos_to_match[:], sws,
                use_pbar=use_pbar, total=len(terms)
            ))

        for future in concurrent.futures.as_completed(futures):
            # Distinct name so the outer loop variable is not shadowed.
            matched_label, matched_fos = future.result()
            sdg_matched_fos.setdefault(matched_label, dict()).update(matched_fos)

# Order SDG labels with sdg_label_sort and terms alphabetically before dumping.
sdg_labels = sorted(sdg_matched_fos.keys(), key=sdg_label_sort)
sdg_matched_fos = {
    sdg_label: {
        fos: sdg_matched_fos[sdg_label][fos] for fos in sorted(sdg_matched_fos[sdg_label].keys())
    } for sdg_label in sdg_labels
}
with open("MatchedFOS.json", "w") as file_:
    json.dump(sdg_matched_fos, file_)
169 |
170 |
# SDG label -> set of every FOS id matched by any of its terms.
sdg_fos = dict()
for sdg_label, sdg_term_data in sdg_matched_fos.items():
    matched_ids = set()
    for term_data in sdg_term_data.values():
        matched_ids.update(term_data['matched_FOS_ids'])
    sdg_fos[sdg_label] = matched_ids

print('\n\n\t--- Percentage of matched FOS ---')
for sdg_label, sdg_term_data in sdg_matched_fos.items():
    # Number of terms that matched no FOS at all.
    unmatched = sum(
        1 for term_data in sdg_term_data.values()
        if not term_data["matched_FOS_ids"]
    )
    print(f'\t{sdg_label} - {100 - int(unmatched * 100 / len(sdg_term_data))}%')
182 |
183 |
184 | """
185 | Adding 0_add/02_all_to_all FOS
186 | """
# Merge the "all to all" FOS ids into the per-SDG sets.
processed_all_to_all_fos = process_add_all_to_all_fos()
for sdg_label, foses in processed_all_to_all_fos.items():
    print(f'{sdg_label} - {len(foses)}')
    target = sdg_fos.setdefault(sdg_label, set())
    # Each entry is a (fos_id, fos_name) tuple; only the id is kept.
    target.update([fos[0] for fos in foses])
194 |
195 |
196 | """
197 | Replacing 1_replace/ FOS
198 | """
# Move FOS ids between SDGs and log every move for the ReplacedFOS report.
data_replaced_fos = {'fos_id': [], 'fos_name': [], 'from_sdg': [], 'to_sdg': []}
processed_replace_fos = process_replace_fos()
for fos_id, move in processed_replace_fos:
    fos_name = fos_map_700.get(fos_id, '')
    from_sdg, to_sdg = move
    try:
        sdg_fos[from_sdg].remove(fos_id)
    except KeyError:
        # Unknown source SDG or FOS not present there: record the move as a
        # plain addition.
        from_sdg = ''
    # The target SDG may not have any FOS yet; create its set on demand
    # instead of failing with KeyError.
    sdg_fos.setdefault(to_sdg, set()).add(fos_id)

    data_replaced_fos['fos_id'].append(fos_id)
    data_replaced_fos['fos_name'].append(fos_name)
    data_replaced_fos['from_sdg'].append(from_sdg)
    data_replaced_fos['to_sdg'].append(to_sdg)


df_replaced = pd.DataFrame(data_replaced_fos)
df_replaced.to_excel('raw_data/1_replace/ReplacedFOS.xlsx', index=False)
218 |
219 | """
220 | Removing 2_remove/ FOS
221 | """
# Drop the FOS ids flagged for removal and log what was actually removed.
data_removed_fos = {'sdg_label': [], 'fos_id': [], 'fos_name': []}
removed_fos = dict()
processed_remove_fos = process_remove_fos()
for sdg_label, fos_to_remove in processed_remove_fos.items():
    if sdg_label not in sdg_fos:
        # Nothing is assigned to this SDG, so nothing can be removed.
        removed_fos[sdg_label] = []
        continue

    current = sdg_fos[sdg_label]
    dropped = removed_fos.setdefault(sdg_label, set())
    dropped.update(current.intersection(fos_to_remove))
    sdg_fos[sdg_label] = current.difference(fos_to_remove)

for sdg_label, rm_fos_ids in removed_fos.items():
    for fos_id in rm_fos_ids:
        # Missing or empty names are reported as an empty string.
        fos_name = fos_map_700.get(str(fos_id)) or ''
        data_removed_fos['sdg_label'].append(sdg_label)
        data_removed_fos['fos_id'].append(fos_id)
        data_removed_fos['fos_name'].append(fos_name)

df_removed = pd.DataFrame(data_removed_fos).sort_values(['sdg_label', 'fos_name'])
df_removed.to_excel('raw_data/2_remove/RemovedFOS.xlsx', index=False)
246 |
247 | """
248 | Writing to file
249 | """
# Freeze every per-SDG set into a sorted list so the JSON output is stable.
for sdg_label in list(sdg_fos):
    sdg_fos[sdg_label] = sorted(sdg_fos[sdg_label])

print("\n\t--- Final FOS Count ---")
for sdg_label, foses in sdg_fos.items():
    print(f'\t{sdg_label} - {len(foses)}')

# Keep the previous ontology as "version minus one" before overwriting it.
with open('OSDG-Ontology.json', 'r') as file_:
    sdg_fos_old = json.load(file_)

with open('OSDG-Ontology_ver-min-1.json', 'w') as file_:
    json.dump(sdg_fos_old, file_)

with open("OSDG-Ontology.json", "w") as file_:
    json.dump(sdg_fos, file_)

# Representative OSDG-Ontology (spreadsheet view)
data_ontology = {'SDG label': [], 'FOS-ID': [], 'FOS-Name': [], 'Link to MAG': []}
for sdg_label, fos_ids in sdg_fos.items():
    # Numeric SDG number so rows sort as 1, 2, ..., 10 rather than as text.
    sdg_nr = int(sdg_label.split('_')[1])
    data_ontology['SDG label'].extend(sdg_nr for _ in fos_ids)
    data_ontology['FOS-ID'].extend(fos_ids)
    data_ontology['FOS-Name'].extend(fos_map_700.get(fos_id, None) for fos_id in fos_ids)
    data_ontology['Link to MAG'].extend(
        f'https://academic.microsoft.com/topic/{fos_id}' for fos_id in fos_ids
    )

df_ontology = pd.DataFrame(data_ontology).sort_values(['SDG label', 'FOS-Name', 'FOS-ID'])
# Restore the textual label once sorting is done.
df_ontology['SDG label'] = df_ontology['SDG label'].apply(lambda sdg_nr: f'SDG_{sdg_nr}')

df_ontology.to_excel('OSDG-Ontology.xlsx', index=False)
282 |
283 |
284 | """
285 | Comparing to the last SdgFOS.json version
286 | """
with open('raw_data/0_add/02_add_all_to_all/8_NABS_FOS/8_ProcessedFOS.json', 'r') as file_:
    nabs = json.load(file_)

with open('raw_data/0_add/02_add_all_to_all/10_PPMI_boost/10_ProcessedFOS.json', 'r') as file_:
    boost = json.load(file_)

data = {
    'sdg': [],
    'add_or_remove': [],
    'fos_id': [], 'fos_name': [],
    'sources': [], 'isinReplaced': [], 'isinRemoved': []
}

# (sdg_label, fos_id) pairs built once, instead of filtering the data frames
# for every single FOS.
replaced_to = set(zip(df_replaced['to_sdg'], df_replaced['fos_id'].astype(str)))
replaced_from = set(zip(df_replaced['from_sdg'], df_replaced['fos_id'].astype(str)))
removed_pairs = set(zip(df_removed['sdg_label'], df_removed['fos_id'].astype(str)))


def _fos_sources(sdg_label, fos_id, nabs_fos_ids, boost_fos_ids):
    """Return the sorted source names that contributed fos_id, or None."""
    sources = set()
    # A label may exist only in the old ontology or only in the all-to-all
    # data, in which case it has no matched terms.
    for mterm_data in sdg_matched_fos.get(sdg_label, {}).values():
        if fos_id in mterm_data['matched_FOS_ids']:
            sources.update(mterm_data['sources'])

    # 8 Nabs & 10 boost aka ATA
    if fos_id in nabs_fos_ids:
        sources.add('8_NABS_FOS')
    if fos_id in boost_fos_ids:
        sources.add('10_PPMI_fos')
    return sorted(sources) or None


for sdg_label in sorted(set(list(sdg_fos.keys()) + list(sdg_fos_old.keys())), key=sdg_label_sort):
    old_foses = sdg_fos_old.get(sdg_label, [])
    new_foses = sdg_fos.get(sdg_label, [])

    added_foses = sorted(set(new_foses).difference(old_foses))
    removed_foses = sorted(set(old_foses).difference(new_foses))

    # IDs are cast to str to agree with the ids stored in sdg_fos.
    nabs_fos_ids = {str(fos[0]) for fos in nabs.get(sdg_label, [])}
    boost_fos_ids = {str(fos[0]) for fos in boost.get(sdg_label, [])}

    # Added
    for fos_id in added_foses:
        data['sdg'].append(sdg_label)
        data['add_or_remove'].append('add')
        data['fos_id'].append(fos_id)
        # Unknown ids get an empty name, as in the other reports.
        data['fos_name'].append(fos_map_700.get(fos_id, ''))
        data['sources'].append(_fos_sources(sdg_label, fos_id, nabs_fos_ids, boost_fos_ids))
        data['isinReplaced'].append((sdg_label, fos_id) in replaced_to)
        data['isinRemoved'].append(False)

    # Removed
    for fos_id in removed_foses:
        data['sdg'].append(sdg_label)
        data['add_or_remove'].append('removed')
        data['fos_id'].append(fos_id)
        data['fos_name'].append(fos_map_700.get(fos_id, ''))
        data['sources'].append(_fos_sources(sdg_label, fos_id, nabs_fos_ids, boost_fos_ids))
        data['isinReplaced'].append((sdg_label, fos_id) in replaced_from)
        data['isinRemoved'].append((sdg_label, fos_id) in removed_pairs)

df_comparison = pd.DataFrame(data).sort_values(['add_or_remove', 'isinReplaced', 'isinRemoved', 'sdg'])
df_comparison.to_excel('comparison_fos_update.xlsx', index=False)
365 |
366 |
--------------------------------------------------------------------------------
/AssemblingTerms.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from utils import sdg_label_sort
3 |
4 | import json
5 | import os
6 |
7 |
INTER_ADD_PATH = 'raw_data/0_add'


def _source_directories(group_name):
    """List the entries of one 0_add group whose name contains no dot."""
    group_path = f'{INTER_ADD_PATH}/{group_name}'
    return [
        f'{group_path}/{directory_name}'
        for directory_name in os.listdir(group_path)
        if '.' not in directory_name
    ]


add_validated_data_paths = _source_directories('00_add_validated')

add_generated_data_paths = _source_directories('01_add_generated')

add_all_to_all_data_paths = _source_directories('02_add_all_to_all')
27 |
28 |
# Gather *_ProcessedKeyTerms -----
sdg_terms_add_validated, sdg_terms_add_generated = dict(), dict()
# SDG label -> OrderedDict(term -> list of source directory names)
term_sources = dict()

# Validated
for directory in add_validated_data_paths:
    candidates = [name for name in os.listdir(directory) if '_ProcessedKeyTerms.json' in name]
    if not candidates:
        print(f'Sdg Terms are not processed in {directory}')
        continue
    with open(f'{directory}/{candidates[0]}', 'r') as file_:
        processed_sdg_terms = json.load(file_)

    source_name = directory.split('/')[-1]
    for sdg_label in sorted(processed_sdg_terms.keys()):
        terms = processed_sdg_terms[sdg_label]
        sdg_terms_add_validated.setdefault(sdg_label, set()).update(terms)

        # Update term sources. Only the terms provided by THIS directory are
        # credited to it; iterating the accumulated set would also credit it
        # with terms that came from earlier directories.
        label_sources = term_sources.setdefault(sdg_label, OrderedDict())
        for term in sorted(set(terms)):
            label_sources.setdefault(term, []).append(source_name)
56 |
# All to all
# TODO leave it for matching? if not, it goes into assembling sdg_fos_script.
# Must be checked for conflicts when assembling generated
for directory in add_all_to_all_data_paths:
    candidates = [name for name in os.listdir(directory) if '_ProcessedFOS.json' in name]
    if not candidates:
        print(f'Sdg FOS are not processed in {directory}')
        continue
    with open(f'{directory}/{candidates[0]}', 'r') as file_:
        processed_sdg_fos = json.load(file_)

    source_name = directory.split('/')[-1]
    for sdg_label in sorted(processed_sdg_fos.keys()):
        # Each entry is [fos_id, fos_name]; the name is used as the term.
        # TODO All to all has ids and might move to Assembling SdgFos script
        terms = {fos[1] for fos in processed_sdg_fos[sdg_label]}
        sdg_terms_add_validated.setdefault(sdg_label, set()).update(terms)

        # Update term sources: walk this directory's own terms directly
        # instead of testing every accumulated term against a list.
        label_sources = term_sources.setdefault(sdg_label, OrderedDict())
        for term in sorted(terms):
            label_sources.setdefault(term, []).append(source_name)

sdg_terms_add_validated = {
    sdg_label: sorted(sdg_terms_add_validated[sdg_label])
    for sdg_label in sorted(sdg_terms_add_validated.keys(), key=sdg_label_sort)
}

with open(f'{INTER_ADD_PATH}/ValidatedSdgTerms.json', 'w') as file_:
    json.dump(sdg_terms_add_validated, file_)
90 |
91 |
# Generated
gen_term_sources = dict()

for directory in add_generated_data_paths:
    candidates = [name for name in os.listdir(directory) if '_ProcessedKeyTerms.json' in name]
    if not candidates:
        print(f'Sdg Terms are not processed in {directory}')
        continue
    with open(f'{directory}/{candidates[0]}', 'r') as file_:
        processed_sdg_terms = json.load(file_)

    source_name = directory.split('/')[-1]
    for sdg_label in sorted(processed_sdg_terms.keys()):
        terms = processed_sdg_terms[sdg_label]
        sdg_terms_add_generated.setdefault(sdg_label, set()).update(terms)

        # Update gen term sources. The label may be absent from the validated
        # data, so its mapping is created on demand; only this directory's
        # own terms are credited to it.
        label_sources = term_sources.setdefault(sdg_label, OrderedDict())
        for term in sorted(set(terms)):
            label_sources.setdefault(term, []).append(source_name)
115 |
# Count in how many SDGs each generated term appears.
term_dist = OrderedDict()
for label_terms in sdg_terms_add_generated.values():
    for term in label_terms:
        term_dist[term] = term_dist.get(term, 0) + 1

# Generated terms claimed by more than one SDG are ambiguous and dropped.
multi_sdg_terms = [term for term, freq in term_dist.items() if freq > 1]  # TODO add to file to keep track

for sdg_label, terms in sdg_terms_add_generated.items():
    kept_terms = terms.difference(multi_sdg_terms)
    # A generated term must not collide with a term validated for another SDG.
    for v_sdg_label, v_terms in sdg_terms_add_validated.items():
        if v_sdg_label == sdg_label:
            continue
        kept_terms = kept_terms.difference(v_terms)
    sdg_terms_add_generated[sdg_label] = kept_terms

    # Update fos source for both validated and generated
    # NOTE(review): gen_term_sources is never filled in the visible code, so
    # this branch looks unreachable -- confirm before removing.
    if sdg_label not in gen_term_sources.keys():
        continue
    for term, sources in gen_term_sources[sdg_label].items():
        if term not in sdg_terms_add_generated[sdg_label]:
            continue
        if term not in term_sources[sdg_label].keys():
            term_sources[sdg_label][term] = []
        term_sources[sdg_label][term] += sources

sdg_terms_add_generated = {
    sdg_label: sorted(sdg_terms_add_generated[sdg_label])
    for sdg_label in sorted(sdg_terms_add_generated.keys(), key=sdg_label_sort)
}

with open(f'{INTER_ADD_PATH}/GeneratedSdgTerms.json', 'w') as file_:
    json.dump(sdg_terms_add_generated, file_)
148 |
# Combined Validated and Generated Sdg Terms
sdg_ontology_combined = OrderedDict()

# Directory names of the "all to all" sources.
ata_sources = [path.split('/')[-1] for path in add_all_to_all_data_paths]
sdg_labels = sorted(
    set(sdg_terms_add_validated.keys()) | set(sdg_terms_add_generated.keys()),
    key=sdg_label_sort,
)
for sdg_label in sdg_labels:
    combined_terms = OrderedDict()
    sdg_ontology_combined[sdg_label] = combined_terms
    validated_terms = sdg_terms_add_validated.get(sdg_label, [])
    generated_terms = sdg_terms_add_generated.get(sdg_label, [])

    for term in sorted(set(validated_terms + generated_terms)):
        t_sources = sorted(term_sources[sdg_label][term], key=sdg_label_sort)
        # Terms coming exclusively from all-to-all sources are left out.
        if not all(src in ata_sources for src in t_sources):
            combined_terms[term] = t_sources

with open("InterimTerms.json", "w") as file_:
    json.dump(sdg_ontology_combined, file_)
169 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at osdg@technote.ai. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute to OSDG?
2 |
3 | The OSDG project welcomes contributions from all users and communities. We have identified several areas, which could benefit the project the most:
4 |
5 | 1) Suggest new data sources - If you have a data source or a classifier to recognize one or more SDGs, you can suggest it to be added to the OSDG data sources.
6 | 2) Suggest new/better rules for data source cleaning / integration.
7 | 3) Suggest improvements to the procedure for matching the items in the combined ontology to the Fields of Study in Microsoft Academic.
8 |
9 | This can be done in various ways :
10 | a) by posting an issue on OSDG GitHub ;
11 | b) by contacting the team at [osdg@technote.ai](mailto:osdg@technote.ai);
12 | c) by forking the project repository, integrating the new data and then creating a pull request (read more about [pull requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)).
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/Methodology.md:
--------------------------------------------------------------------------------
1 | ## Methodology
2 | OSDG aims to:
3 | - integrate various existing attempts to classify research according to Sustainable Development Goals,
4 | - make this process open, transparent and user-friendly.
5 |
6 | OSDG integrates the existing research into a comprehensive approach, and does so in a way that evades the shortcomings of former individual approaches and duplication of research efforts.
7 |
8 |
9 |
10 |
11 |
12 | ## About the project
13 | In short, OSDG builds an **integrated ontology** from the feature sets identified in previous research, and then matches the ontology items to the topics from [Microsoft Academic](https://academic.microsoft.com/home).
14 | OSDG takes relevant text features (such as ontology items, features from machine-learning models or extracted keywords) from the previous research, cleans them and merges them into a comprehensive, constantly-growing OSDG ontology. The ontology items are mapped to the ever-growing list of topics/Fields of Study in the Microsoft Academic Graph (MAG).
15 | By doing this, we:
16 | - expand the ontology – acquire more key terms associated with the relevant MAG Topics, natively called Fields of Study (FOS);
17 | - capture more nuanced relationships between individual terms and latent concepts.
18 |
19 | ## How does OSDG work?
20 | OSDG processes user queries in the following steps:
21 | 1) It tags the user query with FOS’es from Microsoft Academic Graph (MAG);
22 | 2) It cross-references the FOS’es assigned to the user query with the OSDG Ontology and determines which SDGs (if any) are relevant for the query;
23 | 3) The relevance of an SDG to a query is interpreted as being “Strong” or “Moderate” depending on a threshold that is adjusted individually for each SDG by testing the tool on a set of 16 000 scientific publication abstracts.
24 |
25 | Head to the Search page to put our methodology to practical use. If you see something that requires improvement or you would like to contact our data team, please state your enquiry using our contact form.
26 | ## References and inspiration
27 |
28 | The list of data sources used in the current version of the OSDG Tool are [here](https://github.com/TechNote-ai/osdg/blob/master/OSDG_DATA_SOURCES.md). OSDG leverages the data from [Microsoft Academic](https://academic.microsoft.com/home):
29 |
30 | 1) Sinha, A., Shen, Z., Song, Y., Ma, H., Eide, D., Hsu, B.-J. & Wang, K. (2015). An Overview of Microsoft Academic Service (MAS) and Applications. Proceedings of the 24th International Conference on World Wide Web (p./pp. 243--246), Republic and Canton of Geneva, Switzerland: International World Wide Web Conferences Steering Committee. ISBN: 978-1-4503-3473-0. doi:10.1145/2740908.2742839.
31 | 2) Wang, K., Shen, Z., Huang, C., Wu, C., Eide, D., Dong, Y., Qian, J., Kanakia, A., Chen, A. C., & Rogahn, R. (2019). A Review of Microsoft Academic Services for Science of Science Studies. Frontiers in Big Data, 2. doi:10.3389/FDATA.2019.00045
32 |
--------------------------------------------------------------------------------
/OSDG-Ontology.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/OSDG-Ontology.xlsx
--------------------------------------------------------------------------------
/OSDG_DATA_SOURCES.md:
--------------------------------------------------------------------------------
1 | # SDG Data Sources
2 |
3 | ## Expert validated data sources
4 |
5 | | Index | Description | Folder Name | Link |
6 | | :------: | :------ | :------: | ------: |
7 | | 0. | SDG Ontology compiled by Dr Nuria B. Puig and E. Mauleon| 0_PuigOntology | [Dataset](https://figshare.com/articles/SDG_ontology/11106113/1) |
8 | | 6. | Terms by Indicator from SDGIO Ontology | 6_SDGIO_terms | [Link to SDGIO GitHub ](https://github.com/SDG-InterfaceOntology/sdgio) |
9 | ##
10 | ## Generated data sources
11 |
12 | | Index | Description | Folder Name | Link |
13 | | :------: | :------ | :------: | ------: |
14 | | 1. | Mapping from "FP7-4-SD" Project (edited VS and LP) | 1_FP7-4-SD_edited | [Link to Project website](https://www.fp7-4-sd.eu/) |
15 | | 2. | Concepts UN Linked SDG tool extracted from academic publications | 2_LinkedSDG_Concepts | [Link to LinkedSDG Tool](http://linkedsdg.apps.officialstatistics.org/#/) |
16 | | 3. | Concepts extracted from SDG Pathfinder documents extracted via ML | 3_SDGPathfinder_DocumentConcepts | [Document Collection](https://sdg-pathfinder.org/) ; [Modelling Description](https://ppmi.lt/) |
17 | | 4. | Keywords from SDG Pathfinder indicated by the SDG Pathfinder tool itself| 4_SDGPathfinder_Keywords| [SDG Pathfinder](https://sdg-pathfinder.org/) |
18 | | 5. | Concepts UN Linked SDG tool extracted from Administrative Documents | 5_LinkedSDG_DocumentExtracts | [Link to LinkedSDG Tool](http://linkedsdg.apps.officialstatistics.org/#/) |
19 | | 7. | Concepts linked to SDGs from EC Policy Documents | 7_EC_Policy_Doc_Terms | Skrynnyk & Stanciauskas ( 2020 upcoming ) |
20 | | 9. | Keywords from "Science4SDGs" project | 9_SIRIS_Science4SDGs | [Link to "Science4SDGs" project](http://science4sdgs.sirisacademic.com/) |
21 | ##
22 |
23 | ## ATA (all-to-all) data sources
24 |
25 | | Index | Description | Folder Name | Link |
26 | | :------: | :------ | :------: | ------: |
27 | | 8. | FOS'es Linked to NABs Areas | 8_NABS_FOS | [Link to Eurostat](https://ec.europa.eu/eurostat/ramon/nomenclatures/index.cfm?TargetUrl=LST_NOM_DTL&StrNom=CEPA_1994&StrLanguageCode=EN&IntPcKey=4431590&StrLayoutCode=HIERARCHIC) |
28 | | 10. | A boost of SDG relevant FOS'es compiled by PPMI researchers | 10_PPMI_boost | [PPMI](https://ppmi.lt)|
29 | #
30 |
31 | ***
32 |
33 |
34 | ****
35 | # Raw Data structure
36 | * `raw_data/`
37 | * `0_add/`
38 | * `00_add_validated/`\
39 | **Expert validated term labels**\
40 | **→** each data source must produce:
41 | *`*_ProcessedKeyTerms.json`*
42 | ```python
43 | {
44 | 'SDG_1': ['term_1', 'term_2', ...],
45 | 'SDG_2': ['term_3', 'term_4', ...],
46 | ...
47 | }
48 | ```
49 | * `01_add_generated/`\
50 | **Generated term labels**\
51 | **→** each data source must produce:
52 | *`*_ProcessedKeyTerms.json`*
53 | ```python
54 | {
55 | 'SDG_1': ['term_1', 'term_2', ...],
56 | 'SDG_2': ['term_3', 'term_4', ...],
57 | ...
58 | }
59 | ```
60 | * `02_add_all_to_all/`\
61 | **Expert validated FOS labels**\
62 | **→** each data source must produce:
63 | *`*_ProcessedFOS.json`*
64 | ```python
65 | {
66 | 'SDG_1': [['fos_id_1', 'fos_name_1'], ['fos_id_2', 'fos_name_2'], ...],
67 | 'SDG_2': [['fos_id_3', 'fos_name_3'], ['fos_id_4', 'fos_name_4'], ...],
68 | ...
69 | }
70 | ```
71 | * `1_replace/`\
72 | **Mapping for FOS SDG label reassignment from `SDG_a` to `SDG_b`**\
73 | **→** each data source must produce:
74 | *`*_ReplaceFOS.json`*
75 | ```python
76 | {
77 | 'fos_id_1': [['SDG_1', 'SDG_2'], ...],
78 | 'fos_id_2': [['SDG_3', 'SDG_4'], ...],
79 | ...
80 | }
81 | ```
82 | * `2_remove/`\
83 | **FOS to remove from sdg assigned FOS lists**\
84 | **→** each data source must produce:
85 | *`*_RemoveFOS.json`*
86 | ```python
87 | {
88 | 'SDG_1': ['fos_id_1', 'fos_id_2', ...],
89 | 'SDG_2': ['fos_id_1', 'fos_id_3', ...],
90 | ...
91 | }
92 | ```
93 | * `Blacklist`\
94 | **Irrelevant FOS**\
95 | **→** each data source must produce:
96 | *`*_Blacklist.csv`*
97 | | fos_id | fos_name |
98 | | :------ | :-------- |
99 | | fos_id_1 | fos_name_1 |
100 | | fos_id_2 | fos_name_2 |
101 | | ... | ...|
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ### OSDG has moved to a new repository (https://github.com/osdg-ai/osdg-mapping).
4 | **All the updates will be made to the new repo only!**
5 |
6 |
--------------------------------------------------------------------------------
/comparison_fos_update.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/comparison_fos_update.xlsx
--------------------------------------------------------------------------------
/images/Methodology-visual_0511_Updated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/images/Methodology-visual_0511_Updated.png
--------------------------------------------------------------------------------
/images/OSDG.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/images/OSDG.png
--------------------------------------------------------------------------------
/images/OSDG_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/images/OSDG_new.png
--------------------------------------------------------------------------------
/raw_data/0_add/00_add_validated/0_PuigOntology/0_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Mon Apr 27 17:39:53 2020
5 |
6 | @author: lukas
7 | """
8 | import json
9 |
10 |
# Maps the label found in the second CSV field (e.g. "SDG1" -- the exact
# spelling is whatever the file contains) to the list of raw terms found in
# the first field of the same rows.
data = {}

# The context manager guarantees the file is closed even if parsing raises.
with open("Ontology.csv", "r", encoding="latin1") as csv_file:
    for line in csv_file:
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would truncate the final row's label when the file
        # does not end with a newline.
        parts = line.rstrip("\n").split(";")
        if len(parts) == 1:
            # A line without any ";" separator (for example an empty line)
            # marks the end of the table.
            break
        term, label = parts[0], parts[1]
        # Rows whose second field is the literal "clasification" are skipped
        # (presumably the CSV header row -- confirm against Ontology.csv).
        if label != "clasification":
            data.setdefault(label, []).append(term)
25 |
26 | #%%
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]


def pre_proc(list_o_strings):
    """
    Clean a list of raw key terms.

    Every term is lower-cased, each character sequence listed in
    ``replacables_symbols`` is replaced by a space, the term is split on
    whitespace and the words listed in ``replacables_words`` are removed.
    The remaining words are re-joined with single spaces.

    A cleaned term is kept only if it
      * consists solely of a-z, 0-9 and spaces (a term containing any other
        character is discarded as a whole, it is not stripped),
      * is longer than 4 characters, and
      * has not already been produced by an earlier input term.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw key terms.

    Returns
    -------
    list of str
        Cleaned, de-duplicated terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    # Built on every call so that later changes to the module-level list are
    # still honoured.
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check instead of scanning `processed`
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() already removes leading, trailing and repeated
        # whitespace, so no further trimming is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
56 |
57 | #%%
# Rewrite every label by inserting an underscore after "SDG"
# (e.g. "SDG1" -> "SDG_1") and clean the term list that belongs to it.
data_proc = {}

for key, value in data.items():
    key2 = key.replace("SDG", "SDG_")
    data_proc[key2] = pre_proc(value)


#%%
js = json.dumps(data_proc)
# The context manager closes the output file even if the write fails.
with open("0_ProcessedKeyTerms.json", "w") as file:
    file.write(js)
70 |
--------------------------------------------------------------------------------
/raw_data/0_add/00_add_validated/0_PuigOntology/Ontology.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/00_add_validated/0_PuigOntology/Ontology.csv
--------------------------------------------------------------------------------
/raw_data/0_add/00_add_validated/6_SDGIO_terms/6_ProcessedKeyTerms.json:
--------------------------------------------------------------------------------
1 | {"SDG_1": ["access to basic services", "disaster", "poverty reduction programme", "national action plans that support actions that eradicate poverty sustainably use natural resources"], "SDG_2": ["undernourishment", "stunting", "malnutrition wasting overweight", "volume production", "total factor productivity", "sustainable agricultural practices", "agricultural households", "eco friendly fertilizers", "ex situ crop collections enrichment index", "local crops breeds wild relatives", "import export tariffs", "agricultural export subsidies"], "SDG_3": ["maternal deaths", "birth", "mortality rate", "neonatel", "hiv infections", "tuberculosis", "malaria", "hepatitis b infections", "neglected tropical diseases", "cardiovascular disease", "suicide", "substance use disorders", "harmful use alcohol", "road traffic", "family planning", "adolescent birth rate", "tracer interventions", "health expenditure", "household pollution", "hazardous chemicals", "tobacco use", "access to affordable medicines vaccines", "official development assistance", "health worker", "13 core capacities"], "SDG_4": ["education children young people", "developmentally on track", "organized learning", "parity indices", "fixed level proficiency in functional skills", "environmental science geoscience", "official development assistance flows scholarships", "teachers"], "SDG_5": ["ever partnered", "married in a union", "unpaid domestic care work", "seats in national parliaments local governments", "managerial positions", "informed decisions", "laws regulations", "ownership secure rights", "legal framework customary law", "mobile telephone"], "SDG_6": ["safely managed drinking water services", "sanitation services", "wastewater safely treated", "good ambient water quality", "water use efficiency", "water resources used", "water related ecosystems extent", "official development assistance water sanitation related", "local communities participation in water sanitation managemnt"], "SDG_7": ["acces to 
electricity", "clean fuels technology", "renewable energy share final energy consumption", "energy intensity primary energy gdp", "100 billion commitment", "net domestic energy use"], "SDG_8": ["annual growth rate real gdp", "employed person", "informal employment non agricultural employment", "resource productivity", "hourly earnings", "unemployment rate", "child labour", "fatal occupational injury non fatal occupational injury", "international labour organization conventions", "tourism direct gdp tourism industries", "commercial bank branches atm", "aid trade", "social protection employment progammes"], "SDG_9": ["rural population all season road", "freight volume passenger volume", "manufacturing value added", "manufacturing employment", "small scale industries", "loan credit", "co2 emission", "research development", "researchers", "official international support", "mobile network"], "SDG_10": ["household expenditure", "median income", "report that personally felt discriminated against harassed", "labour share", "members in international organizations voting rights in international organizations", "recruitment cost borne by employee", "international migration policy index", "victims human trafficking", "tariff lines", "resource flows development", "remittance costs"], "SDG_11": ["slum informal settlement inadequate housing", "access to public transport convenient", "land consumption population growth rate", "urban planning management", "urban solid waste", "fine particulate matter", "open space public use", "physical harassment sexual harassment women subjected to", "urban regional development plans implementing", "risk reduction resilience strategies implementing"], "SDG_12": ["international multilateral environmental agreements on hazardous other chemicals waste", "treatment waste", "national recycling rate", "sustainability reports", "sustainable public procurement policies action plans implementing", "sustainable development lifestyle topics", "green patent 
applications", "residual flows", "fossil fuel subsidies", "effective climate change related planning management"], "SDG_14": ["nitrogen use efficiency composite", "coastal marine development", "fish stocks", "protected areas", "negative fishery subsidies", "fisheries", "research in marine technology", "regional seas protocols provisions"], "SDG_15": ["forest area", "forest cover", "net permanent forest loss", "degraded land", "important sites mountain biodiversity", "mountain green cover index", "red list index", "access benefit sharing clearinghouse", "rli species in trade", "illegal trade in wildlife wildlife products", "invasive alien species", "biodiversity ecosystem services values", "sustainable use biodiversity ecosystems conservation", "forestry official development assistance"], "SDG_16": ["victim intentional homicide", "conflict related deaths", "feel safe walking alone", "physical punishment", "human trafficking", "victims violence", "unsentenced detainees", "illicit financial flows", "small arms light weapons", "contact with a public official", "primary government expenditures", "satisfactory experience with public services", "positions in public institutions", "national development plans poverty reduction strategies", "births registered with a civil authority", "physical sexual crime reported by victim"], "SDG_17": ["total governement revenue", "domestic taxes", "net official development assistance", "volume remittances", "debt service", "sustainable development objectives safeguard", "access to patent information", "fixed internet broadband subscriptions", "environmentally sound technologies", "internet use by individual", "sustainable development in three dimensions", "global exports", "tariff", "official development assistance loan agreements", "public private civil society partnerships", "national statistical legislation", "statistical capacity", "inclusive wealth"]}
--------------------------------------------------------------------------------
/raw_data/0_add/00_add_validated/6_SDGIO_terms/6_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu May 15 15:47:41 2020
5 |
6 | @author: lukas
7 | """
8 |
9 |
10 | import pandas as pd
11 | import json
12 | from tqdm import tqdm
13 |
replacables_symbols = ["&", "-", '"', " "]
replacables_words = ["and", "or", "for", "&", "of", "sdg", "oecd", "arctic"]


def pre_proc(list_o_strings):
    """
    Clean a list of raw key terms.

    Every term is lower-cased, each character sequence listed in
    ``replacables_symbols`` is replaced by a space, the term is split on
    whitespace and the words listed in ``replacables_words`` are removed.
    The remaining words are re-joined with single spaces.

    A cleaned term is kept only if it
      * consists solely of a-z, 0-9 and spaces (a term containing any other
        character is discarded as a whole, it is not stripped),
      * is longer than 4 characters, and
      * has not already been produced by an earlier input term.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw key terms.

    Returns
    -------
    list of str
        Cleaned, de-duplicated terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    # Built on every call so that later changes to the module-level list are
    # still honoured.
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check instead of scanning `processed`
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() already removes leading, trailing and repeated
        # whitespace, so no further trimming is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
43 |
# One dict per spreadsheet row, keyed by column header.
dfl = pd.read_excel("SDG Terms by Indicator.xlsx").to_dict(orient="records")


# Goal number (the text before the first "." of the indicator number) mapped
# to its SDG label. Every goal maps to its own label; in particular goal 13
# must map to "SDG_13" and not be folded into "SDG_12".
number_map = {str(goal): "SDG_" + str(goal) for goal in range(1, 18)}

# SDG label -> raw (uncleaned) terms, in spreadsheet order.
sdg_words_raw = {}

for row in tqdm(dfl):
    number = row['Indicator \nNumber'].split(".")[0]
    sdg = number_map[number]
    # str() because the cell may not hold a string (e.g. an empty cell).
    sdg_words_raw.setdefault(sdg, []).append(str(row["Term(s)"]))

# Report how many raw terms were collected per SDG and overall.
counter = 0
print("Key Words Identified before cleaning : " )
for key, value in sdg_words_raw.items():
    print(key, " : ", len(value))
    counter += len(value)

print("Overall : ", counter)
82 |
83 | #%%
# Replace every SDG's raw term list by its cleaned version.
for sdg_label in list(sdg_words_raw):
    sdg_words_raw[sdg_label] = pre_proc(sdg_words_raw[sdg_label])

#%%
# Count how often each cleaned term occurs across all SDG lists.
word_freq_dict = {}
for terms in sdg_words_raw.values():
    for term in terms:
        word_freq_dict[term] = word_freq_dict.get(term, 0) + 1

#%%
# Keep only the terms that were counted once, i.e. drop every term that
# shows up in more than one place rather than keeping a single copy of it.
sdg_words = {
    sdg_label: [term for term in terms if word_freq_dict[term] < 2]
    for sdg_label, terms in sdg_words_raw.items()
}
104 |
105 | #%%
js = json.dumps(sdg_words)
# The context manager closes the output file even if the write fails.
with open("6_ProcessedKeyTerms.json", "w") as file:
    file.write(js)

# Report how many terms remain per SDG and overall.
counter = 0
print("Key Words Identified after cleaning: " )
for key, value in sdg_words.items():
    print(key, " : ", len(value))
    counter += len(value)

print("Overall : ", counter)
118 |
--------------------------------------------------------------------------------
/raw_data/0_add/00_add_validated/6_SDGIO_terms/SDG Terms by Indicator.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/00_add_validated/6_SDGIO_terms/SDG Terms by Indicator.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/1_FP7-4-SD_edited/1_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Apr 28 17:02:48 2020
5 |
6 | @author: lukas
7 | """
8 |
9 | import json
10 |
11 | #Hand curated list of bad FOS ids
12 | from bad_fos import *
13 | #%%
# Each input is read inside a context manager so the handle is released even
# if the JSON cannot be parsed.

# Indexed below by project key; each value is iterated as a list of SDG labels.
with open("WU_projectSDGs.json", "r") as file:
    project_sdgs = json.loads(file.read())

# Indexed below by the same project key; each value is a dict whose items are
# ranked by their values.
with open("ProjectFOS.json", "r") as file:
    project_fos = json.loads(file.read())

with open("FOSMAP.json", "r") as file:
    fos_map = json.loads(file.read())
25 |
26 |
27 | #%%
# For every project:
#   * rank its FOS'es by the value stored for them (largest first) and take
#     the first 10,
#   * drop the ones whose id is listed in bad_fos (EU project slang related
#     FOS'es),
#   * add the rest to every SDG the project is assigned to; using a set per
#     SDG removes duplicates.
sdg_fos_raw = {}
for project_id, project_labels in project_sdgs.items():
    ranked = sorted(
        project_fos[project_id].items(),
        key=lambda kv: kv[1],
        reverse=True,
    )
    fos = [entry[0] for entry in ranked[0:10] if int(entry[0]) not in bad_fos]
    for label in project_labels:
        sdg_fos_raw.setdefault(label, set()).update(fos)

# Turn the per-SDG sets into lists.
for label in list(sdg_fos_raw):
    sdg_fos_raw[label] = list(sdg_fos_raw[label])

#Data on SDG 17 in this set is very poor quality ; removing it
pop = sdg_fos_raw.pop("SDG_17", None)
47 |
48 | #%%
# Remove FOS'es that were assigned to projects but do not align well with the
# SDG in question. Each SDG has its own exclusion collection of integer FOS
# ids (presumably supplied by the star import from bad_fos).
sdg_exclusions = (
    ("SDG_1", bad_sdg1_fos),
    ("SDG_2", bad_sdg2_fos),
    ("SDG_3", bad_sdg3_fos),
    ("SDG_4", bad_sdg4_fos),
    ("SDG_5", bad_sdg5_fos),
    ("SDG_6", bad_sdg6_fos),
    ("SDG_7", bad_sdg7_fos),
    ("SDG_8", bad_sdg8_fos),
    ("SDG_9", bad_sdg9_fos),
    ("SDG_10", bad_sdg10_fos),
    ("SDG_11", bad_sdg11_fos),
    ("SDG_12", bad_sdg12_fos),
    ("SDG_13", bad_sdg13_fos),
    ("SDG_14", bad_sdg14_fos),
    ("SDG_15", bad_sdg15_fos),
    ("SDG_16", bad_sdg16_fos),
)

for sdg_label, excluded_ids in sdg_exclusions:
    sdg_fos_raw[sdg_label] = [
        fos_id
        for fos_id in sdg_fos_raw[sdg_label]
        if int(fos_id) not in excluded_ids
    ]
72 |
73 |
74 | #%%
75 | """
76 | Deduplicating FOS
77 | """
78 | fos_freq_dict = {}
79 | for val in list(sdg_fos_raw.values()) :
80 | for v in val :
81 | if v not in fos_freq_dict :
82 | fos_freq_dict[ v ] = 1
83 | else:
84 | fos_freq_dict[ v ] += 1
85 |
86 | #%%
87 | sdg_fos = {}
88 | for key , value in sdg_fos_raw.items() :
89 | plh = [ i for i in value if fos_freq_dict[i] < 2]
90 | sdg_fos[ key ] = plh
91 |
92 |
93 | #js = json.dumps( sdg_fos )
94 | #file = open("NewWU.json" , "w")
95 | #file.write( js )
96 | #file.close()
97 |
98 | #%%
99 | sdg_fos_s = {}
100 | for key , value in sdg_fos_raw.items() :
101 | plh = [ fos_map[ i ].lower() for i in value if fos_freq_dict[i] < 2 and i in fos_map.keys() ]
102 | sdg_fos_s[ key ] = plh
103 |
104 |
105 | #%%
106 | js = json.dumps( sdg_fos_s )
107 | file = open("1_ProcessedKeyTerms.json" , "w")
108 | file.write( js )
109 | file.close()
110 |
111 | #%%
112 | """
113 | for key , value in sdg_fos.items() :
114 | file = open( key+".txt" , "w")
115 | for v in value :
116 | line = v + "\t" + fos_map[ v ] +"\n"
117 | file.write( line )
118 | file.close()
119 | """
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/2_LinkedSDG_Concepts/2_ProcessedKeyTerms.json:
--------------------------------------------------------------------------------
1 | {"SDG_12": ["hazardous waste", "cleaner production", "sustainable production"], "SDG_8": ["occupational accident", "trade financing", "financial services", "youth employment", "trade promotion", "labour productivity", "decent work", "occupational safety", "occupational hazards", "salaires", "occupational accidents"], "SDG_3": ["non communicable diseases", "diseases", "mortality", "death", "public health", "health personnel", "physicians", "reproductive health", "family planning", "traffic accidents", "suicide", "maternal child health", "narcotic drugs", "tobacco", "youth health", "communicable diseases", "mothers", "child mortality", "maternal mortality", "tuberculosis", "food hygiene", "child health", "traffic safety", "malaria", "toxic substance", "water related diseases", "international health regulations", "childbirth", "infant mortality", "sex education", "medical research", "health hazards", "infants", "tropical disease", "mental health", "smoking", "vaccination", "alcoholism", "road traffic", "deliveries", "mortalities", "toxicity", "maladies infectieuses", "delivery", "infectious diseases", "deaths", "alcohol abuse", "mortality rates", "tropical diseases", "dioxins", "burial", "death rate", "vaccinations", "suicides", "cause death", "toxic chemicals", "community health", "toxic substances", "dioxin", "perinatal mortality", "mortality rate", "seropositivity", "food safety", "toxicities", "hiv infections", "mortalite", "terminally ill", "human mortality", "parturition", "death rates", "burials", "meres", "health the population", "confinement", "malarias", "sante publique", "funeral", "adolescent health", "enfermedades", "addiction to tobacco", "mortalidad", "medical personnel", "sante mentale", "salud publica", "maladies", "newborn babies", "natural death", "narcotics", "confinements", "childbirths", "road safety", "enfants", "funerals", "tobacco addiction", "salud mental", "recien nacidos", "lactantes", "muerte", "toxic discharge", "medecins"], "SDG_9": 
["innovation", "industrial infrastructure", "industrialization", "research development", "technology", "technologies", "technological innovations", "scientific personnel", "engineering", "innovations", "technological innovation", "industrial innovation", "industrial infrastructures"], "SDG_5": ["girls", "participation women", "position women", "gender based violence", "harmful traditional practices", "unpaid work", "female circumcision", "women managers", "political participation", "excision", "situation women", "excisions", "sexual violence", "violence against women"], "SDG_1": ["standard living"], "SDG_11": [], "SDG_2": ["sustainable agriculture", "food shortage", "hunger", "malnutrition", "food security", "agricultural policy", "economic policy", "starvation", "food production", "food price", "agricultural development", "famine", "food insecurity", "food prices", "access to food", "food availability", "agricultural policies", "economic policies", "multifunctional agriculture", "economic choices", "food shortages", "local food production", "starvations"], "SDG_6": ["use water", "water management", "use waters"], "SDG_16": ["human rights", "civil registration", "birth reporting", "child abuse", "access to information", "corruption", "rule law", "bribery", "legal protection", "public information", "detained persons", "administration justice", "societe civile", "protection human rights", "corrupt practices", "public bodies", "public institutions", "human rights violations", "corruptions"], "SDG_4": ["educational facilities", "right to education", "computer literacy", "teacher", "scholarships", "preschool education", "educational financing", "teacher training", "teachers", "kindergarten", "kindergartens", "professors", "professor", "educacion", "teaching staff", "educational buildings"], "SDG_7": ["sustainable energy", "energy market", "66 energy", "renewable energy sources", "electrification", "energy resources", "energy", "energy efficiency", "environmentally sound 
technology", "energies", "energy sector", "fuels", "alternative energy sources", "power sector", "energy supply", "energy supplies", "fuel efficiency", "energy sectors", "energy markets", "clean technologies", "energy efficiencies", "fuel resources"], "SDG_15": ["terrestrial ecosystem", "terrestrial ecosystems", "deforestation", "mountain ecosystems", "freshwater ecosystem", "national parks reserves", "protected area", "desertification", "national park", "forest", "forests", "biological diversity", "forest ecosystems", "biodiversity", "endangered species", "protected areas", "diversidad biologica", "biodiversidad", "bosque", "species diversity", "woodland", "woodlands", "national parks", "biosphere reserves", "freshwater ecosystems", "nature reserves", "biodiversite", "salinisation", "alpine ecology", "alpine ecosystems"], "SDG_13": [], "SDG_17": ["partnership", "internet", "national budget", "economic support", "resources mobilization", "external debt", "debt servicing", "programme evaluation", "programme ownership", "partnerships", "state budget", "third world"], "SDG_14": ["oceans seas", "ocean", "marine life", "marine resources", "marine environment", "marine ecosystems", "marine environments", "oceans", "marine ecology", "oceanos", "marine fauna", "sea resources"], "SDG_10": ["social security", "income distribution", "income inequalities", "income inequality", "social protection", "social insurance"]}
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/2_LinkedSDG_Concepts/2_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu Apr 30 10:16:21 2020
5 |
6 | @author: lukas
7 | """
8 |
9 |
10 | import pandas as pd
11 | import json
12 |
13 | #%%
replacables_symbols = ["&" , "-" , '"' , " "]
replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"]


def pre_proc(list_o_strings):
    """
    Normalise raw key terms and return them deduplicated, in first-seen order.

    Each term is lower-cased, every character listed in ``replacables_symbols``
    is replaced by a space, whitespace is collapsed and the filler words in
    ``replacables_words`` are removed.

    A cleaned term is kept only if it
      * consists solely of ASCII letters, digits and spaces (a term containing
        any other character is dropped as a whole, nothing is stripped), and
      * is longer than 4 characters.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw terms.

    Returns
    -------
    list of str
        Cleaned, unique terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check; `processed` keeps the order
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split() + join collapses whitespace runs and trims both ends,
        # so no separate stripping of leading/trailing spaces is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
43 |
44 |
45 | #%%
# One record (dict) per spreadsheet row; the columns read below are
# "Goal", "Concept" and "Keyword".
dfl = pd.read_excel("LinkedSDG_Data.xlsx").to_dict(orient="records")

sdg_words_raw = {}
for row in dfl:
    # NOTE(review): rows without a Goal are skipped entirely - confirm this
    # matches the intended nesting of the original checks.
    if pd.isna(row["Goal"]):
        continue
    sdg = row["Goal"].replace("Goal ", "SDG_")
    words = sdg_words_raw.setdefault(sdg, [])
    for column in ("Concept", "Keyword"):
        if pd.notna(row[column]):
            words.append(row[column].lower())

sdg_words_raw = {key: pre_proc(value) for key, value in sdg_words_raw.items()}

#%%
"""
Counting in how many SDGs each keyword appears
(pre_proc already removed duplicates within a single SDG)
"""
word_freq_dict = {}
for val in sdg_words_raw.values():
    for v in val:
        word_freq_dict[v] = word_freq_dict.get(v, 0) + 1

#%%
# Keep only the keywords that are unique to a single SDG.
sdg_words = {
    key: [i for i in value if word_freq_dict[i] < 2]
    for key, value in sdg_words_raw.items()
}

#%%
with open("2_ProcessedKeyTerms.json", "w", encoding="utf-8") as fh:
    json.dump(sdg_words, fh)
85 |
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/2_LinkedSDG_Concepts/LinkedSDG_Data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/2_LinkedSDG_Concepts/LinkedSDG_Data.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/3_SDGPathfinder_DocumentConcepts/3_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu Apr 30 09:43:00 2020
5 |
6 | @author: lukas
7 | """
8 |
9 | import pandas as pd
10 | import json
11 | import ast
12 |
13 |
14 |
15 | #%%
replacables_symbols = ["&" , "-" , '"' , " "]
replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"]


def pre_proc(list_o_strings):
    """
    Normalise raw key terms and return them deduplicated, in first-seen order.

    Each term is lower-cased, every character listed in ``replacables_symbols``
    is replaced by a space, whitespace is collapsed and the filler words in
    ``replacables_words`` are removed.

    A cleaned term is kept only if it
      * consists solely of ASCII letters, digits and spaces (a term containing
        any other character is dropped as a whole, nothing is stripped), and
      * is longer than 4 characters.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw terms.

    Returns
    -------
    list of str
        Cleaned, unique terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check; `processed` keeps the order
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split() + join collapses whitespace runs and trims both ends,
        # so no separate stripping of leading/trailing spaces is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
45 |
46 |
47 | #%%
df = pd.read_excel("OECD_SDG_betas.xlsx")
#%%
"""
File includes TOP 1000 positive and negative beta coeficients from the regression models for both unigrams and ngrams
We will take the top 250 ngrams and top 50 unigrams (positive coefficients only)
"""
TOP_UNIGRAMS = 50
TOP_NGRAMS = 250


def _top_terms(cell, limit):
    """
    Parse a cell holding the literal of a sequence of pairs and return the
    first element of the `limit` pairs with the largest second element.
    """
    # literal_eval only evaluates Python literals, it does not execute code.
    pairs = sorted(ast.literal_eval(cell), key=lambda kv: kv[1], reverse=True)
    return [pair[0] for pair in pairs[:limit]]


unigrams = list(df["Keywords_Positive"])
ngrams = list(df["Ngrams_Positive"])

sdg_words_raw = {}

# NOTE(review): the SDG number is taken from the row position (first row ->
# SDG_1) - confirm the sheet is ordered that way.
for number, (unigram_cell, ngram_cell) in enumerate(zip(unigrams, ngrams), start=1):
    candidates = _top_terms(unigram_cell, TOP_UNIGRAMS) + _top_terms(ngram_cell, TOP_NGRAMS)
    sdg_words_raw["SDG_" + str(number)] = pre_proc(candidates)

#%%
"""
Counting in how many SDGs each keyword appears
(pre_proc already removed duplicates within a single SDG)
"""
word_freq_dict = {}
for val in sdg_words_raw.values():
    for v in val:
        word_freq_dict[v] = word_freq_dict.get(v, 0) + 1

#%%
# Keep only the keywords that are unique to a single SDG.
sdg_words = {
    key: [i for i in value if word_freq_dict[i] < 2]
    for key, value in sdg_words_raw.items()
}

#%%
# Terms removed by hand from SDG 3.
bad_sdg3_terms = {
    "data type",
    "date signature",
    "date start",
    "date start date",
    "deliverable list",
    "deliverable wp",
    "demonstrator dissemination",
    "description deliverable",
    "developed new",
    "development new",
    "dissemination activities",
    "dissemination report",
    "document version",
    "ec contribution",
}

sdg_words["SDG_3"] = [i for i in sdg_words["SDG_3"] if i not in bad_sdg3_terms]

#%%
with open("3_ProcessedKeyTerms.json", "w", encoding="utf-8") as fh:
    json.dump(sdg_words, fh)
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/3_SDGPathfinder_DocumentConcepts/OECD_SDG_betas.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/3_SDGPathfinder_DocumentConcepts/OECD_SDG_betas.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/4_SDGPathfinder_Keywords/4_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu Apr 30 11:43:03 2020
5 |
6 | @author: lukas
7 | """
8 |
9 | import pandas as pd
10 | import json
11 | import ast
12 |
13 |
14 |
15 | #%%
replacables_symbols = ["&" , "-" , '"' , " "]
replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"]


def pre_proc(list_o_strings):
    """
    Normalise raw key terms and return them deduplicated, in first-seen order.

    Each term is lower-cased, every character listed in ``replacables_symbols``
    is replaced by a space, whitespace is collapsed and the filler words in
    ``replacables_words`` are removed.

    A cleaned term is kept only if it
      * consists solely of ASCII letters, digits and spaces (a term containing
        any other character is dropped as a whole, nothing is stripped), and
      * is longer than 4 characters.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw terms.

    Returns
    -------
    list of str
        Cleaned, unique terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check; `processed` keeps the order
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split() + join collapses whitespace runs and trims both ends,
        # so no separate stripping of leading/trailing spaces is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
45 |
46 | #%%
# Maps the value of the "sdg" column in keywords.csv to the SDG label used
# in the output file.
sdg_dict = {
    "partnerships-for-the-goals": "SDG_17",
    "good-health": "SDG_3",
    "no-poverty": "SDG_1",
    "life-below-water": "SDG_14",
    "peace-justice-and-strong-institutions": "SDG_16",
    "decent-work-growth": "SDG_8",
    "responsible-consumption": "SDG_12",
    "climate-action": "SDG_13",
    "industry-innovation-and-infrastructure": "SDG_9",
    "gender-equality": "SDG_5",
    "affordable-energy": "SDG_7",
    "reduced-inequalities": "SDG_10",
    "zero-hunger": "SDG_2",
    "clean-water": "SDG_6",
    "sustainable-cities": "SDG_11",
    "quality-education": "SDG_4",
    "life-on-land": "SDG_15",
}


#%%
dfl = pd.read_csv("keywords.csv").to_dict(orient="records")

sdg_words_raw = {}

for row in dfl:
    # An unknown "sdg" value raises KeyError on purpose: it means the mapping
    # above is out of date.
    words = sdg_words_raw.setdefault(sdg_dict[row["sdg"]], [])
    # The "keys" cell holds the literal of a list of dicts, each with a "key"
    # entry; literal_eval parses it without executing code.
    for entry in ast.literal_eval(row["keys"]):
        words.append(entry["key"].lower())

#%%
sdg_words_raw = {key: pre_proc(value) for key, value in sdg_words_raw.items()}

#%%
"""
Counting in how many SDGs each keyword appears
(pre_proc already removed duplicates within a single SDG)
"""
word_freq_dict = {}
for val in sdg_words_raw.values():
    for v in val:
        word_freq_dict[v] = word_freq_dict.get(v, 0) + 1

#%%
# Keep only the keywords that are unique to a single SDG.
sdg_words = {
    key: [i for i in value if word_freq_dict[i] < 2]
    for key, value in sdg_words_raw.items()
}

#%%
with open("4_ProcessedKeyTerms.json", "w", encoding="utf-8") as fh:
    json.dump(sdg_words, fh)
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/5_LinkedSDG_DocumentExtracts/5_ProcessedKeyTerms.json:
--------------------------------------------------------------------------------
1 | {"SDG_12": ["hazardous waste", "cleaner production", "sustainable production"], "SDG_8": ["occupational accident", "trade financing", "financial services", "youth employment", "trade promotion", "labour productivity", "decent work", "occupational safety", "occupational hazards", "salaires", "occupational accidents"], "SDG_3": ["non communicable diseases", "diseases", "mortality", "death", "public health", "health personnel", "physicians", "reproductive health", "family planning", "traffic accidents", "suicide", "maternal child health", "narcotic drugs", "tobacco", "youth health", "communicable diseases", "mothers", "child mortality", "maternal mortality", "tuberculosis", "food hygiene", "child health", "traffic safety", "malaria", "toxic substance", "water related diseases", "international health regulations", "childbirth", "infant mortality", "sex education", "medical research", "health hazards", "infants", "tropical disease", "mental health", "smoking", "vaccination", "alcoholism", "road traffic", "deliveries", "mortalities", "toxicity", "maladies infectieuses", "delivery", "infectious diseases", "deaths", "alcohol abuse", "mortality rates", "tropical diseases", "dioxins", "burial", "death rate", "vaccinations", "suicides", "cause death", "toxic chemicals", "community health", "toxic substances", "dioxin", "perinatal mortality", "mortality rate", "seropositivity", "food safety", "toxicities", "hiv infections", "mortalite", "terminally ill", "human mortality", "parturition", "death rates", "burials", "meres", "health the population", "confinement", "malarias", "sante publique", "funeral", "adolescent health", "enfermedades", "addiction to tobacco", "mortalidad", "medical personnel", "sante mentale", "salud publica", "maladies", "newborn babies", "natural death", "narcotics", "confinements", "childbirths", "road safety", "enfants", "funerals", "tobacco addiction", "salud mental", "recien nacidos", "lactantes", "muerte", "toxic discharge", "medecins"], "SDG_9": 
["innovation", "industrial infrastructure", "industrialization", "research development", "technology", "technologies", "technological innovations", "scientific personnel", "engineering", "innovations", "technological innovation", "industrial innovation", "industrial infrastructures"], "SDG_5": ["girls", "participation women", "position women", "gender based violence", "harmful traditional practices", "unpaid work", "female circumcision", "women managers", "political participation", "excision", "situation women", "excisions", "sexual violence", "violence against women"], "SDG_1": ["standard living"], "SDG_11": [], "SDG_2": ["sustainable agriculture", "food shortage", "hunger", "malnutrition", "food security", "agricultural policy", "economic policy", "starvation", "food production", "food price", "agricultural development", "famine", "food insecurity", "food prices", "access to food", "food availability", "agricultural policies", "economic policies", "multifunctional agriculture", "economic choices", "food shortages", "local food production", "starvations"], "SDG_6": ["use water", "water management", "use waters"], "SDG_16": ["human rights", "civil registration", "birth reporting", "child abuse", "access to information", "corruption", "rule law", "bribery", "legal protection", "public information", "detained persons", "administration justice", "societe civile", "protection human rights", "corrupt practices", "public bodies", "public institutions", "human rights violations", "corruptions"], "SDG_4": ["educational facilities", "right to education", "computer literacy", "teacher", "scholarships", "preschool education", "educational financing", "teacher training", "teachers", "kindergarten", "kindergartens", "professors", "professor", "educacion", "teaching staff", "educational buildings"], "SDG_7": ["sustainable energy", "energy market", "66 energy", "renewable energy sources", "electrification", "energy resources", "energy", "energy efficiency", "environmentally sound 
technology", "energies", "energy sector", "fuels", "alternative energy sources", "power sector", "energy supply", "energy supplies", "fuel efficiency", "energy sectors", "energy markets", "clean technologies", "energy efficiencies", "fuel resources"], "SDG_15": ["terrestrial ecosystem", "terrestrial ecosystems", "deforestation", "mountain ecosystems", "freshwater ecosystem", "national parks reserves", "protected area", "desertification", "national park", "forest", "forests", "biological diversity", "forest ecosystems", "biodiversity", "endangered species", "protected areas", "diversidad biologica", "biodiversidad", "bosque", "species diversity", "woodland", "woodlands", "national parks", "biosphere reserves", "freshwater ecosystems", "nature reserves", "biodiversite", "salinisation", "alpine ecology", "alpine ecosystems"], "SDG_13": [], "SDG_17": ["partnership", "internet", "national budget", "economic support", "resources mobilization", "external debt", "debt servicing", "programme evaluation", "programme ownership", "partnerships", "state budget", "third world"], "SDG_14": ["oceans seas", "ocean", "marine life", "marine resources", "marine environment", "marine ecosystems", "marine environments", "oceans", "marine ecology", "oceanos", "marine fauna", "sea resources"], "SDG_10": ["social security", "income distribution", "income inequalities", "income inequality", "social protection", "social insurance"]}
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/5_LinkedSDG_DocumentExtracts/5_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu Apr 30 15:47:41 2020
5 |
6 | @author: lukas
7 | """
8 |
9 |
10 |
11 | import pandas as pd
12 | import json
13 |
14 | #%%
replacables_symbols = ["&" , "-" , '"' , " "]
replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"]


def pre_proc(list_o_strings):
    """
    Normalise raw key terms and return them deduplicated, in first-seen order.

    Each term is lower-cased, every character listed in ``replacables_symbols``
    is replaced by a space, whitespace is collapsed and the filler words in
    ``replacables_words`` are removed.

    A cleaned term is kept only if it
      * consists solely of ASCII letters, digits and spaces (a term containing
        any other character is dropped as a whole, nothing is stripped), and
      * is longer than 4 characters.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw terms.

    Returns
    -------
    list of str
        Cleaned, unique terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check; `processed` keeps the order
    for item in list_o_strings:
        item = item.lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split() + join collapses whitespace runs and trims both ends,
        # so no separate stripping of leading/trailing spaces is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
44 |
45 |
46 | #%%
# One record (dict) per spreadsheet row; the columns read below are
# "Goal", "Concept" and "Keyword".
dfl = pd.read_excel("LinkedSDG_DocumentExtracts.xlsx").to_dict(orient="records")

sdg_words_raw = {}
for row in dfl:
    # NOTE(review): rows without a Goal are skipped entirely - confirm this
    # matches the intended nesting of the original checks.
    if pd.isna(row["Goal"]):
        continue
    sdg = row["Goal"].replace("Goal ", "SDG_")
    words = sdg_words_raw.setdefault(sdg, [])
    for column in ("Concept", "Keyword"):
        if pd.notna(row[column]):
            words.append(row[column].lower())

sdg_words_raw = {key: pre_proc(value) for key, value in sdg_words_raw.items()}

#%%
"""
Counting in how many SDGs each keyword appears
(pre_proc already removed duplicates within a single SDG)
"""
word_freq_dict = {}
for val in sdg_words_raw.values():
    for v in val:
        word_freq_dict[v] = word_freq_dict.get(v, 0) + 1

#%%
# Keep only the keywords that are unique to a single SDG.
sdg_words = {
    key: [i for i in value if word_freq_dict[i] < 2]
    for key, value in sdg_words_raw.items()
}

#%%
with open("5_ProcessedKeyTerms.json", "w", encoding="utf-8") as fh:
    json.dump(sdg_words, fh)
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/5_LinkedSDG_DocumentExtracts/LinkedSDG_DocumentExtracts.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/5_LinkedSDG_DocumentExtracts/LinkedSDG_DocumentExtracts.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/7_EC_Policy_Doc_Terms/7_ProcessedKeyTerms.json:
--------------------------------------------------------------------------------
1 | {"SDG_1": ["old age", "inter american", "economic downturn", "small medium", "el salvador", "poverty eradication", "gender responsive", "pre school", "gender sensitive", "price volatility", "lump sum", "austria belgium"], "SDG_2": ["fishery aquaculture", "adaptation climate change", "central america", "animal health", "research technological", "shelf life", "west africa", "clinical trial"], "SDG_3": ["road map", "hiv aids", "breast cancer", "mg ml"], "SDG_4": ["lifelong learning", "best practice", "vocational education", "teaching profession", "english language", "chamber commerce", "youth unemployment", "north west", "shed light", "early childhood education", "public procurement", "democratic republic congo", "tel mail", "secondary education", "job search", "vocational education training", "unite kingdom"], "SDG_5": ["violence woman", "millennium development goals", "holistic approach", "male female", "essential element", "vocational training", "criminal justice", "small medium sized", "venture capital", "lessons learned", "domestic violence", "post conflict"], "SDG_6": ["analytical method", "soil erosion", "river lake", "water sanitation", "saudi arabia", "wastewater treatment plant", "surface water", "heavy metal", "czech republic", "flood risk", "wastewater treatment", "river basin", "lung cancer", "van den", "supply sanitation", "service provider", "drink water", "urban rural", "add value", "iceland norway", "extreme weather"], "SDG_7": ["added value", "sole responsibility", "remedial action", "free charge", "table contents", "food chain", "oil gas", "wind power", "medical device"], "SDG_8": ["latin american", "energy efficiency", "joint venture", "family life", "northern ireland", "solve problem", "gender balance", "gender gap", "medium term", "migrant worker", "ministry finance", "corporate governance"], "SDG_9": ["air transport", "gender perspective", "emission trading", "electric vehicle", "capital formation", "artificial intelligence", "republic 
korea", "millennium development goal", "import export", "ex post", "prime minister", "high speed", "freight transport", "scientific technological", "guinea bissau", "high tech"], "SDG_10": ["ethnic minority", "annex iii", "cros border", "root cause", "past decade", "elderly people", "commodity price", "disposable income", "minimum wage"], "SDG_11": ["directorate general", "better understanding", "france germany", "urban mobility", "executive summary", "south east asia", "intellectual property right", "fuel consumption", "high spee", "cash flow", "south west"], "SDG_12": ["ministry agriculture", "win win", "mm mm", "convention biological diversity", "medium sized", "north atlantic", "air pollutant", "fish stock"], "SDG_14": ["marine ecosystem", "biodiversity loss", "north sea", "baseline scenario", "mid term", "bosnia herzegovina", "fishing vessel", "papua new guinea", "longer term", "fax mail"], "SDG_15": ["fossil fuel", "low carbon economy", "kyoto protocol", "easily accessible", "damage cause", "genetic resource", "north east", "central eastern europe", "non discriminatory", "internet thing", "motor vehicle", "mitigation adaptation", "search engine", "solar panel", "biodiversity ecosystem"], "SDG_16": ["van der", "personal datum", "ad hoc"], "SDG_17": ["peer review", "natural resources", "road traffic", "communicable disease", "policy makers"]}
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/7_EC_Policy_Doc_Terms/7_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu May 15 15:47:41 2020
5 |
6 | @author: lukas
7 | """
8 |
9 | import pandas as pd
10 | import json
11 | from tqdm import tqdm
12 |
13 | import ast
14 |
replacables_symbols = ["&" , "-" , '"' , " "]
replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"]


def pre_proc(list_o_strings):
    """
    Normalise raw key terms and return them deduplicated, in first-seen order.

    Underscores are turned into spaces first (so underscore-joined ngrams
    become space-separated). Each term is then lower-cased, every character
    listed in ``replacables_symbols`` is replaced by a space, whitespace is
    collapsed and the filler words in ``replacables_words`` are removed.

    A cleaned term is kept only if it
      * consists solely of ASCII letters, digits and spaces (a term containing
        any other character is dropped as a whole, nothing is stripped), and
      * is longer than 4 characters.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw terms.

    Returns
    -------
    list of str
        Cleaned, unique terms in order of first appearance.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    stop_words = set(replacables_words)

    processed = []
    seen = set()  # O(1) duplicate check; `processed` keeps the order
    for item in list_o_strings:
        item = item.replace("_", " ").lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split() + join collapses whitespace runs and trims both ends,
        # so no separate stripping of leading/trailing spaces is needed.
        item = " ".join(word for word in item.split() if word not in stop_words)

        if len(item) <= 4 or item in seen:
            continue
        if not allowed_chars.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
46 |
47 | dfl = pd.read_excel("ECPolicyDocs_Ngrams REVISED.xlsx").to_dict( orient = "records")
48 |
49 |
# Maps the spreadsheet's "Goal_<n>" labels to the "SDG_<n>" keys used for the
# collected terms. Generated rather than typed out so that every goal maps to
# its own SDG (in particular "Goal_13" -> "SDG_13", not "SDG_12").
number_map = {
    "Goal_{}".format(n): "SDG_{}".format(n)
    for n in range(1, 18)
}
68 |
# Collect the raw n-grams of every spreadsheet row under the SDG key that the
# row's goal label maps to.
sdg_words_raw = {}

for row in tqdm(dfl):
    # Only the part of the "Goal" value before the first "." is the lookup key.
    number = row['Goal'].split(".")[0]
    sdg = number_map[number]
    # The cell stores a Python literal as text; literal_eval parses it without
    # executing arbitrary code.
    ngrams = ast.literal_eval(row["SDG&EC_NgramsOverlap"])
    # Extend instead of assign, so that several rows mapping to the same SDG
    # accumulate rather than the last row replacing the earlier ones.
    sdg_words_raw.setdefault(sdg, []).extend(ngrams)

counter = 0
print("Key Words Identified before cleaning : ")
for key, value in sdg_words_raw.items():
    print(key, " : ", len(value))
    counter += len(value)

print("Overall : ", counter)
85 |
86 | #%%
# Normalise each SDG's term list in place (the key set is unchanged, only the
# values are replaced).
for sdg_key in list(sdg_words_raw):
    sdg_words_raw[sdg_key] = pre_proc(sdg_words_raw[sdg_key])

#%%
# Deduplicating keywords: count in how many term lists each cleaned term
# occurs across all SDGs.
word_freq_dict = {}
for terms in sdg_words_raw.values():
    for term in terms:
        word_freq_dict[term] = word_freq_dict.get(term, 0) + 1

#%%
# Keep only the terms whose overall count is below 2.
sdg_words = {
    sdg_key: [term for term in terms if word_freq_dict[term] < 2]
    for sdg_key, terms in sdg_words_raw.items()
}
107 |
108 | #%%
# Serialise the per-SDG term lists and write them out. The with-block closes
# the file even when the write raises.
js = json.dumps(sdg_words)
with open("7_ProcessedKeyTerms.json", "w") as file:
    file.write(js)

# Report how many terms survived, per SDG and in total.
counter = 0
print("Key Words Identified after cleaning: ")
for key, value in sdg_words.items():
    print(key, " : ", len(value))
    counter += len(value)

print("Overall : ", counter)
121 |
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/7_EC_Policy_Doc_Terms/ECPolicyDocs_Ngrams REVISED.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/7_EC_Policy_Doc_Terms/ECPolicyDocs_Ngrams REVISED.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/9_process_key_terms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu May 15 15:47:41 2020
5 |
6 | @author: lukas
7 | """
8 |
9 | import pandas as pd
10 | import json
11 | from tqdm import tqdm
12 |
13 | import ast
14 |
replacables_symbols = ["&" , "-" , '"' , " "]
replacables_words = ["and" , "or" , "for", "&" , "of" , "sdg" , "oecd" , "arctic"]

def pre_proc( list_o_strings ):
    """
    Normalise raw key terms and return the unique survivors in input order.

    For every term:
    lower-cases it and turns "_" and each entry of ``replacables_symbols``
    into a space ;
    drops the tokens listed in ``replacables_words`` ( "and" , "of" , etc. ) ;
    collapses whitespace runs into single spaces ;
    discards the term if it still contains anything other than a-z , 0-9
    or a space ( such terms are dropped , not repaired ) ;
    keeps only terms longer than 4 characters ;
    deduplicates.

    Parameters
    ----------
    list_o_strings : iterable of str
        Raw key terms.

    Returns
    -------
    list of str
        Cleaned, unique terms in order of first appearance.
    """
    allowed = set("abcdefghijklmnopqrstuvwxyz0123456789 ")
    seen = set()  # O(1) membership test; `processed` keeps the order
    processed = []
    for item in list_o_strings:
        item = item.replace("_", " ").lower()
        for symbol in replacables_symbols:
            item = item.replace(symbol, " ")
        # split()/join() collapses whitespace and trims both ends, so no
        # separate leading/trailing-space handling is needed afterwards.
        item = " ".join(
            word for word in item.split() if word not in replacables_words
        )
        if len(item) <= 4 or item in seen:
            continue
        if not allowed.issuperset(item):
            continue
        seen.add(item)
        processed.append(item)
    return processed
46 |
# Load the vocabulary workbook; each spreadsheet row becomes one record in `dfl`.
dfl = pd.read_excel(
    "sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx"
).to_dict(orient="records")
48 |
49 |
# Maps the spreadsheet's "SDG <n>" labels to the "SDG_<n>" keys used for the
# collected terms. Generated rather than typed out so that every label maps to
# its own SDG (in particular "SDG 13" -> "SDG_13", not "SDG_12").
number_map = {
    "SDG {}".format(n): "SDG_{}".format(n)
    for n in range(1, 18)
}
68 |
# Gather each row's primary keyword plus its "|"-separated extras under the
# SDG key that the row's "SDG" label maps to.
sdg_words_raw = {}

for row in tqdm(dfl):
    number = row['SDG']
    sdg = number_map[number]
    terms = sdg_words_raw.setdefault(sdg, [])
    terms.append(row["keyword"])
    # A missing "extra" value (NaN) would otherwise be stringified into the
    # bogus term "nan" and inflate the counts reported below.
    if pd.notna(row["extra"]):
        terms.extend(str(row["extra"]).split("|"))

counter = 0
print("Key Words Identified before cleaning : ")
for key, value in sdg_words_raw.items():
    print(key, " : ", len(value))
    counter += len(value)

print("Overall : ", counter)
87 |
88 | #%%
# Normalise each SDG's term list in place (the key set is unchanged, only the
# values are replaced).
for sdg_key in list(sdg_words_raw):
    sdg_words_raw[sdg_key] = pre_proc(sdg_words_raw[sdg_key])

#%%
# Deduplicating keywords: count in how many term lists each cleaned term
# occurs across all SDGs.
word_freq_dict = {}
for terms in sdg_words_raw.values():
    for term in terms:
        word_freq_dict[term] = word_freq_dict.get(term, 0) + 1

#%%
# Keep only the terms whose overall count is below 2.
sdg_words = {
    sdg_key: [term for term in terms if word_freq_dict[term] < 2]
    for sdg_key, terms in sdg_words_raw.items()
}
109 |
110 | #%%
# Serialise the per-SDG term lists and write them out. The with-block closes
# the file even when the write raises.
js = json.dumps(sdg_words)
with open("9_ProcessedKeyTerms.json", "w") as file:
    file.write(js)

# Report how many terms survived, per SDG and in total.
counter = 0
print("Key Words Identified after cleaning: ")
for key, value in sdg_words.items():
    print(key, " : ", len(value))
    counter += len(value)

print("Overall : ", counter)
123 |
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo](single_shhet).xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo].xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/01_add_generated/9_SIRIS_Science4SDGs/sdg_vocabulary_V1.2 [zenodo].xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/02_add_all_to_all/10_PPMI_boost/10_process_fos.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 |
4 |
def collect_sdg_fos(rows):
    """
    Group (FOS number, FOS name) pairs by SDG label.

    Parameters
    ----------
    rows : iterable of dict
        Records with the keys 'SDG number', 'FOS number' and 'FOS name'.
        Records whose 'SDG number' stringifies to "nan" are skipped.

    Returns
    -------
    dict
        "SDG_<n>" -> list of (str(FOS number), FOS name) tuples, in row order.
    """
    grouped = {}
    for row in rows:
        if str(row['SDG number']) == "nan":
            continue
        sdg = f"SDG_{int(row['SDG number'])}"
        grouped.setdefault(sdg, []).append(
            (str(row['FOS number']), row["FOS name"])
        )
    return grouped


def sort_sdg_fos(sdg_words):
    """
    Return a new dict with the SDG labels in ascending numeric order and each
    label's pairs sorted by FOS name.

    Merely iterating the existing dict in sorted key order does not reorder
    it, so the dict is rebuilt to make the written JSON follow SDG order.
    """
    ordered_labels = sorted(
        sdg_words, key=lambda label: int(label.split('_')[-1])
    )
    return {
        label: sorted(sdg_words[label], key=lambda pair: pair[1])
        for label in ordered_labels
    }


def report_counts(title, sdg_words):
    """Print *title*, the number of entries per SDG and the overall total; return the total."""
    counter = 0
    print(title)
    for key, value in sdg_words.items():
        print(key, " : ", len(value))
        counter += len(value)

    print("Overall : ", counter)
    return counter


if __name__ == "__main__":
    data_1 = pd.read_excel("SDG FOS updated 06 01.xlsx").to_dict(orient="records")
    data_2 = pd.read_excel("SDG FOS updated 06 12.xlsx").to_dict(orient="records")

    # Rows of the first workbook are processed before those of the second.
    sdg_words = collect_sdg_fos(data_1 + data_2)

    report_counts("Key Words Identified before cleaning : ", sdg_words)

    sdg_words = sort_sdg_fos(sdg_words)

    with open("10_ProcessedFOS.json", "w") as file_:
        file_.write(json.dumps(sdg_words))

    # NOTE(review): nothing is removed between the two reports, so both print
    # the same totals.
    report_counts("Key Words Identified after cleaning: ", sdg_words)
40 |
--------------------------------------------------------------------------------
/raw_data/0_add/02_add_all_to_all/10_PPMI_boost/SDG FOS updated 06 01.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/02_add_all_to_all/10_PPMI_boost/SDG FOS updated 06 01.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/02_add_all_to_all/10_PPMI_boost/SDG FOS updated 06 12.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/02_add_all_to_all/10_PPMI_boost/SDG FOS updated 06 12.xlsx
--------------------------------------------------------------------------------
/raw_data/0_add/02_add_all_to_all/8_NABS_FOS/8_process_fos.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 |
3 | import json
4 | import pandas as pd
5 |
6 |
FNAME_PROCESSED_KEY_TERMS = "8_ProcessedFOS.json"


def group_fos_by_sdg(rows):
    """
    Group FOS entries by SDG label.

    Parameters
    ----------
    rows : iterable of (fos_name, fos_id, sdg_nr)

    Returns
    -------
    dict
        "SDG_<sdg_nr>" -> list of (str(fos_id), fos_name) tuples sorted by
        name, with the labels inserted in ascending numeric order. Iterating
        an existing dict in sorted key order does not reorder it, so the
        result is built in the wanted order.
    """
    grouped = {}
    for fos_name, fos_id, sdg_nr in rows:
        grouped.setdefault(f'SDG_{sdg_nr}', []).append((str(fos_id), fos_name))

    ordered_labels = sorted(grouped, key=lambda label: int(label.split('_')[-1]))
    return {
        label: sorted(grouped[label], key=lambda pair: pair[1])
        for label in ordered_labels
    }


if __name__ == '__main__':
    fos_data = pd.read_excel('NABS_FOS_update_2020-08-20_ed_VS.xlsx')[['FOS NAME', 'FOS NUMBER', 'SDG']].drop_duplicates()

    # Ignore fos list: every FOS number that has at least one row marked
    # 'NOT RELEVANT'; all rows of such a FOS number are excluded below.
    ignore_fos = fos_data[fos_data['SDG'] == 'NOT RELEVANT']['FOS NUMBER'].unique()
    relevant = fos_data[~fos_data['FOS NUMBER'].isin(ignore_fos)]

    sdg_fos = group_fos_by_sdg(tqdm(relevant.values))

    with open(FNAME_PROCESSED_KEY_TERMS, 'w') as file_:
        json.dump(sdg_fos, file_)
28 |
--------------------------------------------------------------------------------
/raw_data/0_add/02_add_all_to_all/8_NABS_FOS/NABS_FOS_update_2020-08-20_ed_VS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/0_add/02_add_all_to_all/8_NABS_FOS/NABS_FOS_update_2020-08-20_ed_VS.xlsx
--------------------------------------------------------------------------------
/raw_data/1_replace/11_TJL-24_review/11_ReplaceFOS.json:
--------------------------------------------------------------------------------
1 | {"159390177": [["SDG_6", "SDG_2"]], "50516716": [["SDG_6", "SDG_2"]], "139838865": [["SDG_11", "SDG_10"]], "38774213": [["SDG_2", "SDG_2"], ["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "65580899": [["SDG_3", "SDG_2"], ["SDG_6", "SDG_2"]], "182124840": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "52896960": [["SDG_15", "SDG_7"]], "551662922": [["SDG_12", "SDG_8"]], "152494472": [["SDG_6", "SDG_2"]], "198072978": [["SDG_6", "SDG_2"]], "156634047": [["SDG_6", "SDG_2"]], "39464130": [["SDG_6", "SDG_2"], ["SDG_13", "SDG_2"]], "109332788": [["SDG_11", "SDG_10"]], "21790881": [["SDG_6", "SDG_14"], ["SDG_12", "SDG_14"]], "160934017": [["SDG_6", "SDG_2"]], "141650431": [["SDG_6", "SDG_2"]], "109162521": [["SDG_6", "SDG_2"]], "2779819667": [["SDG_11", "SDG_12"]], "7083945": [["SDG_11", "SDG_12"]], "20529654": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "85675897": [["SDG_2", "SDG_2"], ["SDG_6", "SDG_2"], ["SDG_13", "SDG_2"], ["SDG_15", "SDG_2"]], "205726622": [["SDG_6", "SDG_2"]], "120991184": [["SDG_6", "SDG_2"]], "7959160": [["SDG_6", "SDG_2"]], "175963888": [["SDG_6", "SDG_2"]], "162902727": [["SDG_6", "SDG_2"]], "53421856": [["SDG_6", "SDG_2"]], "81751973": [["SDG_12", "SDG_8"]], "121923324": [["SDG_6", "SDG_2"]], "120217122": [["SDG_15", "SDG_2"]], "29510844": [["SDG_6", "SDG_2"]], "70957220": [["SDG_6", "SDG_2"]], "2776266027": [["SDG_6", "SDG_7"], ["SDG_11", "SDG_7"]], "141185391": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "154414509": [["SDG_6", "SDG_2"]], "2780189059": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "126408429": [["SDG_2", "SDG_2"], ["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "172365310": [["SDG_6", "SDG_2"]], "2779449393": [["SDG_11", "SDG_16"]], "560292": [["SDG_12", "SDG_8"]], "71864017": [["SDG_6", "SDG_2"]], "24649204": [["SDG_12", "SDG_8"]], "38070178": [["SDG_12", "SDG_8"]], "78285338": [["SDG_12", "SDG_8"]], "2777382958": [["SDG_12", "SDG_8"]], "33411773": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "5589519": [["SDG_6", "SDG_2"]], "3742959": 
[["SDG_6", "SDG_2"]], "53706860": [["SDG_6", "SDG_2"]], "172817999": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "2777638134": [["SDG_11", "SDG_4"]], "58395597": [["SDG_6", "SDG_2"]], "3963096": [["SDG_12", "SDG_8"]], "2777276756": [["SDG_12", "SDG_8"]], "156086215": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "14171219": [["SDG_2", "SDG_2"], ["SDG_6", "SDG_2"]], "2781198434": [["SDG_12", "SDG_8"]], "182745123": [["SDG_6", "SDG_2"]], "2777027713": [["SDG_6", "SDG_2"], ["SDG_7", "SDG_2"]], "45020621": [["SDG_6", "SDG_2"]], "2909722689": [["SDG_6", "SDG_2"]], "104471815": [["SDG_6", "SDG_2"]], "28362043": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "2778202820": [["SDG_12", "SDG_8"]], "2778380070": [["SDG_12", "SDG_8"]], "100474770": [["SDG_6", "SDG_2"]], "102561126": [["SDG_11", "SDG_13"]], "63696750": [["SDG_6", "SDG_2"]], "2775845107": [["SDG_6", "SDG_2"]], "114426456": [["SDG_6", "SDG_2"]], "2780339060": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "154885393": [["SDG_6", "SDG_2"]], "2779872728": [["SDG_12", "SDG_8"]], "59804570": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "174200844": [["SDG_6", "SDG_2"]], "2778099469": [["SDG_11", "SDG_16"]], "2780257989": [["SDG_6", "SDG_2"]], "2779422593": [["SDG_6", "SDG_2"]], "53145804": [["SDG_6", "SDG_2"]], "2778163119": [["SDG_6", "SDG_2"]], "42731165": [["SDG_6", "SDG_2"], ["SDG_15", "SDG_2"]], "152100882": [["SDG_6", "SDG_2"]], "55312793": [["SDG_6", "SDG_2"]], "125596622": [["SDG_6", "SDG_2"]], "160212601": [["SDG_6", "SDG_2"]], "2779746779": [["SDG_6", "SDG_2"]], "2778577444": [["SDG_6", "SDG_2"]], "134906952": [["SDG_6", "SDG_2"]], "2909107899": [["SDG_6", "SDG_2"]], "2778818373": [["SDG_6", "SDG_2"]], "2777073172": [["SDG_6", "SDG_2"]], "2910302653": [["SDG_6", "SDG_2"]]}
--------------------------------------------------------------------------------
/raw_data/1_replace/11_TJL-24_review/11_process_replace_fos.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | import re
4 |
5 |
def build_replace_map(rows):
    """
    Build the FOS -> SDG replacement table.

    Parameters
    ----------
    rows : iterable of (fos_id, replace_from, to_sdg)
        ``replace_from`` is text containing one or more SDG numbers;
        ``to_sdg`` is text whose first number is the target SDG.

    Returns
    -------
    dict
        str(fos_id) -> list of ["SDG_<from>", "SDG_<to>"] pairs, one pair per
        number found in ``replace_from``, in order of appearance.

    Raises
    ------
    IndexError
        If ``to_sdg`` contains no number.
    """
    replace_fos = dict()
    for fos_id, replace_from, to_sdg in rows:
        to_sdg_nr = re.findall(r'\d+', to_sdg)[0]
        target = f"SDG_{to_sdg_nr}"
        # Explicit string keys: JSON object keys are strings anyway, and this
        # avoids relying on json's implicit coercion of the key type.
        pairs = replace_fos.setdefault(str(fos_id), [])
        pairs.extend(
            [f'SDG_{sdg_nr}', target]
            for sdg_nr in re.findall(r'\d+', replace_from)
        )
    return replace_fos


if __name__ == '__main__':
    df = pd.read_excel('osdg_fos_paper_citation_counts_REPLACE_v2_ed_VS.xlsx')

    replace_fos = build_replace_map(df[['fos_id', 'sdgs', 'replace_to']].values)

    with open('11_ReplaceFOS.json', 'w') as file_:
        json.dump(replace_fos, file_)
20 |
--------------------------------------------------------------------------------
/raw_data/1_replace/11_TJL-24_review/osdg_fos_paper_citation_counts_REPLACE_v2_ed_VS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/1_replace/11_TJL-24_review/osdg_fos_paper_citation_counts_REPLACE_v2_ed_VS.xlsx
--------------------------------------------------------------------------------
/raw_data/1_replace/12_Review_2020-10-02/12_ReplaceFOS.json:
--------------------------------------------------------------------------------
1 | {"53421856": [["SDG_2", "SDG_15"]], "160934017": [["SDG_2", "SDG_15"]], "29510844": [["SDG_2", "SDG_15"]], "7959160": [["SDG_2", "SDG_15"]], "198072978": [["SDG_2", "SDG_15"]], "152494472": [["SDG_2", "SDG_15"]], "120991184": [["SDG_2", "SDG_15"]], "14171219": [["SDG_2", "SDG_15"]], "156634047": [["SDG_2", "SDG_15"]], "159390177": [["SDG_2", "SDG_15"]], "50516716": [["SDG_2", "SDG_15"]], "100474770": [["SDG_2", "SDG_15"]], "53706860": [["SDG_2", "SDG_15"]], "58395597": [["SDG_2", "SDG_15"]], "5589519": [["SDG_2", "SDG_15"]], "175963888": [["SDG_2", "SDG_15"]], "2909722689": [["SDG_2", "SDG_15"]], "172365310": [["SDG_2", "SDG_15"]], "3742959": [["SDG_2", "SDG_15"]], "65580899": [["SDG_2", "SDG_15"]], "63696750": [["SDG_2", "SDG_15"]], "182745123": [["SDG_2", "SDG_15"]], "71864017": [["SDG_2", "SDG_15"]], "174200844": [["SDG_2", "SDG_15"]], "160212601": [["SDG_2", "SDG_15"]], "152100882": [["SDG_2", "SDG_15"]], "104471815": [["SDG_2", "SDG_15"]], "39464130": [["SDG_2", "SDG_15"]], "114426456": [["SDG_2", "SDG_15"]], "125596622": [["SDG_2", "SDG_15"]], "55312793": [["SDG_2", "SDG_15"]], "205726622": [["SDG_2", "SDG_15"]], "2910302653": [["SDG_2", "SDG_15"]], "53145804": [["SDG_2", "SDG_15"]], "139669111": [["SDG_2", "SDG_15"]], "162902727": [["SDG_2", "SDG_15"]], "70957220": [["SDG_2", "SDG_15"]]}
--------------------------------------------------------------------------------
/raw_data/1_replace/12_Review_2020-10-02/12_process_replace_fos.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 |
4 |
def build_replace_map(rows):
    """
    Build the FOS -> SDG replacement table.

    Parameters
    ----------
    rows : iterable of (fos_id, from_sdg, to_sdg)

    Returns
    -------
    dict
        str(fos_id) -> list of [from_sdg, to_sdg] pairs in row order.
    """
    replace_fos = dict()
    for fos_id, from_sdg, to_sdg in rows:
        replace_fos.setdefault(str(fos_id), []).append([from_sdg, to_sdg])
    return replace_fos


if __name__ == '__main__':
    df = pd.read_csv('replace-review_2020-10-02.csv')

    # Select the columns by name so that reordering or adding columns in the
    # CSV cannot silently shift the values.
    replace_fos = build_replace_map(df[['fos_id', 'from_sdg', 'to_sdg']].values)

    with open('12_ReplaceFOS.json', 'w') as file_:
        json.dump(replace_fos, file_)
16 |
--------------------------------------------------------------------------------
/raw_data/1_replace/12_Review_2020-10-02/replace-review_2020-10-02.csv:
--------------------------------------------------------------------------------
1 | fos_id,fos_name,from_sdg,to_sdg
2 | 53421856,Soil biology,SDG_2,SDG_15
3 | 160934017,Soil type,SDG_2,SDG_15
4 | 29510844,Soil chemistry,SDG_2,SDG_15
5 | 7959160,Soil biodiversity,SDG_2,SDG_15
6 | 198072978,Soil pH,SDG_2,SDG_15
7 | 152494472,Soil classification,SDG_2,SDG_15
8 | 120991184,Soil structure,SDG_2,SDG_15
9 | 14171219,Agricultural soil science,SDG_2,SDG_15
10 | 156634047,Soil horizon,SDG_2,SDG_15
11 | 159390177,Soil science,SDG_2,SDG_15
12 | 50516716,Soil test,SDG_2,SDG_15
13 | 100474770,Soil physics,SDG_2,SDG_15
14 | 53706860,Soil morphology,SDG_2,SDG_15
15 | 58395597,Red soil,SDG_2,SDG_15
16 | 5589519,Soil series,SDG_2,SDG_15
17 | 175963888,Soil texture,SDG_2,SDG_15
18 | 2909722689,Soil Pollutants,SDG_2,SDG_15
19 | 172365310,Soil microbiology,SDG_2,SDG_15
20 | 3742959,Soil survey,SDG_2,SDG_15
21 | 65580899,Soil contamination,SDG_2,SDG_15
22 | 63696750,USDA soil taxonomy,SDG_2,SDG_15
23 | 182745123,Soil gradation,SDG_2,SDG_15
24 | 71864017,Soil map,SDG_2,SDG_15
25 | 174200844,Unified Soil Classification System,SDG_2,SDG_15
26 | 160212601,World Reference Base for Soil Resources,SDG_2,SDG_15
27 | 152100882,Soil color,SDG_2,SDG_15
28 | 104471815,Digital soil mapping,SDG_2,SDG_15
29 | 39464130,Soil carbon,SDG_2,SDG_15
30 | 114426456,Soil thermal properties,SDG_2,SDG_15
31 | 125596622,Soil resilience,SDG_2,SDG_15
32 | 55312793,National Cooperative Soil Survey,SDG_2,SDG_15
33 | 205726622,Soil mechanics,SDG_2,SDG_15
34 | 2910302653,SOIL EXPOSURE,SDG_2,SDG_15
35 | 53145804,Soil food web,SDG_2,SDG_15
36 | 139669111,Understory,SDG_2,SDG_15
37 | 162902727,Soil conditioner,SDG_2,SDG_15
38 | 70957220,Soil compaction,SDG_2,SDG_15
39 |
--------------------------------------------------------------------------------
/raw_data/1_replace/ReplacedFOS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/1_replace/ReplacedFOS.xlsx
--------------------------------------------------------------------------------
/raw_data/2_remove/20_FP7-4-SD_edited/20_RemoveFOS.json:
--------------------------------------------------------------------------------
1 | {"SDG_1": ["198428699", "170659323", "46578552", "2780196416", "47344431", "2780650499", "2780877353", "2780774518", "193395930", "2781201427", "2778149918", "129963666"], "SDG_2": ["2910099511", "2908895846", "2909158028", "47806933", "2910592767", "2779491563", "2775988993", "18419278", "3480121", "2908684217", "2777626052", "2776423422", "138256199", "65607591", "2779910751", "2777970171", "53007507", "74359761", "2908707527", "2776574205", "2910521719", "2776928777", "2776941976", "2779910956", "2777403496", "2908614360", "2909333666", "2909520665", "2780687331", "58808276", "2909745607", "115220002", "148257392", "2778020220", "106165879", "2780114722", "2910394173", "2910354950", "2780118905", "138020889", "2779814899", "100521375", "37662734", "97483426", "25260931", "2909271431", "2908574822", "51672120", "164065428", "94919774", "2778080818", "38749836", "2781415353", "2909072068", "2909807485", "2775937386", "2777974810", "78161392", "2910479360", "138399698", "120562766", "185671874", "2776744078", "2776709667", "50919411", "65257409", "2781418482", "2777621572", "173837035", "2910092765", "37228920", "170959889", "2909553954", "98536072", "135300049", "21249469", "2777220787", "2780471819", "2779634897", "549605437", "42525527", "2775840374", "2909076198", "27254500", "122735190", "2908952963", "101434241", "130947863", "2910985744", "2776968180", "190638079", "2776020993", "149172842", "2780017871", "2775834976", "120912362", "2910220031", "192233252", "102744134", "107211472", "2779160288", "2910444920", "2909153497", "44283249", "540751848", "2910258602", "2910321295", "175965649", "2779704485", "79850504", "2777958824", "2778365744", "2778172956", "134864226", "4733338", "60635243", "195454712", "78945660", "2781445593", "2777481183", "2778600265", "148196450", "2910992960", "2910222286", "2777926330", "2779091883", "2910342552", "2776482837", "2780051701", "2781446643", "2780015948", "2777103877", "2778599437", "11804174", "2777264270", 
"2780170040", "2776903312", "175895763", "2779287493", "39257715", "2909270706", "63412515", "2909934379", "2776319399", "2779459076", "2776008721", "2910734704", "2780030891", "2776665880", "2777963300", "60544836", "2911159031", "2909039982", "64417066", "2777849792", "2910859984", "2776609134", "2781314072", "2909612347", "2779168029", "168402607", "2778793514", "2779652696", "117534273", "183688256", "2779316952", "2908972231", "2910200824", "2910994361", "2909856885", "2776809771", "2780535588", "131923401", "2910940979", "2910538944", "2777271545", "2776338311", "2776773308", "2781281389", "2778438103", "2781375701", "2778184291", "2776294106", "111280770", "2909571003", "2778274352", "2777714445", "148846489", "70295763", "2780608908", "147716585", "26355699", "2779218938", "2777647554", "2777250853", "2776317494", "2777469154", "169490005", "9111530", "2777584449", "101512455", "2776947765", "206625514", "45051096", "2777330291", "2781396848", "162947575", "176035894", "48306297", "125900194", "124219066", "2778175407", "2780684046", "2780643479", "2778532622", "152747807", "201437064", "53702515", "2780805685", "2776905826", "196467688", "2776266440", "174253337", "190930322", "202964095", "167887339", "11039648", "141983198", "87841596", "153876917", "2776922509", "2910336849", "2781289450", "2776077682", "2777060948", "80323366", "2779046117", "2779862049", "2779741023", "2776222705", "2780597670", "2777140777", "2779740938"], "SDG_3": ["2910288937", "25166345", "2909111439", "167908162", "2776888527", "2779363792", "198738867", "111684460", "133462117", "91632574", "73751289", "148449293", "2910560156", "2910427492", "2911021130", "2910860471", "30439317", "108074857", "63540848", "2909518570", "2778444522", "2775940519", "2779372377", "2779870758", "2779728303", "2779308462", "2777814067", "2781145028", "37098654"], "SDG_4": ["2780623789", "2909064599", "2910043827", "132758656", "2908885425", "146804397", "24845683", "58346731", "2910324923", 
"2780550299", "2910181414", "2779686014", "2908678694", "117893075", "74279204", "78015137", "2776675903", "59364581", "61189997", "2780852648", "172905872", "2910043429", "173481278", "2909755642"], "SDG_5": ["2776596443", "541189924", "2908821035", "2909253651", "2778307344", "2779621813", "2781437166"], "SDG_6": ["2779732133", "59269818", "78302586", "544153396", "39442485", "2781128188", "11999413", "108597893", "2781026758", "2778323849"], "SDG_7": ["2911104624", "2776122723", "2775918509", "2909376016", "148718273", "49848784", "2776581130", "138417311", "2776611462", "25915539", "141842801", "112505250", "2778835581", "2910577901", "2909450372", "2908874825", "136649699", "584957", "19766214", "2777382002", "33840335", "147441545", "2779941319", "74824818", "115957382", "2910439062", "2778321654", "2780339557", "2908683193", "199873434", "2780331013", "126172416", "2781309322", "118635694", "2910681606", "2775974325", "202446494", "2909542240", "131747538", "108615695", "2777708149", "118732332", "2909269005", "2780452421", "33134510", "2776909254", "97508593", "151771877", "193809577", "192668324", "2908591035", "2908610585", "123380192", "138171918", "183912175", "2778944020", "99611785", "169574100", "42067281", "166151169", "45872418", "197301865", "2910564024", "2910306918", "2781056475", "2778348927", "125171110", "195534400", "2776365744", "2777045768", "3839877", "14642617", "2778431730", "83160514", "2780611706", "2777071705", "85909142", "79675319", "53914812", "2778776584", "2778334255", "2777890241", "78246475", "47645306", "145460709", "159851900", "88743934", "169961344", "2781333068", "57177791", "2910445384", "2910822426", "2780839634", "2781030502", "2776588390", "63969886", "2780251136", "170133592", "123977732", "15569618", "3283095", "2780611830", "132868160", "2776917865", "505695854", "2779700286", "179036041", "127288500", "51926234", "2778927675", "59329165", "162168397", "106189395", "2778383842", "179733262", "2776892096", 
"26324664", "2908784896", "191186522", "2909205303", "19966478", "2780688951", "2908749873", "2778569793", "113740612", "81877898", "2776810965", "159795486", "2909187471", "2909741741", "14447218", "199364081", "121629672", "95930237", "148651041", "2777155145", "130207615", "17098449", "42812", "12701381", "54017597", "93552971", "35995877", "126789939", "2780942248", "31771446", "54932901", "201999631", "2776069950", "2780638000", "2909937733", "136155141", "49304495", "166194698", "93953391", "42021957", "108848220", "188116033", "120398109", "2777126586", "62467634", "58896106", "56985126", "21552470", "28472234", "161028810", "2778772182", "2911016986", "192299074", "37530146", "74222875", "2779110910", "17371274", "22116519", "53645450", "82979123", "2776228582", "46787917", "83204339", "107863493", "2777027713", "29621489", "188087704", "2779503484", "2778958889", "60439489", "2777373712", "112578098", "75003639", "2777328224", "151948712", "29652920", "173182743", "29310469", "145597803", "2777134600", "98943031", "154864947", "2780949067", "167310744", "162681261", "52121051", "101188967", "2908581237", "103753734", "2780778756", "2777742759", "2779117831"], "SDG_8": ["2776444593", "2909413384", "27591710", "2778896325", "2911011203", "2910222570", "2910326028", "2780775167", "2911164255"], "SDG_9": ["137099501", "2909452073", "2778097690", "2775925408", "529335014"], "SDG_10": ["102268210", "105152847", "169900460", "61641136", "2910665876", "16976872", "2780776881", "2778078003"], "SDG_11": ["2776902872", "2776673659", "2908674967", "24856439", "183283035", "2778839380", "2779323829", "100675267", "2776689096", "2776508417", "23221634", "2778977993", "2778920248", "2779661781", "75461684", "2777614519", "111603439", "49221354", "2855170", "151890184", "2781255199", "80309976", "2777161741", "2911048674", "2779627320", "29279314", "2775976938", "2909020933", "44263959", "2779636881", "121684219", "550222582", "120208923", "81302111", "2780302256", 
"2777131152", "2777111354", "53232910", "100368936", "171730128", "80583463", "5072461", "200046510", "81961946", "536930464", "2777488192", "6506403", "120938966", "2778132726", "126082660", "2777440324", "173870130", "18030348", "80368990", "54855816", "2776816662", "125470083", "2778717691", "110604844", "2775886207", "2910286708", "2911120092", "120352889", "122224866", "97250363", "2780940931", "23680986", "2777262768", "2780575044", "2778269189", "128226362", "201743585", "73340581", "7149132", "2778790543", "2776389138", "53160558", "2778497495", "2911224752", "2908570632", "2781461753", "2781099003", "81667532", "50415386", "120588126", "207821765", "2776974013", "2225880", "38439746", "550607084", "2781281093", "107157880", "172438305", "2917558", "39511330", "150506046", "107119854", "69423932", "37350624", "206019424", "2779652681", "2779529265", "2780015235", "2778414658", "50637493", "2780665216", "189360488", "2777328387", "10245270", "2779473934", "103648661", "16189245", "167752473", "11360483", "102792161", "132373408", "2779313563", "2777041775", "2780761308", "2779962852", "2779313700", "20756127", "2910820772", "2777877904", "2779720300", "68640439", "2777735972", "131979681", "2778330474", "2909681832", "2909395910", "2781293718", "21457203", "2908664457", "2908818157", "7856111", "2780066083", "2780423321", "2777988118", "2778821660", "193450905", "2776219102", "2776160632", "148699463", "2775873933", "2775937711", "2776408593", "197553423", "2779279276", "2776764004", "2779462066", "2780444441", "2777048483", "2779722824", "98200471", "14390630", "46737286", "2910792664", "85148207", "2910477109", "200749887", "2910310371", "2777346527", "2775932640", "2781257993", "2909546771", "2911038400", "72355985", "40350719", "2776485071", "168443057", "2780775721", "142442999", "43227947", "2777447984", "193759585", "2776432661", "194229684", "57341113", "2909072158", "52226264", "2909398177", "2777817495", "13743948", "107779570", "137990359"], 
"SDG_12": ["23138022", "2778253041", "2776596069", "2779338949", "193596192", "84859931", "2777091700", "2780596747", "2777247137", "20820323", "162853370", "72104268", "2776558947", "525650276", "2779738550", "169093310", "63257944", "36067731", "188468808", "148027575", "62232509", "120302269", "101230327"], "SDG_14": ["78275445", "79158427", "2776665970", "173656711", "19889080", "2780309369", "2908618603", "530175646", "2909005227", "29275276", "202824567", "2909048777", "79334102", "51865526", "45942800", "2777721721", "2779086188", "2777894483", "2780042314", "2780583818", "2910866688", "205649164", "16405173", "73525677", "150012506", "2776582039", "102315692", "8182607", "155484110", "204259536", "103500101", "2909697453", "2779919027", "2775922648", "200401390", "37202355", "191506330", "52146309", "171276312", "2780660560", "153440673", "2778206238", "2781330656", "166423231", "2778199754", "2910510794", "2353230", "190703929", "50311922", "58341921", "201490090", "185809878", "164120249", "51450119"], "SDG_15": ["2778049214"], "SDG_16": ["527821871", "2780786045", "33791563", "2781107206", "143425610", "2776982550", "157686319", "9201690", "15758519", "2778906372", "69828861", "104177525", "117353447", "22674136", "2776112939", "2909746666", "174943157", "189809214", "35550292", "105585729", "191393472", "2779254040", "100102862", "31829608", "106544461", "2779686019", "123045823", "43067198", "203165030", "2908548367", "47607710", "2780513914", "2909318246", "2909123673", "2910269103", "2779186577", "9514381", "2779847632", "201762086", "2776614250", "91760546", "120144228", "106737062", "2779566273", "202796686", "2777010668", "2909947951", "2911000069", "2909328758", "146870623", "85014361", "2780351192", "2780262311", "2779387731", "102375830", "2778804986", "65067816", "2781205572", "2908766468", "2779401785", "164172150", "2779359390", "166003498", "2776987467", "2778159086", "164663123", "2910200502", "2777810591", "171906077", "2909832105", 
"181149355", "33884865", "178489894", "84525096", "2910472664", "134174499", "2780320074", "2910273717", "2778166725", "2780270224", "2909434199", "2909423120", "2778484313", "2778654863", "89198739", "2781351580", "101959639", "971699", "4698774", "2776498708", "77019957", "94915269", "194072897", "82922719", "2781357168", "2779813694", "2776622343", "169796023", "2542834", "167225187", "104383817", "2779129001", "2777257180", "140006998", "74363100", "2778456923", "2910395371", "2779311591", "2908611806", "98940541", "204016326", "185429906", "2910001868", "2909498615", "110921888", "2910704000", "2909609750", "2780049918", "2909902876", "2909804582", "2781195161", "2909314849", "2778436418", "2909600298", "154238967", "2780721665", "76144217", "2778618615", "2199051", "2910471639", "2777490532", "2776459999", "116251930", "130731218", "44750222", "2911010606", "200797679", "164995936", "202292293", "168406668", "48295401", "56906370", "74556096", "2911193946", "2780732888", "140505726", "2909075684", "111498074", "2778605688", "2908834839", "63854197", "2910259063", "2777475166", "2779916870", "75114861", "188649462", "12780434", "68307924", "2781138619", "2775899829", "186835682", "2910173640", "2777240490", "72320291", "2778286736", "69258756", "2776833093", "105409693", "2776905153", "2781115736", "118867912", "2776942576", "33222762", "555379026", "2778290591", "2779881993", "37672646", "8397983", "2779270055", "2909874202", "556297831", "60008888", "33326189", "2778983686", "2779184870", "91435432", "177821555", "2781105336", "2779677046", "12365522", "112138406", "40046163", "138207750", "31901060", "2746353", "2778921735", "2910953355", "2776824162", "2776987546", "2779872411", "162571340", "200185824", "130684572", "21442874", "106289968", "137975842", "27426343", "124568556", "544833334", "2779608074", "200909587", "2778186200", "2909624168", "2908573047", "174127684", "2909263554", "2308441", "2778532584", "86037889", "174176344", "100203831", 
"2779395397", "196491621", "2777299998", "160776313", "2776911219", "61871575", "2778759178", "128805008", "144090359", "111964698", "44083865", "50091055", "144486260", "167275870", "2775928558", "151989614", "2779079919", "2779793503", "2909933650", "2780098792", "67174900", "2780052528", "2780967490", "2778898898", "2908793332", "2780005421", "110739175", "83516960", "2910491271", "14982408", "50747538", "2780358027", "152568617", "89136471", "184186437", "73649233", "84952885", "50776230", "147027905", "78299736", "134801348", "53076038", "2779714858", "2909071857", "2780656832", "2780542009", "137405303", "16759151", "203094294", "98893333", "123326733", "123583881", "18362487", "35788789", "183617614", "2779073994", "138569888", "2779792404", "2778517334", "2777008152", "183680338", "151120012", "2780342482", "115910719", "153692070", "134066672", "145097563", "35637245", "2776623338", "185822510", "2776983043", "2776486069", "155051475", "2780668467", "2777286522", "8643368", "2777657240", "2780003111", "190771501", "2781196315", "26834231", "2777482191", "196156399", "73712438", "71743495", "186293655", "2908556616", "67666897", "167693441", "175968658", "2910490378", "142944206", "158531012", "5395021", "22241219", "120892966", "191795016", "27826464", "103232671", "129724132", "75398719", "2778989422", "10929652", "154038757", "2779989747", "164226766", "1026927", "118248890", "144709373", "176258234", "2778908344", "83975546", "2908584300", "2777842450", "2780986262", "158154518", "2908935257", "2910048382", "27357055", "56281022", "2781330901", "2780164666", "2775892892", "2777200438", "2909309735", "107584723", "67226441", "45567728", "2777549818", "2779619698", "45737032", "2777646408", "2779751349", "114445506", "18396474", "13652956", "131275738", "2776711565", "2779280868", "150018143", "179302884", "2780723106", "148704626", "83616695", "45326173", "49289754", "114938261", "74370796", "145804949", "2779300802", "2910486168", "2911024786", 
"2776898426", "2909377819", "115314053", "196690852", "60136833", "2777671340", "45355965", "2777629068", "25566979", "149091818", "2776831955", "124219066", "167900197", "2779288016", "107027933", "2777611316", "2776904728", "2778764671", "2778579508", "146667757", "102213258", "2778216119", "203133693", "2780648150", "2776540713", "38635669", "13459763", "180727682", "37771279", "2776040635", "154800190", "2777480472", "2780968727", "2776831232", "206149592", "202775310", "147346212", "2780164529", "2778565663", "51945325", "22760457", "173836518", "2776889888", "165751822", "2776548393", "181169782", "151211776", "106030495", "2910110944", "2780507753", "182964821", "2778150766", "95713431", "8020162", "85946185", "127613066", "59241245", "97200028", "75773760", "94643802", "180932941", "124086997", "2910431462", "2779352166", "2779886121", "36914074"]}
--------------------------------------------------------------------------------
/raw_data/2_remove/20_FP7-4-SD_edited/20_process_remove_fos.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
# Build 20_RemoveFOS.json: a mapping from SDG label to the list of FOS ids
# (as strings) listed in bad_fos.csv.

# Keep a single row per (sdg, fos_id) pair so no id is listed twice per SDG.
df = pd.read_csv('bad_fos.csv').drop_duplicates(['sdg', 'fos_id'])

# Rows are unpacked positionally, so the CSV must have exactly three columns:
# the first is used as the SDG label, the second as the FOS id, and the third
# is ignored.
remove_fos = {}
for label, fos, _unused in df.values:
    remove_fos.setdefault(label, []).append(str(fos))

with open('20_RemoveFOS.json', 'w') as out:
    json.dump(remove_fos, out)
15 |
--------------------------------------------------------------------------------
/raw_data/2_remove/21_8_NABS_FOS/21_process_remove_fos.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
# Build 21_RemoveFOS.json: every distinct value of the spreadsheet's
# 'fos_number' column is listed (as a string) under each of SDG_1 .. SDG_17.
df = pd.read_excel('NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx')
not_relevant_fos = df['fos_number'].unique().tolist()

# Stringify the ids once; each SDG then gets its own copy of that list.
fos_as_text = [str(fos_id) for fos_id in not_relevant_fos]
remove_fos = {
    f'SDG_{sdg_nr}': list(fos_as_text)
    for sdg_nr in range(1, 18)
}

with open('21_RemoveFOS.json', 'w') as out:
    json.dump(remove_fos, out)
13 |
--------------------------------------------------------------------------------
/raw_data/2_remove/21_8_NABS_FOS/NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/21_8_NABS_FOS/NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx
--------------------------------------------------------------------------------
/raw_data/2_remove/22_TJL-24_review/22_RemoveFOS.json:
--------------------------------------------------------------------------------
1 | {"SDG_3": ["71924100", "86803240", "141071460", "142724271", "203014093", "1862650", "177713679", "159047783", "54355233", "159110408", "126838900", "98274493", "2779134260", "121608353", "160735492", "115260700", "526734887", "44228677", "199360897", "133462117", "2779281246", "510538283", "56318395", "205545832", "206836424", "100243477", "32546565", "2779139147", "23131810", "58874564", "56995899", "196697905", "186187911", "529173508", "544657597", "508106653", "2777566558", "505241676", "67649825", "147080431", "15952604", "192144188", "2776714187", "19351080", "551968917", "82789193", "2776780178", "536738050", "22467394", "142757262", "2776459890", "2780503075", "141379421", "42781572", "134659438", "150594956", "174618031", "17235551", "2776866176", "104819515", "147224300", "183469790", "2776321320", "111852164", "2777935641", "111998727", "106847996", "82381507", "2778372188", "65545243", "2776980637", "104122410", "2781460079", "169274487", "2776093513", "14185376", "2779402116", "158592959", "179179568", "2775968528", "181907467", "2780901251", "2776960227", "519991488", "2775933685", "37413474", "89008666", "2776933761", "2776056205", "2780771206", "62826618", "44221107", "70352696", "155911762", "2779466056", "57477423", "135448224", "68710425", "44403221", "147717901", "196777733", "11105738", "39424602", "2778617687", "50952357", "540938839", "2779138802", "141795571", "2776875633", "2781005124", "2781402376", "192039680", "141239990", "2780554537", "152236973", "151286553", "128717455", "2779809887", "10165471", "125938697", "205380661", "2779898584", "122881758", "2780312654", "2778605646", "193641492", "2779909984", "117009084", "116856471", "125198404", "163688568", "2780718992", "2781040256", "2779825147", "151054161", "2780762185", "78780964", "2780754355", "2779033964", "37752577", "20860254", "2779227060", "28225019", "2776970089", "185298936", "196467688", "108905452", "69357855", "118316555", "128644962", "148325268", "140764562", 
"199561411", "125450847", "2777365067", "2777413408", "25070020", "57805442", "72404758", "82740854", "162466561", "2779905828", "104335537", "199529486", "2776471321", "19163912", "123688308", "2776971686", "2777317252", "2781249807", "2778267616", "145417883", "26573533", "2777215511", "2780407094", "20129857", "101812284", "58916441", "2781044819", "187642187", "2776165558", "2776196091", "16895185", "2780944729", "5041914", "2779310008", "555789112", "81758059", "2781083359", "2779201015", "83100098", "197712280", "69505689", "2777913276", "82484044", "2776591724", "166936260", "2777780933", "91632574", "191364105", "2781426373", "40442364", "2776753347", "2778974597", "52173716", "2777299493", "2780263730", "2777226368", "2777289228", "112098571", "2780330291", "28722885", "34929307", "99762115", "20901353", "156983192", "2908586218", "150670458", "2777967479", "187696735", "94176051", "2908751799", "88606150", "2780822005", "115174607", "2910010793", "91790935", "2777072894", "2908926047", "127634017", "2776695260", "33010914", "73751289", "2780935168", "2909563789", "2778454149", "2778280450", "2778646529", "2908924136", "201033657", "108318186", "2780553527", "176672177", "2910782172", "172710988", "2777936119", "2909859419", "189812789", "36080966", "75458452", "2776451152", "2780931059", "2780391353", "2776955481", "2776395653", "2909397458", "2779974081", "2775908279", "179852193", "2781461121", "2777179688", "2781206205", "24493144", "2908803427", "135935922", "2777191628", "2779561794", "2780186313", "2778580320", "2778279454", "2777429807", "2779401766", "203731517", "2780757305", "2778004377", "2778070212", "2781333626", "105099762", "508295664", "39154926", "2777601251", "2776586755", "2777006632", "18986850", "2778248277", "2778936159", "44980441", "2779051267", "84792229", "2909767253", "14471711", "123741691", "185734982", "2778696743", "2909875802", "141388940", "2776346358", "2910755469", "2777076221", "1060249"], "SDG_9": ["127413603"], 
"SDG_11": ["15708023", "24890656", "2908647359", "166957645", "2549261", "198394728", "99454951", "149923435", "20625102", "129727815", "2780165032", "115901376", "536315585", "179454799", "2780781376", "202372285", "64413873", "43126263", "76775654", "2775896111", "163428354", "62908951", "22590252", "49876356", "2778073708", "105636585", "42045870", "116081451", "49999975", "2780743171", "556340858", "84250820", "108257041", "1813318", "176165272", "2776280689", "159032367", "2781145028", "130076159", "2778165684", "2781007418", "4590074", "2776928176", "141371185", "83854009", "2777068528", "2776870768", "2780278329", "2779436609", "78390623", "2775838644", "2777790407", "19994219", "2779356876", "2781316319", "57097009", "2781112155", "35647692", "201052633", "122173349", "86811826", "2780210451", "141321718", "2779711381", "2776676706", "2777617796", "2778842010", "2781119000", "2778205265", "26623033", "2909633619", "39014021", "144199811", "204431084"], "SDG_4": ["33923547", "509550671", "150394285", "103208741", "147077947", "55958113", "184356942", "108583219", "542530943", "2779018934", "2777189325", "2776526686", "134290984", "175801342", "154482161", "86637286", "522453465", "2780035574", "521786372", "507981020", "2777075199", "23588892", "138296749", "2781051278", "188116033", "2777603413", "197676734", "2776934989", "148324565", "2780732545", "165364887", "2778325511", "2776622967", "2779106483", "164403151", "2777244724", "8077954", "2779063172", "2779961193", "2909116566", "2781349114", "2909931160", "2777841733", "85597727", "2778197446", "204814006", "2910150694", "164449429"], "SDG_6": ["39432304", "201289731", "138921699", "87717796", "107826830", "41625074", "111368507", "91375879", "54286561", "523546767", "521259446", "2776256026", "2776053758", "2778357586", "522964758", "125907379", "108469399", "82576440", "130797344", "2779547435", "131046424", "188287460", "158836135", "499616599", "36574619", "143020374", "2781287369", "93983250", 
"52201283", "40241539", "2778572946", "16989226", "2776364969", "198428699", "15098985", "2778182573", "108797546", "108628306", "51832835", "547231352", "68359772", "153102810", "205537798", "130950616", "154261466", "2779282177", "2778148510"], "SDG_10": ["162324750", "119857082", "45355965", "2779119184", "121017731", "51067260", "557691694", "25810664", "166052673", "159176650", "163836022", "162725370", "91093795", "540791928", "2780535194", "109051061", "160333310", "161407221", "84945661", "177309310", "2776354556", "2779625216", "2779201187", "188116033", "191953296", "2776845425", "128963836", "116019233", "97713585", "2909801347", "2908766468", "2781313914", "2909025839", "4162061", "2909439219", "179709323", "2909492420", "100607858", "2779997400", "2776572088", "150432175", "2910289302", "2910645313"], "SDG_2": ["153911025", "77088390", "2779234561", "87976508", "150194340", "105639569", "2779483572", "118643609", "173145845", "549605437", "8673954", "26291073", "140413371", "84699730", "134215735", "30455989", "132964779", "183135511", "2780523633", "35158069", "155739000", "2776841711", "120009192", "2777782036", "2780246931", "2779764123", "123336316", "162889289", "203017698", "93066458", "2778452349", "2780106736", "137555145", "59582021", "54815482", "112939947", "47924181", "105152847", "2778944361", "134068817", "9927688", "2779501324", "91770344", "163588314", "2780238508", "177658893", "98722961", "2777129469", "165237769", "130693829", "2779485152", "2780174665", "8313540", "24144980", "2780907584", "2777858656", "2776111823", "2777617796", "2778733383", "2780745107", "2778359420", "2778200843", "2777416314", "2910375186", "2776672683", "2780871851", "157717039", "23837897", "2776176627", "107888415", "2777794352", "2778554304", "2775868463", "2779287364", "2779425982", "91447561", "199724614", "2778896754", "2909399481", "2910933275", "2909524676", "123244313", "2777953396", "33824837", "2909753820", "2779979797", "2910653396", "2778329027", 
"2911118914", "2781218492", "2908542670", "2909529903", "2909506248", "96105989", "2777438402", "181607587", "2775991992", "2910584990", "2776708618", "2909152114", "2778003962", "2776007641", "2910283248", "2779529612", "2779316989", "2911208417", "2909807485"], "SDG_13": ["18903297", "2780471494", "153294291", "204530211", "23795335", "537208039", "94061648", "521259446", "58874564", "2780805685", "2777822432", "112964050", "147534773", "123403432", "151406439", "127454912", "2778918656", "173651095", "106199856", "143299363", "136020623", "2775840915", "2779900269", "6964187", "20564796", "114148465", "38262639", "545622115", "128849468", "206145494", "25022447", "93785673", "2780211030", "2777605225"], "SDG_12": ["31972630", "206139338", "181199279", "2779851234", "28328180", "146778888", "52121051", "88182573", "201903717", "67203356", "108713360", "2776985865", "44877443", "32198211", "2776943663", "105306849", "2777612826", "45292766", "167740415", "69991583", "2781400479", "2779570065", "160565873", "182566", "199310239", "2779301550", "82753439", "502701156", "143020374", "2779726014", "49326732", "204217086", "2778035492", "15098985", "62960913", "189123395", "503285160", "117185709", "2780518120", "2776002898", "204983608", "2776908094", "2777566824", "92244383", "170828538", "173366509", "76893819", "183682340", "21338462", "2779167034", "2778734332", "41826821", "29140674", "47187476", "99578197", "2780848231", "54276265", "2779293432", "123703457", "196781063", "2779539549", "88959737", "169824061", "39177556", "28613373", "6907630", "104002121", "186673887", "127045886", "80646779", "171988757", "2777637287", "58640550", "7591567", "192045728", "94866938", "2780210451", "2723826", "164495641", "2775953691", "25796384", "2779299574", "2776936074", "2780569836", "2910251023", "2777121799", "116197896", "46312889", "150839157", "190362163", "201958364", "2909963963", "2778126675", "2775893736", "5035944", "2910127915", "30543370", "159821036", 
"2778804209"], "SDG_1": ["36289849", "549605437", "2781426361", "2781061807", "2778452349", "2778054917", "2776672683", "2775868463", "78302928", "2909852078"], "SDG_15": ["144027150", "176933379", "185933670", "78458016", "2776042228", "145097563", "130217890", "68189081", "153823671", "81860439", "126343540", "157021035", "56685638", "36727532", "51244244", "2777904157", "97854310", "2778157034", "16397148", "2778049214", "149207113", "202552767", "139669111", "23119410", "66782513", "55347375", "513535597", "43003075", "2779142801", "521815418", "162012527", "72286879", "173979980", "2779152076", "2778148510"], "SDG_8": ["187212893", "147583825", "158886217", "68189081", "176289848", "2778431023", "2779011557", "111226992", "175700187", "78597825", "506796395", "2776125615", "2779986911", "206713868", "2778556080", "93236110", "2780618658", "2778021871", "2908822358", "2779783368", "46312889", "2775893736", "2776498708", "34099160", "41708089", "2776880170", "49906088", "105578763", "2778381653", "2780836627", "2775876557"], "SDG_17": ["171250308", "56739046", "520434653", "191935318", "66204764", "2910001868", "134560507", "159317903", "2777953023", "530175646", "164767435", "47344431", "190960625", "2778300220", "186229450", "2777113093", "28718268", "87616379", "15845906", "2776060655", "138368954", "2776604539", "199491958", "44171179", "2779015535", "70455891", "2776553905", "99743013", "2777481183", "134632028", "2778711553", "552089266", "2778449271", "206103860", "2910910449", "2780575108", "2780903623", "129275984", "2776577793", "68307924", "2778459265", "2777836882", "160354207", "2781328080", "190539079", "158041659", "2780124536", "2909744077", "2776561884", "198891747", "2780479094"], "SDG_14": ["544153396", "159750122", "18918823", "119128265", "76177295", "88862950", "22070199", "197248824", "553184892", "115961737", "2776415932", "88160329", "514928085", "502230775", "152382732", "83419821", "2776023875", "72958200", "23531484", "2779429622", 
"85721925", "156380964", "68874143", "132543647", "46576788", "111874474", "2777403171", "72634772", "153279818", "49427245", "139369640", "192536144", "143517461", "509746633", "3641667", "82988372", "152613627", "150418976", "2779310246", "2777590139", "2779522410", "2908811810", "14918906", "2909168245", "2776538778", "2908904675", "2780756971", "2776265578", "2911123808", "39077098", "2909168288"], "SDG_7": ["501529594", "544956773", "108225325", "105923489", "90509273", "68801617", "107645774", "75684735", "20788544", "55037315", "151406439", "2777622855", "38677869", "31395832", "2780066083", "2778869765", "78244369", "2779019381", "155373166", "68476402", "509746633", "137851953", "137886200", "33039251", "2780936489", "2777466363", "200630231", "2776970089", "17648541", "80845027", "2778675665", "165998758", "2778330180", "135436540", "2781249646", "37965861", "2778539042", "37415627", "2779252636", "83227832", "86714428", "88417058", "2776058518", "2779200991", "126172416", "38940224", "188818383", "2781309322", "2777586272", "143559376", "9132272", "2776782565", "2779877863", "2776740001", "2779867701", "2779895041"], "SDG_16": ["104267543", "44249647", "171289174", "524765639", "509933004", "46295352", "2777367657", "74501621", "131046424", "112299071", "129603779", "64848388", "2778215748", "2775935494", "2779363069", "162466561", "538473155", "71156930", "710854", "2777963317", "41150092", "2778029865", "156460124", "2778724510", "2780919918", "2910956745", "207035908", "2775982628", "2776363604", "2909494222", "2776686254", "2779889875", "2778736898"], "SDG_5": ["70036468", "55447825", "102587632", "77352025", "46578552", "17632256", "2777973936", "162077342", "104151175", "2780233487", "119693030", "2777667586", "21279758", "37512671", "994546", "2777877159", "48057960", "122251271", "2779881493", "2777941463", "2776689383", "102003337", "119588120", "2778071103", "2777177043", "170806853", "2775906418", "2779865128", "2775850206", "140816417", 
"2780438625", "2775880612", "2776430950"]}
--------------------------------------------------------------------------------
/raw_data/2_remove/22_TJL-24_review/22_process_remove_fos.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | import re
4 |
# Build 22_RemoveFOS.json: a mapping from SDG label to the FOS ids (as
# strings) that the review spreadsheet marks for removal from that SDG.
df = pd.read_excel('osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx')

remove_fos = {}
for remove_cell, fos_id in df[['remove', 'fos_id']].values:
    # Each run of digits in the 'remove' cell is taken as one SDG number, so
    # a single row can remove the same FOS id from several SDGs.
    # NOTE(review): re.findall requires a string here; presumably every
    # 'remove' cell is text - confirm against the spreadsheet.
    for sdg_nr in re.findall(r'\d+', remove_cell):
        remove_fos.setdefault(f'SDG_{sdg_nr}', []).append(str(fos_id))

with open('22_RemoveFOS.json', 'w') as out:
    json.dump(remove_fos, out)
17 |
--------------------------------------------------------------------------------
/raw_data/2_remove/22_TJL-24_review/osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/22_TJL-24_review/osdg_fos_paper_citation_counts_REMOVE_v2_ed_VS.xlsx
--------------------------------------------------------------------------------
/raw_data/2_remove/23_Restructuring_review/23_RemoveFOS.json:
--------------------------------------------------------------------------------
1 | {"SDG_2": ["2777480484", "2994333706"], "SDG_15": ["192241223", "176943803", "113754120", "2994352824", "3020462461", "2992165118", "120806208", "64551749", "194187813", "64015301", "2777480484", "2778364563", "2983333560", "74250896", "109902934"], "SDG_11": ["2779548549", "64004221", "2776756561", "79420006", "2994396486", "2776489436", "19096712", "46585869", "2986229148", "2779286702", "2777152325", "2779028214", "2776902267", "76155785", "7131667", "18533594", "2909931525", "184386139", "2909614546", "111943024", "29760336", "2779725038", "2909978109", "5455396", "113145756", "7991579", "2776288101", "86532276", "49304495", "2776941537", "2908570603", "86085837", "2779220109", "2910447950", "45012715", "52069626", "171276312", "2781190202", "103060789", "46135064", "27157697", "3017795126", "2778289769", "2780512908", "2776323365", "74211669", "2780805606", "103189561", "3020114046", "2910432382", "2776576667", "2780273121", "34349720", "2777362114", "110069353", "71839028", "162044005", "2775889553", "96926464", "192126672"], "SDG_14": ["2908583363", "2910560996", "2911073633", "20992447", "2910628358", "1189109488"], "SDG_4": ["2777626052"], "SDG_16": ["133462117"], "SDG_3": ["2778853725", "2908832293", "2908520703", "2908999294"], "SDG_9": ["2779424974"], "SDG_7": ["151174772", "1034443"]}
--------------------------------------------------------------------------------
/raw_data/2_remove/23_Restructuring_review/23_process_remove_fos.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
4 |
# Build 23_RemoveFOS.json: a mapping from SDG label to the de-duplicated list
# of FOS ids (as strings) listed in the restructuring spreadsheet.
fname = '23_RemoveFOS.json'

df = pd.read_excel('sdg-fos_restructuring-v3_to-remove.xlsx')

# De-duplicate with an insertion-ordered dict instead of a set: iterating a
# set of strings yields a different order from one interpreter run to the
# next (string hash randomization), which reshuffled the generated JSON on
# every run. Dict keys keep first-seen order, so the output is reproducible
# and follows the spreadsheet's row order.
unique_fos = dict()
# Walk the two columns directly rather than via iterrows(), which builds a
# Series per row and can change a value's dtype along the way.
for sdg, fos_id in zip(df['sdg'], df['fos_id']):
    unique_fos.setdefault(sdg, dict())[str(fos_id)] = None

remove_fos = {sdg: list(foses) for sdg, foses in unique_fos.items()}

with open(fname, 'w') as file_:
    json.dump(remove_fos, file_)
23 |
24 |
25 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/raw_data/2_remove/23_Restructuring_review/sdg-fos_restructuring-v3_to-remove.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/23_Restructuring_review/sdg-fos_restructuring-v3_to-remove.xlsx
--------------------------------------------------------------------------------
/raw_data/2_remove/24_Review_2020-10-02/24_RemoveFOS.json:
--------------------------------------------------------------------------------
1 | {"SDG_2": ["38774213", "156086215", "126408429", "182124840", "33411773", "172817999", "141185391", "20529654", "42731165", "59804570", "116370137", "126589399", "2780528068", "15147509", "31568149", "2776562576", "24518262", "23519681", "53002841", "197320908", "33283694", "2780189059", "133382796", "2619416", "201401522", "2776107028", "2778625682", "107394435", "2780816530", "28631016", "108216600", "192392207", "2776500793", "2779004245", "97137747", "2775966360", "34070608", "2780086105", "64229544", "2777707638", "91354502", "2776285232", "2776554196", "147103442", "87621631", "119249163", "155987862", "2775841215", "2777106113", "155015343", "2776492830", "89295123", "63651461", "121850381", "198979508", "154702282", "154575652", "150436541", "152491559", "2775999090", "153427425", "2776596991", "123917164", "2777399377", "32120771", "93944068", "62158283", "555313981", "2779128174", "2780946806", "2777380357", "118694661", "60989497", "25382069", "2781208722", "2777472530", "54625482", "2780696901", "59898753", "39571515", "126914827", "173795300", "2776978901", "2776278397", "2777387638", "532801124", "139518226", "2778361644", "2776801807", "2777132354", "2776054349"]}
--------------------------------------------------------------------------------
/raw_data/2_remove/24_Review_2020-10-02/24_process_remove_fos.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 |
4 |
# Build 24_RemoveFOS.json: a mapping from SDG label to the FOS ids (as
# strings) listed in remove-review_2020-10-02.csv.
df = pd.read_csv('remove-review_2020-10-02.csv')

# Rows are unpacked positionally (CSV header: fos_id, fos_name, from_sdg);
# the middle column is not used.
remove_fos = {}
for fos, _name, sdg in df.values:
    remove_fos.setdefault(sdg, []).append(str(fos))

with open('24_RemoveFOS.json', 'w') as out:
    json.dump(remove_fos, out)
16 |
17 |
--------------------------------------------------------------------------------
/raw_data/2_remove/24_Review_2020-10-02/remove-review_2020-10-02.csv:
--------------------------------------------------------------------------------
1 | fos_id,fos_name,from_sdg
2 | 38774213,Soil fertility,SDG_2
3 | 156086215,Soil ecology,SDG_2
4 | 126408429,Soil health,SDG_2
5 | 182124840,Soil organic matter,SDG_2
6 | 33411773,Bulk soil,SDG_2
7 | 172817999,Soil functions,SDG_2
8 | 141185391,Soil retrogression and degradation,SDG_2
9 | 20529654,Topsoil,SDG_2
10 | 42731165,Environmental soil science,SDG_2
11 | 59804570,Soil governance,SDG_2
12 | 116370137,No-till farming,SDG_2
13 | 126589399,Umbrella species,SDG_2
14 | 2780528068,Plant nutrition,SDG_2
15 | 15147509,Conservation reliant species,SDG_2
16 | 31568149,Near-threatened species,SDG_2
17 | 2776562576,Plant strategies,SDG_2
18 | 24518262,Threatened species,SDG_2
19 | 23519681,Montane ecology,SDG_2
20 | 53002841,Plant community,SDG_2
21 | 197320908,Pioneer species,SDG_2
22 | 33283694,Deciduous,SDG_2
23 | 2780189059,Soil stabilization,SDG_2
24 | 133382796,Secondary forest,SDG_2
25 | 2619416,Rainforest,SDG_2
26 | 201401522,Plant cover,SDG_2
27 | 2776107028,Forest dynamics,SDG_2
28 | 2778625682,High forest,SDG_2
29 | 107394435,Quadrat,SDG_2
30 | 2780816530,Forest protection,SDG_2
31 | 28631016,Forest management,SDG_2
32 | 108216600,Tropical and subtropical dry broadleaf forests,SDG_2
33 | 192392207,Clearcutting,SDG_2
34 | 2776500793,Beech,SDG_2
35 | 2779004245,Tilth,SDG_2
36 | 97137747,Forestry,SDG_2
37 | 2775966360,Silviculture,SDG_2
38 | 34070608,Cover crop,SDG_2
39 | 2780086105,Forest product,SDG_2
40 | 64229544,Habitat destruction,SDG_2
41 | 2777707638,Vascular plant,SDG_2
42 | 91354502,Basal area,SDG_2
43 | 2776285232,Tropical forest,SDG_2
44 | 2776554196,Evergreen forest,SDG_2
45 | 147103442,Forest inventory,SDG_2
46 | 87621631,Taiga,SDG_2
47 | 119249163,Felling,SDG_2
48 | 155987862,Selection cutting,SDG_2
49 | 2775841215,Sustainable forest management,SDG_2
50 | 2777106113,Crop simulation model,SDG_2
51 | 155015343,Plant breeding,SDG_2
52 | 2776492830,Dipterocarpaceae,SDG_2
53 | 89295123,Forest pathology,SDG_2
54 | 63651461,Tropical agriculture,SDG_2
55 | 121850381,Certified wood,SDG_2
56 | 198979508,Forest fragmentation,SDG_2
57 | 154702282,Temperate deciduous forest,SDG_2
58 | 154575652,Reforestation,SDG_2
59 | 150436541,Forb,SDG_2
60 | 152491559,Macroecology,SDG_2
61 | 2775999090,Joint Forest Management,SDG_2
62 | 153427425,Biodiversity hotspot,SDG_2
63 | 2776596991,Cultural methods,SDG_2
64 | 123917164,Bumper crop,SDG_2
65 | 2777399377,DSSAT,SDG_2
66 | 32120771,Ecosystem engineer,SDG_2
67 | 93944068,Phytogeography,SDG_2
68 | 62158283,Species translocation,SDG_2
69 | 555313981,Tropical rainforest,SDG_2
70 | 2779128174,Scots pine,SDG_2
71 | 2780946806,Plant functional type,SDG_2
72 | 2777380357,Rainfed agriculture,SDG_2
73 | 118694661,Climax community,SDG_2
74 | 60989497,Red List Index,SDG_2
75 | 25382069,Seral community,SDG_2
76 | 2781208722,Intercropping,SDG_2
77 | 2777472530,Catch crop,SDG_2
78 | 54625482,Community forestry,SDG_2
79 | 2780696901,Conventional tillage,SDG_2
80 | 59898753,Shrubland,SDG_2
81 | 39571515,Undergrowth,SDG_2
82 | 126914827,Flagship species,SDG_2
83 | 173795300,Salvage logging,SDG_2
84 | 2776978901,Tree breeding,SDG_2
85 | 2776278397,Revegetation,SDG_2
86 | 2777387638,Forestry law,SDG_2
87 | 532801124,Crop protection,SDG_2
88 | 139518226,Sclerophyll,SDG_2
89 | 2778361644,Yield gap,SDG_2
90 | 2776801807,Pinus radiata,SDG_2
91 | 2777132354,Shelterwood cutting,SDG_2
92 | 2776054349,Vegetation classification,SDG_2
93 |
--------------------------------------------------------------------------------
/raw_data/2_remove/25_TOL-7_MostPopularSDG3FOS/25_RemoveFOS.json:
--------------------------------------------------------------------------------
1 | {"SDG_3": ["2777532764", "2909375031", "145642194", "2776556313", "2911127567", "106977388", "2908822358", "512399662", "2910036418", "509550671", "2780433410", "2780877353", "204787440", "86804380", "137992405", "2911023962", "545542383", "2780559412", "2780141013", "110894328", "188884661", "190960625", "502701156", "2777471088", "502991105", "22607594", "2777896191", "2779676829", "2776020993", "2779328685", "33623176", "2779671548", "2777161012", "2910661759", "2910448010", "2910661131", "2815619", "2780550299", "38858142", "2780541811", "14498672", "2777607137", "2779141489", "2909731318", "2779629443", "2910654967", "2776818590", "176656743", "178441611", "2781187916", "156312663", "2778369149", "2908819760", "2909715475", "165998758", "131138744", "2910237699", "2909160651", "2911013501", "140608501", "2780848588", "2777143679", "57177791", "2780646005", "2780589914", "111459926", "2908903645", "2779976542", "2778103839", "2777512617", "2910694641", "2779176400", "2780477921", "2781430560", "145798840", "121246419", "2911093041", "61620210", "2780542330", "161126747", "2777335584", "207006810", "2910950043", "31402265", "2780678043", "156168145", "2781332184", "2776370487", "2780812456", "40722700", "2910151648", "2778957590"]}
--------------------------------------------------------------------------------
/raw_data/2_remove/25_TOL-7_MostPopularSDG3FOS/25_process_remove_fos.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 |
4 |
# Build {sdg_label: [fos_id, ...]} from the three-column removal CSV and
# dump it as JSON. Ids are stored as strings; the middle column is unused.
df = pd.read_csv('TOL-7_MostPopularSDG3RemoveFOS.csv')

remove_fos = {}
for raw_id, _unused_name, sdg_label in df.values:
    # setdefault creates the per-SDG list on first sight of the label.
    remove_fos.setdefault(sdg_label, []).append(str(raw_id))

with open('25_RemoveFOS.json', 'w') as out_file:
    json.dump(remove_fos, out_file)
16 |
--------------------------------------------------------------------------------
/raw_data/2_remove/25_TOL-7_MostPopularSDG3FOS/TOL-7_MostPopularSDG3RemoveFOS.csv:
--------------------------------------------------------------------------------
1 | fos_id,fos_name,from_sdg
2 | 2777532764,Research center,SDG_3
3 | 2909375031,Drug Company,SDG_3
4 | 145642194,Health informatics,SDG_3
5 | 2776556313,Downtown,SDG_3
6 | 2911127567,Generic Product,SDG_3
7 | 106977388,Medical research,SDG_3
8 | 2908822358,Organizational Case Studies,SDG_3
9 | 512399662,Family medicine,SDG_3
10 | 2910036418,Patient care team,SDG_3
11 | 509550671,Medical education,SDG_3
12 | 2780433410,Digital health,SDG_3
13 | 2780877353,Health services research,SDG_3
14 | 204787440,Alternative medicine,SDG_3
15 | 86804380,Construction site safety,SDG_3
16 | 137992405,Health administration,SDG_3
17 | 2911023962,Combination Product,SDG_3
18 | 545542383,Medical emergency,SDG_3
19 | 2780559412,Good-morning,SDG_3
20 | 2780141013,Time-out,SDG_3
21 | 110894328,Biomedical technology,SDG_3
22 | 188884661,Active packaging,SDG_3
23 | 190960625,Water treatment,SDG_3
24 | 502701156,Biomedical sciences,SDG_3
25 | 2777471088,Patient advocacy,SDG_3
26 | 502991105,Clinical research,SDG_3
27 | 22607594,Enabling,SDG_3
28 | 2777896191,Patient experience,SDG_3
29 | 2779676829,Connected health,SDG_3
30 | 2776020993,Group work,SDG_3
31 | 2779328685,Patient safety,SDG_3
32 | 33623176,eMix,SDG_3
33 | 2779671548,Interurban,SDG_3
34 | 2777161012,Institutional research,SDG_3
35 | 2910661759,Treatment room,SDG_3
36 | 2910448010,Delivery location,SDG_3
37 | 2910661131,Training skills,SDG_3
38 | 2815619,Continuous training,SDG_3
39 | 2780550299,Job description,SDG_3
40 | 38858142,Aftertaste,SDG_3
41 | 2780541811,Quackery,SDG_3
42 | 14498672,Effective safety training,SDG_3
43 | 2777607137,Added sugar,SDG_3
44 | 2779141489,Group home,SDG_3
45 | 2909731318,Home deliveries,SDG_3
46 | 2779629443,Cross-training,SDG_3
47 | 2910654967,Fast foods,SDG_3
48 | 2776818590,Natural Product Research,SDG_3
49 | 176656743,Serving size,SDG_3
50 | 178441611,Training effect,SDG_3
51 | 2781187916,Day care,SDG_3
52 | 156312663,Steering committee,SDG_3
53 | 2778369149,Clinical data management,SDG_3
54 | 2908819760,Food selections,SDG_3
55 | 2909715475,What treatment,SDG_3
56 | 165998758,Imaging technology,SDG_3
57 | 131138744,Completed Staff Work,SDG_3
58 | 2910237699,Device Approval,SDG_3
59 | 2909160651,Delivery - action,SDG_3
60 | 2911013501,Delivery timing,SDG_3
61 | 140608501,Review article,SDG_3
62 | 2780848588,Power Balance,SDG_3
63 | 2777143679,Post and core,SDG_3
64 | 57177791,Imaging science,SDG_3
65 | 2780646005,Trafficability,SDG_3
66 | 2780589914,Ingredient,SDG_3
67 | 111459926,Walk-in,SDG_3
68 | 2908903645,Normal delivery,SDG_3
69 | 2779976542,Case report form,SDG_3
70 | 2778103839,Home management,SDG_3
71 | 2777512617,Staffing,SDG_3
72 | 2910694641,Patient name,SDG_3
73 | 2779176400,Medical food,SDG_3
74 | 2780477921,Chewiness,SDG_3
75 | 2781430560,Food pyramid,SDG_3
76 | 145798840,Process safety management,SDG_3
77 | 121246419,Unlicensed assistive personnel,SDG_3
78 | 2911093041,Her Disease,SDG_3
79 | 61620210,Flame-Sim,SDG_3
80 | 2780542330,Clinical data repository,SDG_3
81 | 161126747,Hot work,SDG_3
82 | 2777335584,N-group (finite group theory),SDG_3
83 | 207006810,Improved water source,SDG_3
84 | 2910950043,Reservoir bag,SDG_3
85 | 31402265,Potential space,SDG_3
86 | 2780678043,Group A,SDG_3
87 | 156168145,Passive fire protection,SDG_3
88 | 2781332184,Payment by Results,SDG_3
89 | 2776370487,Sitting,SDG_3
90 | 2780812456,Cooling down,SDG_3
91 | 40722700,Cluster of differentiation,SDG_3
92 | 2910151648,Negative Test Result,SDG_3
93 | 2778957590,CD19,SDG_3
94 |
--------------------------------------------------------------------------------
/raw_data/2_remove/RemovedFOS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/2_remove/RemovedFOS.xlsx
--------------------------------------------------------------------------------
/raw_data/3_blacklist/30_8_NABS_FOS/30_BlacklistFOS.csv:
--------------------------------------------------------------------------------
1 | fos_id,fos_name
2 | 2987034934,Earth crust
3 | 2776698055,Crust
4 | 79572550,Partial melting
5 | 2994012208,Upper crust
6 | 2780356177,Baltic Shield
7 | 56859440,Bouguer anomaly
8 | 2778471503,Basin and Range Province
9 | 2779422593,Soil crust
10 | 2779980370,Magma ocean
11 | 37523158,Hadean
12 | 3017803470,Crust formation
13 | 2780556036,South Pole–Aitken basin
14 | 549698073,Structure of the Earth
15 | 77928131,Tectonics
16 | 33556824,Hydrogeology
17 | 179158327,Palaeogeography
18 | 159719176,Engineering geology
19 | 2776797426,Biogeosciences
20 | 109281948,Stratigraphy
21 | 6363049,Volcanism
22 | 58097730,Subduction
23 | 119477230,Plate tectonics
24 | 2778261408,Eurasian Plate
25 | 2777994876,Pacific Plate
26 | 2777992645,North American Plate
27 | 2779867292,African Plate
28 | 2781207809,South American Plate
29 | 2776184289,Farallon Plate
30 | 180362636,Thrust tectonics
31 | 152972079,Plate reconstruction
32 | 7251660,Structural geology
33 | 50682988,Rift
34 | 201867031,Continental margin
35 | 141646446,Continental crust
36 | 136752280,Geodynamics
37 | 23295444,Shear zone
38 | 110041135,Thrust fault
39 | 97490223,Focal mechanism
40 | 166839181,Graben
41 | 44938399,Petrogenesis
42 | 150999391,Terrane
43 | 92596616,Lineament
44 | 128954607,Tectonophysics
45 | 23923706,Ophiolite
46 | 199007388,Diapir
47 | 16670881,Neotectonics
48 | 67236022,Mantle (geology)
49 | 16942324,Lithosphere
50 | 51151373,Mid-ocean ridge
51 | 84372278,Peridotite
52 | 11872896,Fractional crystallization (geology)
53 | 44938399,Petrogenesis
54 | 23923706,Ophiolite
55 | 122959257,Seismic tomography
56 | 42796848,Xenolith
57 | 167919410,Metasomatism
58 | 127723449,Core–mantle boundary
59 | 183282558,Kimberlite
60 | 2994012208,Upper crust
61 | 140441402,Carbonatite
62 | 23148476,Seismic anisotropy
63 | 93746451,Mineral redox buffer
64 | 2780356177,Baltic Shield
65 | 22512106,Shear wave splitting
66 | 195081551,Ultramafic rock
67 | 2993808335,Seismic velocity
68 | 2780942940,Stishovite
69 | 2778882853,Phlogopite
70 | 21441200,Mineral physics
71 | 83948199,Incompatible element
72 | 2779980370,Magma ocean
73 | 2993054622,Core formation
74 | 2781390083,Pinctada fucata
75 | 37523158,Hadean
76 | 140230471,Planetary core
77 | 160804572,Silicate perovskite
78 | 2776763651,Ferropericlase
79 | 154802760,Giant impact hypothesis
80 | 2777480983,USArray
81 | 114793014,Geomorphology
82 | 16674752,Mining engineering
83 | 97842125,Rock mechanics
84 | 5166401,Tailings
85 | 41242791,Rock mass classification
86 | 2779096232,Hydraulic fracturing
87 | 175181221,Prospecting
88 | 2777201227,Overburden
89 | 2984157484,Mining industry
90 | 7028197,Gangue
91 | 2776760134,Gold mining
92 | 94236395,Stoping
93 | 184977646,Open-pit mining
94 | 179974421,Rock bolt
95 | 93011207,Geotechnical investigation
96 | 113658590,Muck
97 | 2993323123,Rock slope
98 | 207469975,Shaft mining
99 | 58625266,Lode
100 | 186096623,Ground pressure
101 | 2992974802,Geological exploration
102 | 2779742380,Gabion
103 | 2993134977,Mineral deposit
104 | 102044607,Adit
105 | 2993252152,Rock body
106 | 2991922516,Rock pressure
107 | 2911210907,Support pressure
108 | 2994289516,Geological investigation
109 | 2993102984,Mine planning
110 | 2992547679,Land mine
111 | 2993437602,Rock structure
112 | 2780043312,Hydraulic fill
113 | 2993527706,Salt mine
114 | 2992990004,Solid rock
115 | 2992406196,Waste dump
116 | 26144545,Cut and fill
117 | 2909623323,Waste Dumps
118 | 2910921642,Mineral industries
119 | 127200247,Hydraulic mining
120 | 2992067306,Mineral potential
121 | 2778524612,Mining law
122 | 66511971,Mining geology
123 | 2781079927,Dimension stone
124 | 2778375701,Ground stone
125 | 2993492720,Gold production
126 | 2778839144,Medical geology
127 | 2992981300,Salt deposit
128 | 2909086881,Stone quarry
129 | 2779795913,Hurrying
130 | 2776629827,Panasqueira
131 | 2992407798,Iron mining
132 | 2992330363,Urban geology
133 | 2778143190,Minnesota Geological Survey
134 | 46517748,Drift mining
135 | 2777425756,Gold panning
136 | 193605714,Steam shovel
137 | 2779070535,Street gutter
138 | 46580973,Blackdamp
139 | 2781121916,Bow drill
140 | 2910477778,DUMP formation
141 | 2779880937,Whinstone
142 | 2910697619,Hearing analyzer
143 | 2911132530,Mine surveyor
144 | 8824402,Landslides vs. Rock strength
145 | 2910164855,Logging car
146 | 2909642594,Shaft (site)
147 | 2910514300,Root stones
148 | 42972112,Veterinary medicine
149 | 2776977481,Dairy cattle
150 | 2776482104,Breed
151 | 194775826,Herd
152 | 134215735,Flock
153 | 523966790,Animal welfare
154 | 2779620486,Tick
155 | 2780505807,Beef cattle
156 | 2779885849,Milking
157 | 173419221,Crossbreed
158 | 2776908094,Anthelmintic
159 | 2778877831,Cryptosporidium
160 | 2991862235,Animal health
161 | 2779557943,Canis
162 | 2908982167,Cattle Diseases
163 | 2780284631,Sire
164 | 2777499811,Ivermectin
165 | 66914385,Poultry farming
166 | 52991690,Culling
167 | 2779329348,Feedlot
168 | 2777976947,Eimeria
169 | 2781368420,Biosecurity
170 | 103797069,Domestic sheep reproduction
171 | 2776082042,Vulpes
172 | 2776222705,Wild boar
173 | 2777963300,Ovis
174 | 2777786777,Flea
175 | 2776247511,Zebu
176 | 2779552062,Roe deer
177 | 3018078696,Dwarf goats
178 | 2777146433,Badger
179 | 2776521926,Brahman
180 | 2780968714,Mange
181 | 2778226015,Capreolus
182 | 2779867394,Bubalus
183 | 2994537864,Human medicine
184 | 2777199308,Louse
185 | 2777474537,Pheasant
186 | 3017754109,Companion animal
187 | 2991667299,Capra hircus
188 | 2777114023,Withers
189 | 2778134537,Domestic pig
190 | 2780323295,Cervus
191 | 2778002360,Rump
192 | 2779914258,Nili-Ravi
193 | 2994460426,Pig farms
194 | 2910651670,Bird Diseases
195 | 2777151259,Mallophaga
196 | 2780487972,Veterinary parasitology
197 | 2909771501,Goat Diseases
198 | 3020113513,Small ruminant
199 | 2909031412,Gallus gallus domesticus
200 | 2993139054,Water buffalo
201 | 2776960312,Hock
202 | 2911060314,Laboratory Animal Science
203 | 2780727426,Awassi
204 | 3017937595,Dog owners
205 | 168568655,Medical entomology
206 | 2908605944,Meleagris gallopavo
207 | 2777222942,Corriedale
208 | 2778136425,Struthio
209 | 2777225262,Veterinary pathology
210 | 2780460740,Jackal
211 | 2909619495,Food animal
212 | 2909895380,Guinea fowl
213 | 2910990604,Lama glama
214 | 2778856526,Cow-calf
215 | 144027150,Horticulture
216 | 137580998,Crop
217 | 197321923,Cultivar
218 | 2777108408,Sugar
219 | 21410773,Shoot
220 | 100701293,Germination
221 | 88862950,Irrigation
222 | 168741863,Sowing
223 | 2776096895,Seedling
224 | 2780719635,Flavor
225 | 8868529,Taste
226 | 2780618852,Pollen
227 | 2776373379,Chlorophyll
228 | 150668497,Dry weight
229 | 88972607,Human fertilization
230 | 22508944,PEST analysis
231 | 38304854,Manure
232 | 104727253,Biological pest control
233 | 32198211,Greenhouse
234 | 2779678110,Fungus
235 | 161221295,Plant physiology
236 | 45292766,Bark
237 | 133479454,Mycelium
238 | 2780563676,Aroma
239 | 2982966219,Plant growth
240 | 172353545,Ripening
241 | 51417038,Phenology
242 | 168197293,Pollination
243 | 2777461220,Germplasm
244 | 540442320,Pest control
245 | 2780414537,Maple
246 | 115930662,Shelf life
247 | 2778157034,Sorghum
248 | 2780739461,Compost
249 | 2776632002,Legume
250 | 2779824472,Herb
251 | 49799701,Xylem
252 | 2776242653,Pepper
253 | 75639521,Field experiment
254 | 2993531722,Zea mays
255 | 36248471,Seeding
256 | 2778761015,Solanaceae
257 | 2776451879,Infestation
258 | 55969652,photoperiodism
259 | 2776747608,Brassica
260 | 155868670,Root system
261 | 2776474821,Mushroom
262 | 513193947,Fodder
263 | 2988529969,Cold storage
264 | 46328234,Organoleptic
265 | 2776286235,Phaseolus
266 | 2775976403,Aphid
267 | 74103781,Ornamental plant
268 | 157670687,Postharvest
269 | 2779197568,Sunflower
270 | 85582077,Paddy field
271 | 75296557,Husk
272 | 108010975,Pruning
273 | 137776501,Point of delivery
274 | 178165689,Inflorescence
275 | 83740816,Gibberellin
276 | 2993273313,Chemical control
277 | 185476388,Cotyledon
278 | 2776327621,Flesh
279 | 35496372,Phloem
280 | 2780054949,Spinach
281 | 53007507,Browning
282 | 43143990,Conidium
283 | 6557445,Agronomy
284 | 159750122,Soil water
285 | 2779371384,Biomass
286 | 142796444,Nutrient
287 | 137580998,Crop
288 | 510538283,Phosphorus
289 | 48743137,Organic matter
290 | 197321923,Cultivar
291 | 21410773,Shoot
292 | 88862950,Irrigation
293 | 168741863,Sowing
294 | 161176658,Pesticide
295 | 2780138947,Dry matter
296 | 101000010,Canopy
297 | 46757340,Poaceae
298 | 150668497,Dry weight
299 | 2779587293,Straw
300 | 128758860,Woody plant
301 | 2777612826,Insect
302 | 2777904157,Grazing
303 | 137660486,Growing season
304 | 50660011,Tropics
305 | 150772632,Arid
306 | 2779370140,Forage
307 | 48189365,Hybrid
308 | 81461190,Temperate climate
309 | 32198211,Greenhouse
310 | 2775891814,Weed
311 | 2775835988,Grassland
312 | 2779429622,Litter
313 | 53657456,Peat
314 | 2778053677,Pasture
315 | 2982966219,Plant growth
316 | 141282968,Plant ecology
317 | 132215390,Abiotic component
318 | 540442320,Pest control
319 | 2778157034,Sorghum
320 | 2780739461,Compost
321 | 2776632002,Legume
322 | 24461792,Perennial plant
323 | 75639521,Field experiment
324 | 118518473,Agriculture
325 | 549605437,Food security
326 | 128383755,Agricultural productivity
327 | 3987366,Livelihood
328 | 16397148,Tillage
329 | 139496715,Deforestation
330 | 2988676352,Rural development
331 | 502990516,Agricultural land
332 | 156005406,Subsistence agriculture
333 | 559400886,Land management
334 | 122690726,"Land use, land-use change and forestry"
335 | 85675897,Soil management
336 | 123963621,Integrated pest management
337 | 2779220025,Peasant
338 | 13558536,Cropping
339 | 137607661,Land tenure
340 | 2776475172,Soil quality
341 | 183135511,Natural resource management
342 | 157140304,Agrarian society
343 | 71762439,Arable land
344 | 175760724,Crop rotation
345 | 189797535,Drought tolerance
346 | 109162521,Soil conservation
347 | 2989409935,Crop production
348 | 54924851,Sustainable agriculture
349 | 61968832,Animal husbandry
350 | 118817206,Organic farming
351 | 2778852317,Agricultural policy
352 | 183889291,Crop residue
353 | 1670747,Agribusiness
354 | 202050865,Hectare
355 | 2992211155,Grain yield
356 | 2992730755,Agricultural development
357 | 51832835,Environmental management system
358 | 47136581,Agricultural machinery
359 | 112077630,Irrigation management
360 | 37923429,Intensive farming
361 | 120217122,Precision agriculture
362 | 113052830,Land degradation
363 | 17616946,Pastoralism
364 | 2777178263,Land reform
365 | 105462344,Nutrient management
366 | 57664001,Agroecosystem
367 | 2778452349,Rural poverty
368 | 192039558,Biofertilizer
369 | 2777481183,Market access
370 | 507981020,Agricultural education
371 | 207581243,Agrochemical
372 | 156663261,Agroecology
373 | 2993199473,Plant biochemistry
374 | 64476972,Sustainable Agriculture Innovation Network
375 | 112939947,Green Revolution
376 | 129225989,Cash crop
377 | 2778402112,Agricultural extension
378 | 2780117336,Farm income
379 | 2778691696,Dairy farming
380 | 2775898560,Common Agricultural Policy
381 | 141005173,Shifting cultivation
382 | 2993003885,Land area
383 | 27206212,Theology
384 | 4445939,Islam
385 | 17235551,Self
386 | 2775858120,Memoria
387 | 128361363,Symbol
388 | 74256435,Flood myth
389 | 2777617010,Mainstream
390 | 2778692574,Faith
391 | 150152722,Judaism
392 | 521751864,Christian ministry
393 | 161487207,Derecho
394 | 2776211767,Doctrine
395 | 182744844,Metaphysics
396 | 551968917,Christianity
397 | 2780415144,SAINT
398 | 2778738651,Novelty
399 | 2776050585,Scrutiny
400 | 2779103253,Duty
401 | 2778983918,Wife
402 | 18296254,Skepticism
403 | 133979268,Vision
404 | 143128703,Middle Ages
405 | 2777239683,Virtue
406 | 2780422510,Humanity
407 | 2777122596,Praxis
408 | 2780822299,Soul
409 | 75699723,Buddhism
410 | 102523778,Form of the Good
411 | 2778052875,Bildung
412 | 2780310893,Passion
413 | 2777438998,Tribunal
414 | 152212766,The Republic
415 | 10180917,Conscience
416 | 530479602,Opera
417 | 2777582232,CONTEST
418 | 2781354396,Enthusiasm
419 | 543192267,Magic (paranormal)
420 | 2779438500,Honor
421 | 2778182169,Jako
422 | 111021475,Protestantism
423 | 159789966,Lingua franca
424 | 50379869,Hermeneutics
425 | 173853756,Dialog box
426 | 164105321,Catalan
427 | 32506930,Hegelianism
428 | 129454956,Field research
429 | 2779829227,Vitality
430 | 2776932993,Ethos
431 | 2779728303,Pride
432 | 2780710533,Governo
433 | 169081014,Mysticism
434 | 113522999,Fall of man
435 | 9992130,Pessimism
436 | 2775944640,Utopia
437 | 130979935,Ansatz
438 | 2776684731,Garcia
439 | 91304198,Hebrew
440 | 194105502,Biblical studies
441 | 2777222677,Worship
442 | 2781384534,Gospel
443 | 2776527531,Persian
444 | 2776405206,Revelation
445 | 46610780,Hinduism
446 | 2776134716,Sacrifice
447 | 27362006,Gestalt psychology
448 | 2777477151,Prayer
449 | 2780580889,Panorama
450 | 128536511,History of religions
451 | 2779021329,Destiny
452 | 58348228,Auteur theory
453 | 2776305542,Problema
454 | 2776911728,Courage
455 | 2776727279,Heaven
456 | 92047909,Hyperbolic function
457 | 2781179785,Valencia
458 | 83559648,Croatian
459 | 2778896172,Manifesto
460 | 2781287369,Stuttgart
461 | 534701709,Old Testament
462 | 180903884,Rationalism
463 | 32772713,Charisma
464 | 2992637229,Michel foucault
465 | 2778802261,Orthodoxy
466 | 61783943,Luck
467 | 39511330,Logo
468 | 186857363,Siege
469 | 10869588,Church history
470 | 155785087,Natural law
471 | 2437467,Perfection
472 | 512654426,Public domain
473 | 73440236,Psyche
474 | 76960060,Umwelt
475 | 164663123,Cosmos
476 | 2776347870,Passions
477 | 9299846,Secularization
478 | 22029948,Dice
479 | 2777776507,Lexico
480 | 558299567,Mass media
481 | 155030161,Mass communication
482 | 48185193,Media
483 | 167275870,Media system dependency theory
484 | 55322685,Media conglomerate
485 | 2781343547,Media Practice Model
486 | 518677369,Social media
487 | 74216064,Social computing
488 | 101293273,User-generated content
489 | 503923677,Social web
490 | 156571341,Cyberpsychology
491 | 60136833,Social media optimization
492 | 2776892586,Brand engagement
493 | 2987376390,Electronic word of mouth
494 | 2776915394,Customer engagement
495 | 2985889538,Social media marketing
496 | 16759151,Online presence management
497 | 2778838397,Uses and gratifications theory
498 | 2778729106,Social media analytics
499 | 2987325470,Social commerce
500 | 2780564743,Social CRM
501 | 2992647939,Consumer engagement
502 | 2993555337,Personal branding
503 | 178408851,Content marketing
504 | 2985692548,Crisis informatics
505 | 2777835648,Filter bubble
506 | 2993426613,Arabic sentiment analysis
507 | 2780695499,Social media mining
508 | 2993240939,Online activism
509 | 2991870026,Social media network
510 | 2780997048,Digital footprint
511 | 2988327197,Online harassment
512 | 2777257828,Virtual archaeology
513 | 2988996608,Online engagement
514 | 3018846106,Fear of missing out
515 | 2988833398,Social event detection
516 | 2993172631,Social news
517 | 2778871292,Social television
518 | 2988622424,Social multimedia
519 | 2780441040,Slacktivism
520 | 196690852,Social analytics
521 | 2988338654,News sharing
522 | 2987800000,Crisis mapping
523 | 2779113645,Like button
524 | 2984648278,Ambient awareness
525 | 2777370179,Social media measurement
526 | 2986426982,Social stream
527 | 2989393167,Personal learning network
528 | 2984029112,Social mining
529 | 2993865493,Social data analytics
530 | 2778412320,iPhoneography
531 | 529147693,News media
532 | 167752473,News values
533 | 2011517,Broadcast journalism
534 | 42211076,Reliable Sources
535 | 201280247,Newspaper
536 | 167752473,News values
537 | 16189245,News bureau
538 | 2776585538,Agenda-setting theory
539 | 2776973623,Legal deposit
540 | 3019217387,Crime news
541 | 2779111255,History of journalism
542 | 2910250570,Newspapers as Topic
543 | 2779546711,Penny press
544 | 2776757517,Newspaper digitization
545 | 2910534252,Newsclipping
546 | 2780756850,News design
547 | 2779944825,Headlinese
548 | 81959379,Broadcasting
549 | 520681616,Digital television
550 | 9819579,Tuner
551 | 83529365,Broadcast communication network
552 | 58911810,Radio broadcasting
553 | 41062264,Digital broadcasting
554 | 943373,Atomic broadcast
555 | 77757571,Multimedia Broadcast Multicast Service
556 | 2989465874,Broadcast channels
557 | 110157686,Broadcasting (networking)
558 | 2779883265,Electronic program guide
559 | 556509198,Public broadcasting
560 | 2985624630,Broadcasting system
561 | 119452085,Commercial broadcasting
562 | 2780818791,Teletext
563 | 91285054,Broadcast television systems
564 | 2994381574,Broadcast system
565 | 2778749970,Conditional access
566 | 2779106878,Digital audio broadcasting
567 | 2987348774,Broadcast data
568 | 2994534981,Television channel
569 | 2780079832,Digital multimedia broadcasting
570 | 2994104004,Broadcast service
571 | 2776847985,Single-frequency network
572 | 2994466296,Radio program
573 | 183384803,Automatic dependent surveillance-broadcast
574 | 2011517,Broadcast journalism
575 | 2992306869,Satellite television
576 | 2779438827,Television station
577 | 2993676337,Broadcast transmission
578 | 55322685,Media conglomerate
579 | 2992481583,Broadcast time
580 | 2779213998,Mobile television
581 | 196227537,Broadcast transmitter
582 | 2987586235,Multimedia broadcasting
583 | 68163228,Radio Data System
584 | 2991920864,Community radio
585 | 2988547615,Speech summarization
586 | 2777695277,DVB-H
587 | 2982719622,Video broadcast
588 | 2984608069,Wireless broadcast
589 | 159505674,Broadcasting of sports events
590 | 2779461089,FM broadcasting
591 | 2780698354,Broadcast quality
592 | 2778985329,ISDB
593 | 2986444337,Near video on demand
594 | 2777514068,Broadcast band
595 | 2775997990,Narrowcasting
596 | 2987442367,Tv viewer
597 | 2985711970,Mobile broadcast
598 | 2991660179,Broadcast packet
599 | 2779742664,Broadcast address
600 | 2779056648,International broadcasting
601 | 2986347997,Wireless broadcasting
602 | 2987043902,Broadcasting algorithms
603 | 2779081413,AM stereo
604 | 2992202738,Mobile broadcasting
605 | 2987005673,Broadcast scheduling
606 | 2776313748,Broadcast range
607 | 2777452754,TV-Anytime
608 | 2781191505,DAB ensemble
609 | 2778553611,1seg
610 |
--------------------------------------------------------------------------------
/raw_data/3_blacklist/30_8_NABS_FOS/30_process_blacklist_fos.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
# Keep only the `fos_number` / `fos_name` columns of the workbook, expose
# `fos_number` under the name `fos_id`, and write the result as CSV.
workbook = pd.read_excel('NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx')
df = workbook[['fos_number', 'fos_name']].rename(columns={'fos_number': 'fos_id'})

df.to_csv('30_BlacklistFOS.csv', index=False)
7 |
--------------------------------------------------------------------------------
/raw_data/3_blacklist/30_8_NABS_FOS/NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechNote-ai/osdg/0dff6230f8442a85c4cd22462a38ba56d79c6c32/raw_data/3_blacklist/30_8_NABS_FOS/NABS_FOS_update_2020-08-20_NOT-RELEVANT__ed_VS.xlsx
--------------------------------------------------------------------------------
/raw_data/3_blacklist/AssembleBlacklist.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
# Column layout every per-source `*_BlacklistFOS.csv` file must have.
BLACKLIST_COLUMNS = ['fos_id', 'fos_name']


def _read_source_blacklist(root, directory):
    """Load the `*_BlacklistFOS.csv` file found in one source directory.

    Parameters:
        root: folder that contains the source directories.
        directory: name of the source directory (relative to `root`).

    Returns:
        The file's DataFrame with an added `source` column set to
        `directory`, or None (after printing the reason) when the directory
        has no such file or the file's columns are not BLACKLIST_COLUMNS.
    """
    source_path = os.path.join(root, directory)
    matches = [name for name in os.listdir(source_path) if '_BlacklistFOS.csv' in name]
    if not matches:
        # The original message lacked the f-prefix and printed "{directory}" literally.
        print(f'Sdg Terms are not processed in {directory}')
        return None

    df = pd.read_csv(os.path.join(source_path, matches[0]))
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if list(df.columns) != BLACKLIST_COLUMNS:
        print(f"In directory {directory}\n"
              f"*_BlacklistFOS.csv column names must be {BLACKLIST_COLUMNS}")
        return None

    df['source'] = directory
    return df


def assemble_blacklist(root='.'):
    """Merge every per-source blacklist under `root` into one DataFrame.

    Only entries of `root` that are directories and whose name contains no
    '.' are scanned; invalid or empty sources are reported and skipped.

    Returns:
        A DataFrame with columns `fos_id`, `fos_name`, `source`, free of
        exact duplicate rows and sorted by `fos_id` then `source`. It is
        empty (but has those columns) when no source yields any rows.
    """
    frames = []
    for directory in os.listdir(root):
        # The isdir test keeps extensionless files (e.g. LICENSE) from being
        # passed to os.listdir, which would raise NotADirectoryError.
        if '.' in directory or not os.path.isdir(os.path.join(root, directory)):
            continue
        df = _read_source_blacklist(root, directory)
        if df is not None:
            frames.append(df)

    if not frames:
        return pd.DataFrame(columns=BLACKLIST_COLUMNS + ['source'])

    # Concatenate once rather than growing a frame inside the loop.
    combined = pd.concat(frames, axis=0, ignore_index=True)
    # A source may list the same FOS several times; keep one row of each.
    combined = combined.drop_duplicates()
    return combined.sort_values(['fos_id', 'source']).reset_index(drop=True)


# Script behaviour: assemble from the current directory and write the result.
df_blacklist = assemble_blacklist()

df_blacklist.to_csv('Blacklist.csv', index=False)
25 |
--------------------------------------------------------------------------------
/raw_data/3_blacklist/Blacklist.csv:
--------------------------------------------------------------------------------
1 | fos_id,fos_name,source
2 | 943373,Atomic broadcast,30_8_NABS_FOS
3 | 1670747,Agribusiness,30_8_NABS_FOS
4 | 2011517,Broadcast journalism,30_8_NABS_FOS
5 | 2011517,Broadcast journalism,30_8_NABS_FOS
6 | 2437467,Perfection,30_8_NABS_FOS
7 | 3987366,Livelihood,30_8_NABS_FOS
8 | 4445939,Islam,30_8_NABS_FOS
9 | 5166401,Tailings,30_8_NABS_FOS
10 | 6363049,Volcanism,30_8_NABS_FOS
11 | 6557445,Agronomy,30_8_NABS_FOS
12 | 7028197,Gangue,30_8_NABS_FOS
13 | 7251660,Structural geology,30_8_NABS_FOS
14 | 8824402,Landslides vs. Rock strength,30_8_NABS_FOS
15 | 8868529,Taste,30_8_NABS_FOS
16 | 9299846,Secularization,30_8_NABS_FOS
17 | 9819579,Tuner,30_8_NABS_FOS
18 | 9992130,Pessimism,30_8_NABS_FOS
19 | 10180917,Conscience,30_8_NABS_FOS
20 | 10869588,Church history,30_8_NABS_FOS
21 | 11872896,Fractional crystallization (geology),30_8_NABS_FOS
22 | 13558536,Cropping,30_8_NABS_FOS
23 | 16189245,News bureau,30_8_NABS_FOS
24 | 16397148,Tillage,30_8_NABS_FOS
25 | 16670881,Neotectonics,30_8_NABS_FOS
26 | 16674752,Mining engineering,30_8_NABS_FOS
27 | 16759151,Online presence management,30_8_NABS_FOS
28 | 16942324,Lithosphere,30_8_NABS_FOS
29 | 17235551,Self,30_8_NABS_FOS
30 | 17616946,Pastoralism,30_8_NABS_FOS
31 | 18296254,Skepticism,30_8_NABS_FOS
32 | 21410773,Shoot,30_8_NABS_FOS
33 | 21410773,Shoot,30_8_NABS_FOS
34 | 21441200,Mineral physics,30_8_NABS_FOS
35 | 22029948,Dice,30_8_NABS_FOS
36 | 22508944,PEST analysis,30_8_NABS_FOS
37 | 22512106,Shear wave splitting,30_8_NABS_FOS
38 | 23148476,Seismic anisotropy,30_8_NABS_FOS
39 | 23295444,Shear zone,30_8_NABS_FOS
40 | 23923706,Ophiolite,30_8_NABS_FOS
41 | 23923706,Ophiolite,30_8_NABS_FOS
42 | 24461792,Perennial plant,30_8_NABS_FOS
43 | 26144545,Cut and fill,30_8_NABS_FOS
44 | 27206212,Theology,30_8_NABS_FOS
45 | 27362006,Gestalt psychology,30_8_NABS_FOS
46 | 32198211,Greenhouse,30_8_NABS_FOS
47 | 32198211,Greenhouse,30_8_NABS_FOS
48 | 32506930,Hegelianism,30_8_NABS_FOS
49 | 32772713,Charisma,30_8_NABS_FOS
50 | 33556824,Hydrogeology,30_8_NABS_FOS
51 | 35496372,Phloem,30_8_NABS_FOS
52 | 36248471,Seeding,30_8_NABS_FOS
53 | 37523158,Hadean,30_8_NABS_FOS
54 | 37523158,Hadean,30_8_NABS_FOS
55 | 37923429,Intensive farming,30_8_NABS_FOS
56 | 38304854,Manure,30_8_NABS_FOS
57 | 39511330,Logo,30_8_NABS_FOS
58 | 41062264,Digital broadcasting,30_8_NABS_FOS
59 | 41242791,Rock mass classification,30_8_NABS_FOS
60 | 42211076,Reliable Sources,30_8_NABS_FOS
61 | 42796848,Xenolith,30_8_NABS_FOS
62 | 42972112,Veterinary medicine,30_8_NABS_FOS
63 | 43143990,Conidium,30_8_NABS_FOS
64 | 44938399,Petrogenesis,30_8_NABS_FOS
65 | 44938399,Petrogenesis,30_8_NABS_FOS
66 | 45292766,Bark,30_8_NABS_FOS
67 | 46328234,Organoleptic,30_8_NABS_FOS
68 | 46517748,Drift mining,30_8_NABS_FOS
69 | 46580973,Blackdamp,30_8_NABS_FOS
70 | 46610780,Hinduism,30_8_NABS_FOS
71 | 46757340,Poaceae,30_8_NABS_FOS
72 | 47136581,Agricultural machinery,30_8_NABS_FOS
73 | 48185193,Media,30_8_NABS_FOS
74 | 48189365,Hybrid,30_8_NABS_FOS
75 | 48743137,Organic matter,30_8_NABS_FOS
76 | 49799701,Xylem,30_8_NABS_FOS
77 | 50379869,Hermeneutics,30_8_NABS_FOS
78 | 50660011,Tropics,30_8_NABS_FOS
79 | 50682988,Rift,30_8_NABS_FOS
80 | 51151373,Mid-ocean ridge,30_8_NABS_FOS
81 | 51417038,Phenology,30_8_NABS_FOS
82 | 51832835,Environmental management system,30_8_NABS_FOS
83 | 52991690,Culling,30_8_NABS_FOS
84 | 53007507,Browning,30_8_NABS_FOS
85 | 53657456,Peat,30_8_NABS_FOS
86 | 54924851,Sustainable agriculture,30_8_NABS_FOS
87 | 55322685,Media conglomerate,30_8_NABS_FOS
88 | 55322685,Media conglomerate,30_8_NABS_FOS
89 | 55969652,photoperiodism,30_8_NABS_FOS
90 | 56859440,Bouguer anomaly,30_8_NABS_FOS
91 | 57664001,Agroecosystem,30_8_NABS_FOS
92 | 58097730,Subduction,30_8_NABS_FOS
93 | 58348228,Auteur theory,30_8_NABS_FOS
94 | 58625266,Lode,30_8_NABS_FOS
95 | 58911810,Radio broadcasting,30_8_NABS_FOS
96 | 60136833,Social media optimization,30_8_NABS_FOS
97 | 61783943,Luck,30_8_NABS_FOS
98 | 61968832,Animal husbandry,30_8_NABS_FOS
99 | 64476972,Sustainable Agriculture Innovation Network,30_8_NABS_FOS
100 | 66511971,Mining geology,30_8_NABS_FOS
101 | 66914385,Poultry farming,30_8_NABS_FOS
102 | 67236022,Mantle (geology),30_8_NABS_FOS
103 | 68163228,Radio Data System,30_8_NABS_FOS
104 | 71762439,Arable land,30_8_NABS_FOS
105 | 73440236,Psyche,30_8_NABS_FOS
106 | 74103781,Ornamental plant,30_8_NABS_FOS
107 | 74216064,Social computing,30_8_NABS_FOS
108 | 74256435,Flood myth,30_8_NABS_FOS
109 | 75296557,Husk,30_8_NABS_FOS
110 | 75639521,Field experiment,30_8_NABS_FOS
111 | 75639521,Field experiment,30_8_NABS_FOS
112 | 75699723,Buddhism,30_8_NABS_FOS
113 | 76960060,Umwelt,30_8_NABS_FOS
114 | 77757571,Multimedia Broadcast Multicast Service,30_8_NABS_FOS
115 | 77928131,Tectonics,30_8_NABS_FOS
116 | 79572550,Partial melting,30_8_NABS_FOS
117 | 81461190,Temperate climate,30_8_NABS_FOS
118 | 81959379,Broadcasting,30_8_NABS_FOS
119 | 83529365,Broadcast communication network,30_8_NABS_FOS
120 | 83559648,Croatian,30_8_NABS_FOS
121 | 83740816,Gibberellin,30_8_NABS_FOS
122 | 83948199,Incompatible element,30_8_NABS_FOS
123 | 84372278,Peridotite,30_8_NABS_FOS
124 | 85582077,Paddy field,30_8_NABS_FOS
125 | 85675897,Soil management,30_8_NABS_FOS
126 | 88862950,Irrigation,30_8_NABS_FOS
127 | 88862950,Irrigation,30_8_NABS_FOS
128 | 88972607,Human fertilization,30_8_NABS_FOS
129 | 91285054,Broadcast television systems,30_8_NABS_FOS
130 | 91304198,Hebrew,30_8_NABS_FOS
131 | 92047909,Hyperbolic function,30_8_NABS_FOS
132 | 92596616,Lineament,30_8_NABS_FOS
133 | 93011207,Geotechnical investigation,30_8_NABS_FOS
134 | 93746451,Mineral redox buffer,30_8_NABS_FOS
135 | 94236395,Stoping,30_8_NABS_FOS
136 | 97490223,Focal mechanism,30_8_NABS_FOS
137 | 97842125,Rock mechanics,30_8_NABS_FOS
138 | 100701293,Germination,30_8_NABS_FOS
139 | 101000010,Canopy,30_8_NABS_FOS
140 | 101293273,User-generated content,30_8_NABS_FOS
141 | 102044607,Adit,30_8_NABS_FOS
142 | 102523778,Form of the Good,30_8_NABS_FOS
143 | 103797069,Domestic sheep reproduction,30_8_NABS_FOS
144 | 104727253,Biological pest control,30_8_NABS_FOS
145 | 105462344,Nutrient management,30_8_NABS_FOS
146 | 108010975,Pruning,30_8_NABS_FOS
147 | 109162521,Soil conservation,30_8_NABS_FOS
148 | 109281948,Stratigraphy,30_8_NABS_FOS
149 | 110041135,Thrust fault,30_8_NABS_FOS
150 | 110157686,Broadcasting (networking),30_8_NABS_FOS
151 | 111021475,Protestantism,30_8_NABS_FOS
152 | 112077630,Irrigation management,30_8_NABS_FOS
153 | 112939947,Green Revolution,30_8_NABS_FOS
154 | 113052830,Land degradation,30_8_NABS_FOS
155 | 113522999,Fall of man,30_8_NABS_FOS
156 | 113658590,Muck,30_8_NABS_FOS
157 | 114793014,Geomorphology,30_8_NABS_FOS
158 | 115930662,Shelf life,30_8_NABS_FOS
159 | 118518473,Agriculture,30_8_NABS_FOS
160 | 118817206,Organic farming,30_8_NABS_FOS
161 | 119452085,Commercial broadcasting,30_8_NABS_FOS
162 | 119477230,Plate tectonics,30_8_NABS_FOS
163 | 120217122,Precision agriculture,30_8_NABS_FOS
164 | 122690726,"Land use, land-use change and forestry",30_8_NABS_FOS
165 | 122959257,Seismic tomography,30_8_NABS_FOS
166 | 123963621,Integrated pest management,30_8_NABS_FOS
167 | 127200247,Hydraulic mining,30_8_NABS_FOS
168 | 127723449,Core–mantle boundary,30_8_NABS_FOS
169 | 128361363,Symbol,30_8_NABS_FOS
170 | 128383755,Agricultural productivity,30_8_NABS_FOS
171 | 128536511,History of religions,30_8_NABS_FOS
172 | 128758860,Woody plant,30_8_NABS_FOS
173 | 128954607,Tectonophysics,30_8_NABS_FOS
174 | 129225989,Cash crop,30_8_NABS_FOS
175 | 129454956,Field research,30_8_NABS_FOS
176 | 130979935,Ansatz,30_8_NABS_FOS
177 | 132215390,Abiotic component,30_8_NABS_FOS
178 | 133479454,Mycelium,30_8_NABS_FOS
179 | 133979268,Vision,30_8_NABS_FOS
180 | 134215735,Flock,30_8_NABS_FOS
181 | 136752280,Geodynamics,30_8_NABS_FOS
182 | 137580998,Crop,30_8_NABS_FOS
183 | 137580998,Crop,30_8_NABS_FOS
184 | 137607661,Land tenure,30_8_NABS_FOS
185 | 137660486,Growing season,30_8_NABS_FOS
186 | 137776501,Point of delivery,30_8_NABS_FOS
187 | 139496715,Deforestation,30_8_NABS_FOS
188 | 140230471,Planetary core,30_8_NABS_FOS
189 | 140441402,Carbonatite,30_8_NABS_FOS
190 | 141005173,Shifting cultivation,30_8_NABS_FOS
191 | 141282968,Plant ecology,30_8_NABS_FOS
192 | 141646446,Continental crust,30_8_NABS_FOS
193 | 142796444,Nutrient,30_8_NABS_FOS
194 | 143128703,Middle Ages,30_8_NABS_FOS
195 | 144027150,Horticulture,30_8_NABS_FOS
196 | 150152722,Judaism,30_8_NABS_FOS
197 | 150668497,Dry weight,30_8_NABS_FOS
198 | 150668497,Dry weight,30_8_NABS_FOS
199 | 150772632,Arid,30_8_NABS_FOS
200 | 150999391,Terrane,30_8_NABS_FOS
201 | 152212766,The Republic,30_8_NABS_FOS
202 | 152972079,Plate reconstruction,30_8_NABS_FOS
203 | 154802760,Giant impact hypothesis,30_8_NABS_FOS
204 | 155030161,Mass communication,30_8_NABS_FOS
205 | 155785087,Natural law,30_8_NABS_FOS
206 | 155868670,Root system,30_8_NABS_FOS
207 | 156005406,Subsistence agriculture,30_8_NABS_FOS
208 | 156571341,Cyberpsychology,30_8_NABS_FOS
209 | 156663261,Agroecology,30_8_NABS_FOS
210 | 157140304,Agrarian society,30_8_NABS_FOS
211 | 157670687,Postharvest,30_8_NABS_FOS
212 | 159505674,Broadcasting of sports events,30_8_NABS_FOS
213 | 159719176,Engineering geology,30_8_NABS_FOS
214 | 159750122,Soil water,30_8_NABS_FOS
215 | 159789966,Lingua franca,30_8_NABS_FOS
216 | 160804572,Silicate perovskite,30_8_NABS_FOS
217 | 161176658,Pesticide,30_8_NABS_FOS
218 | 161221295,Plant physiology,30_8_NABS_FOS
219 | 161487207,Derecho,30_8_NABS_FOS
220 | 164105321,Catalan,30_8_NABS_FOS
221 | 164663123,Cosmos,30_8_NABS_FOS
222 | 166839181,Graben,30_8_NABS_FOS
223 | 167275870,Media system dependency theory,30_8_NABS_FOS
224 | 167752473,News values,30_8_NABS_FOS
225 | 167752473,News values,30_8_NABS_FOS
226 | 167919410,Metasomatism,30_8_NABS_FOS
227 | 168197293,Pollination,30_8_NABS_FOS
228 | 168568655,Medical entomology,30_8_NABS_FOS
229 | 168741863,Sowing,30_8_NABS_FOS
230 | 168741863,Sowing,30_8_NABS_FOS
231 | 169081014,Mysticism,30_8_NABS_FOS
232 | 172353545,Ripening,30_8_NABS_FOS
233 | 173419221,Crossbreed,30_8_NABS_FOS
234 | 173853756,Dialog box,30_8_NABS_FOS
235 | 175181221,Prospecting,30_8_NABS_FOS
236 | 175760724,Crop rotation,30_8_NABS_FOS
237 | 178165689,Inflorescence,30_8_NABS_FOS
238 | 178408851,Content marketing,30_8_NABS_FOS
239 | 179158327,Palaeogeography,30_8_NABS_FOS
240 | 179974421,Rock bolt,30_8_NABS_FOS
241 | 180362636,Thrust tectonics,30_8_NABS_FOS
242 | 180903884,Rationalism,30_8_NABS_FOS
243 | 182744844,Metaphysics,30_8_NABS_FOS
244 | 183135511,Natural resource management,30_8_NABS_FOS
245 | 183282558,Kimberlite,30_8_NABS_FOS
246 | 183384803,Automatic dependent surveillance-broadcast,30_8_NABS_FOS
247 | 183889291,Crop residue,30_8_NABS_FOS
248 | 184977646,Open-pit mining,30_8_NABS_FOS
249 | 185476388,Cotyledon,30_8_NABS_FOS
250 | 186096623,Ground pressure,30_8_NABS_FOS
251 | 186857363,Siege,30_8_NABS_FOS
252 | 189797535,Drought tolerance,30_8_NABS_FOS
253 | 192039558,Biofertilizer,30_8_NABS_FOS
254 | 193605714,Steam shovel,30_8_NABS_FOS
255 | 194105502,Biblical studies,30_8_NABS_FOS
256 | 194775826,Herd,30_8_NABS_FOS
257 | 195081551,Ultramafic rock,30_8_NABS_FOS
258 | 196227537,Broadcast transmitter,30_8_NABS_FOS
259 | 196690852,Social analytics,30_8_NABS_FOS
260 | 197321923,Cultivar,30_8_NABS_FOS
261 | 197321923,Cultivar,30_8_NABS_FOS
262 | 199007388,Diapir,30_8_NABS_FOS
263 | 201280247,Newspaper,30_8_NABS_FOS
264 | 201867031,Continental margin,30_8_NABS_FOS
265 | 202050865,Hectare,30_8_NABS_FOS
266 | 207469975,Shaft mining,30_8_NABS_FOS
267 | 207581243,Agrochemical,30_8_NABS_FOS
268 | 502990516,Agricultural land,30_8_NABS_FOS
269 | 503923677,Social web,30_8_NABS_FOS
270 | 507981020,Agricultural education,30_8_NABS_FOS
271 | 510538283,Phosphorus,30_8_NABS_FOS
272 | 512654426,Public domain,30_8_NABS_FOS
273 | 513193947,Fodder,30_8_NABS_FOS
274 | 518677369,Social media,30_8_NABS_FOS
275 | 520681616,Digital television,30_8_NABS_FOS
276 | 521751864,Christian ministry,30_8_NABS_FOS
277 | 523966790,Animal welfare,30_8_NABS_FOS
278 | 529147693,News media,30_8_NABS_FOS
279 | 530479602,Opera,30_8_NABS_FOS
280 | 534701709,Old Testament,30_8_NABS_FOS
281 | 540442320,Pest control,30_8_NABS_FOS
282 | 540442320,Pest control,30_8_NABS_FOS
283 | 543192267,Magic (paranormal),30_8_NABS_FOS
284 | 549605437,Food security,30_8_NABS_FOS
285 | 549698073,Structure of the Earth,30_8_NABS_FOS
286 | 551968917,Christianity,30_8_NABS_FOS
287 | 556509198,Public broadcasting,30_8_NABS_FOS
288 | 558299567,Mass media,30_8_NABS_FOS
289 | 559400886,Land management,30_8_NABS_FOS
290 | 2775835988,Grassland,30_8_NABS_FOS
291 | 2775858120,Memoria,30_8_NABS_FOS
292 | 2775891814,Weed,30_8_NABS_FOS
293 | 2775898560,Common Agricultural Policy,30_8_NABS_FOS
294 | 2775944640,Utopia,30_8_NABS_FOS
295 | 2775976403,Aphid,30_8_NABS_FOS
296 | 2775997990,Narrowcasting,30_8_NABS_FOS
297 | 2776050585,Scrutiny,30_8_NABS_FOS
298 | 2776082042,Vulpes,30_8_NABS_FOS
299 | 2776096895,Seedling,30_8_NABS_FOS
300 | 2776134716,Sacrifice,30_8_NABS_FOS
301 | 2776184289,Farallon Plate,30_8_NABS_FOS
302 | 2776211767,Doctrine,30_8_NABS_FOS
303 | 2776222705,Wild boar,30_8_NABS_FOS
304 | 2776242653,Pepper,30_8_NABS_FOS
305 | 2776247511,Zebu,30_8_NABS_FOS
306 | 2776286235,Phaseolus,30_8_NABS_FOS
307 | 2776305542,Problema,30_8_NABS_FOS
308 | 2776313748,Broadcast range,30_8_NABS_FOS
309 | 2776327621,Flesh,30_8_NABS_FOS
310 | 2776347870,Passions,30_8_NABS_FOS
311 | 2776373379,Chlorophyll,30_8_NABS_FOS
312 | 2776405206,Revelation,30_8_NABS_FOS
313 | 2776451879,Infestation,30_8_NABS_FOS
314 | 2776474821,Mushroom,30_8_NABS_FOS
315 | 2776475172,Soil quality,30_8_NABS_FOS
316 | 2776482104,Breed,30_8_NABS_FOS
317 | 2776521926,Brahman,30_8_NABS_FOS
318 | 2776527531,Persian,30_8_NABS_FOS
319 | 2776585538,Agenda-setting theory,30_8_NABS_FOS
320 | 2776629827,Panasqueira,30_8_NABS_FOS
321 | 2776632002,Legume,30_8_NABS_FOS
322 | 2776632002,Legume,30_8_NABS_FOS
323 | 2776684731,Garcia,30_8_NABS_FOS
324 | 2776698055,Crust,30_8_NABS_FOS
325 | 2776727279,Heaven,30_8_NABS_FOS
326 | 2776747608,Brassica,30_8_NABS_FOS
327 | 2776757517,Newspaper digitization,30_8_NABS_FOS
328 | 2776760134,Gold mining,30_8_NABS_FOS
329 | 2776763651,Ferropericlase,30_8_NABS_FOS
330 | 2776797426,Biogeosciences,30_8_NABS_FOS
331 | 2776847985,Single-frequency network,30_8_NABS_FOS
332 | 2776892586,Brand engagement,30_8_NABS_FOS
333 | 2776908094,Anthelmintic,30_8_NABS_FOS
334 | 2776911728,Courage,30_8_NABS_FOS
335 | 2776915394,Customer engagement,30_8_NABS_FOS
336 | 2776932993,Ethos,30_8_NABS_FOS
337 | 2776960312,Hock,30_8_NABS_FOS
338 | 2776973623,Legal deposit,30_8_NABS_FOS
339 | 2776977481,Dairy cattle,30_8_NABS_FOS
340 | 2777108408,Sugar,30_8_NABS_FOS
341 | 2777114023,Withers,30_8_NABS_FOS
342 | 2777122596,Praxis,30_8_NABS_FOS
343 | 2777146433,Badger,30_8_NABS_FOS
344 | 2777151259,Mallophaga,30_8_NABS_FOS
345 | 2777178263,Land reform,30_8_NABS_FOS
346 | 2777199308,Louse,30_8_NABS_FOS
347 | 2777201227,Overburden,30_8_NABS_FOS
348 | 2777222677,Worship,30_8_NABS_FOS
349 | 2777222942,Corriedale,30_8_NABS_FOS
350 | 2777225262,Veterinary pathology,30_8_NABS_FOS
351 | 2777239683,Virtue,30_8_NABS_FOS
352 | 2777257828,Virtual archaeology,30_8_NABS_FOS
353 | 2777370179,Social media measurement,30_8_NABS_FOS
354 | 2777425756,Gold panning,30_8_NABS_FOS
355 | 2777438998,Tribunal,30_8_NABS_FOS
356 | 2777452754,TV-Anytime,30_8_NABS_FOS
357 | 2777461220,Germplasm,30_8_NABS_FOS
358 | 2777474537,Pheasant,30_8_NABS_FOS
359 | 2777477151,Prayer,30_8_NABS_FOS
360 | 2777480983,USArray,30_8_NABS_FOS
361 | 2777481183,Market access,30_8_NABS_FOS
362 | 2777499811,Ivermectin,30_8_NABS_FOS
363 | 2777514068,Broadcast band,30_8_NABS_FOS
364 | 2777582232,CONTEST,30_8_NABS_FOS
365 | 2777612826,Insect,30_8_NABS_FOS
366 | 2777617010,Mainstream,30_8_NABS_FOS
367 | 2777695277,DVB-H,30_8_NABS_FOS
368 | 2777776507,Lexico,30_8_NABS_FOS
369 | 2777786777,Flea,30_8_NABS_FOS
370 | 2777835648,Filter bubble,30_8_NABS_FOS
371 | 2777904157,Grazing,30_8_NABS_FOS
372 | 2777963300,Ovis,30_8_NABS_FOS
373 | 2777976947,Eimeria,30_8_NABS_FOS
374 | 2777992645,North American Plate,30_8_NABS_FOS
375 | 2777994876,Pacific Plate,30_8_NABS_FOS
376 | 2778002360,Rump,30_8_NABS_FOS
377 | 2778052875,Bildung,30_8_NABS_FOS
378 | 2778053677,Pasture,30_8_NABS_FOS
379 | 2778134537,Domestic pig,30_8_NABS_FOS
380 | 2778136425,Struthio,30_8_NABS_FOS
381 | 2778143190,Minnesota Geological Survey,30_8_NABS_FOS
382 | 2778157034,Sorghum,30_8_NABS_FOS
383 | 2778157034,Sorghum,30_8_NABS_FOS
384 | 2778182169,Jako,30_8_NABS_FOS
385 | 2778226015,Capreolus,30_8_NABS_FOS
386 | 2778261408,Eurasian Plate,30_8_NABS_FOS
387 | 2778375701,Ground stone,30_8_NABS_FOS
388 | 2778402112,Agricultural extension,30_8_NABS_FOS
389 | 2778412320,iPhoneography,30_8_NABS_FOS
390 | 2778452349,Rural poverty,30_8_NABS_FOS
391 | 2778471503,Basin and Range Province,30_8_NABS_FOS
392 | 2778524612,Mining law,30_8_NABS_FOS
393 | 2778553611,1seg,30_8_NABS_FOS
394 | 2778691696,Dairy farming,30_8_NABS_FOS
395 | 2778692574,Faith,30_8_NABS_FOS
396 | 2778729106,Social media analytics,30_8_NABS_FOS
397 | 2778738651,Novelty,30_8_NABS_FOS
398 | 2778749970,Conditional access,30_8_NABS_FOS
399 | 2778761015,Solanaceae,30_8_NABS_FOS
400 | 2778802261,Orthodoxy,30_8_NABS_FOS
401 | 2778838397,Uses and gratifications theory,30_8_NABS_FOS
402 | 2778839144,Medical geology,30_8_NABS_FOS
403 | 2778852317,Agricultural policy,30_8_NABS_FOS
404 | 2778856526,Cow-calf,30_8_NABS_FOS
405 | 2778871292,Social television,30_8_NABS_FOS
406 | 2778877831,Cryptosporidium,30_8_NABS_FOS
407 | 2778882853,Phlogopite,30_8_NABS_FOS
408 | 2778896172,Manifesto,30_8_NABS_FOS
409 | 2778983918,Wife,30_8_NABS_FOS
410 | 2778985329,ISDB,30_8_NABS_FOS
411 | 2779021329,Destiny,30_8_NABS_FOS
412 | 2779056648,International broadcasting,30_8_NABS_FOS
413 | 2779070535,Street gutter,30_8_NABS_FOS
414 | 2779081413,AM stereo,30_8_NABS_FOS
415 | 2779096232,Hydraulic fracturing,30_8_NABS_FOS
416 | 2779103253,Duty,30_8_NABS_FOS
417 | 2779106878,Digital audio broadcasting,30_8_NABS_FOS
418 | 2779111255,History of journalism,30_8_NABS_FOS
419 | 2779113645,Like button,30_8_NABS_FOS
420 | 2779197568,Sunflower,30_8_NABS_FOS
421 | 2779213998,Mobile television,30_8_NABS_FOS
422 | 2779220025,Peasant,30_8_NABS_FOS
423 | 2779329348,Feedlot,30_8_NABS_FOS
424 | 2779370140,Forage,30_8_NABS_FOS
425 | 2779371384,Biomass,30_8_NABS_FOS
426 | 2779422593,Soil crust,30_8_NABS_FOS
427 | 2779429622,Litter,30_8_NABS_FOS
428 | 2779438500,Honor,30_8_NABS_FOS
429 | 2779438827,Television station,30_8_NABS_FOS
430 | 2779461089,FM broadcasting,30_8_NABS_FOS
431 | 2779546711,Penny press,30_8_NABS_FOS
432 | 2779552062,Roe deer,30_8_NABS_FOS
433 | 2779557943,Canis,30_8_NABS_FOS
434 | 2779587293,Straw,30_8_NABS_FOS
435 | 2779620486,Tick,30_8_NABS_FOS
436 | 2779678110,Fungus,30_8_NABS_FOS
437 | 2779728303,Pride,30_8_NABS_FOS
438 | 2779742380,Gabion,30_8_NABS_FOS
439 | 2779742664,Broadcast address,30_8_NABS_FOS
440 | 2779795913,Hurrying,30_8_NABS_FOS
441 | 2779824472,Herb,30_8_NABS_FOS
442 | 2779829227,Vitality,30_8_NABS_FOS
443 | 2779867292,African Plate,30_8_NABS_FOS
444 | 2779867394,Bubalus,30_8_NABS_FOS
445 | 2779880937,Whinstone,30_8_NABS_FOS
446 | 2779883265,Electronic program guide,30_8_NABS_FOS
447 | 2779885849,Milking,30_8_NABS_FOS
448 | 2779914258,Nili-Ravi,30_8_NABS_FOS
449 | 2779944825,Headlinese,30_8_NABS_FOS
450 | 2779980370,Magma ocean,30_8_NABS_FOS
451 | 2779980370,Magma ocean,30_8_NABS_FOS
452 | 2780043312,Hydraulic fill,30_8_NABS_FOS
453 | 2780054949,Spinach,30_8_NABS_FOS
454 | 2780079832,Digital multimedia broadcasting,30_8_NABS_FOS
455 | 2780117336,Farm income,30_8_NABS_FOS
456 | 2780138947,Dry matter,30_8_NABS_FOS
457 | 2780284631,Sire,30_8_NABS_FOS
458 | 2780310893,Passion,30_8_NABS_FOS
459 | 2780323295,Cervus,30_8_NABS_FOS
460 | 2780356177,Baltic Shield,30_8_NABS_FOS
461 | 2780356177,Baltic Shield,30_8_NABS_FOS
462 | 2780414537,Maple,30_8_NABS_FOS
463 | 2780415144,SAINT,30_8_NABS_FOS
464 | 2780422510,Humanity,30_8_NABS_FOS
465 | 2780441040,Slacktivism,30_8_NABS_FOS
466 | 2780460740,Jackal,30_8_NABS_FOS
467 | 2780487972,Veterinary parasitology,30_8_NABS_FOS
468 | 2780505807,Beef cattle,30_8_NABS_FOS
469 | 2780556036,South Pole–Aitken basin,30_8_NABS_FOS
470 | 2780563676,Aroma,30_8_NABS_FOS
471 | 2780564743,Social CRM,30_8_NABS_FOS
472 | 2780580889,Panorama,30_8_NABS_FOS
473 | 2780618852,Pollen,30_8_NABS_FOS
474 | 2780695499,Social media mining,30_8_NABS_FOS
475 | 2780698354,Broadcast quality,30_8_NABS_FOS
476 | 2780710533,Governo,30_8_NABS_FOS
477 | 2780719635,Flavor,30_8_NABS_FOS
478 | 2780727426,Awassi,30_8_NABS_FOS
479 | 2780739461,Compost,30_8_NABS_FOS
480 | 2780739461,Compost,30_8_NABS_FOS
481 | 2780756850,News design,30_8_NABS_FOS
482 | 2780818791,Teletext,30_8_NABS_FOS
483 | 2780822299,Soul,30_8_NABS_FOS
484 | 2780942940,Stishovite,30_8_NABS_FOS
485 | 2780968714,Mange,30_8_NABS_FOS
486 | 2780997048,Digital footprint,30_8_NABS_FOS
487 | 2781079927,Dimension stone,30_8_NABS_FOS
488 | 2781121916,Bow drill,30_8_NABS_FOS
489 | 2781179785,Valencia,30_8_NABS_FOS
490 | 2781191505,DAB ensemble,30_8_NABS_FOS
491 | 2781207809,South American Plate,30_8_NABS_FOS
492 | 2781287369,Stuttgart,30_8_NABS_FOS
493 | 2781343547,Media Practice Model,30_8_NABS_FOS
494 | 2781354396,Enthusiasm,30_8_NABS_FOS
495 | 2781368420,Biosecurity,30_8_NABS_FOS
496 | 2781384534,Gospel,30_8_NABS_FOS
497 | 2781390083,Pinctada fucata,30_8_NABS_FOS
498 | 2908605944,Meleagris gallopavo,30_8_NABS_FOS
499 | 2908982167,Cattle Diseases,30_8_NABS_FOS
500 | 2909031412,Gallus gallus domesticus,30_8_NABS_FOS
501 | 2909086881,Stone quarry,30_8_NABS_FOS
502 | 2909619495,Food animal,30_8_NABS_FOS
503 | 2909623323,Waste Dumps,30_8_NABS_FOS
504 | 2909642594,Shaft (site),30_8_NABS_FOS
505 | 2909771501,Goat Diseases,30_8_NABS_FOS
506 | 2909895380,Guinea fowl,30_8_NABS_FOS
507 | 2910164855,Logging car,30_8_NABS_FOS
508 | 2910250570,Newspapers as Topic,30_8_NABS_FOS
509 | 2910477778,DUMP formation,30_8_NABS_FOS
510 | 2910514300,Root stones,30_8_NABS_FOS
511 | 2910534252,Newsclipping,30_8_NABS_FOS
512 | 2910651670,Bird Diseases,30_8_NABS_FOS
513 | 2910697619,Hearing analyzer,30_8_NABS_FOS
514 | 2910921642,Mineral industries,30_8_NABS_FOS
515 | 2910990604,Lama glama,30_8_NABS_FOS
516 | 2911060314,Laboratory Animal Science,30_8_NABS_FOS
517 | 2911132530,Mine surveyor,30_8_NABS_FOS
518 | 2911210907,Support pressure,30_8_NABS_FOS
519 | 2982719622,Video broadcast,30_8_NABS_FOS
520 | 2982966219,Plant growth,30_8_NABS_FOS
521 | 2982966219,Plant growth,30_8_NABS_FOS
522 | 2984029112,Social mining,30_8_NABS_FOS
523 | 2984157484,Mining industry,30_8_NABS_FOS
524 | 2984608069,Wireless broadcast,30_8_NABS_FOS
525 | 2984648278,Ambient awareness,30_8_NABS_FOS
526 | 2985624630,Broadcasting system,30_8_NABS_FOS
527 | 2985692548,Crisis informatics,30_8_NABS_FOS
528 | 2985711970,Mobile broadcast,30_8_NABS_FOS
529 | 2985889538,Social media marketing,30_8_NABS_FOS
530 | 2986347997,Wireless broadcasting,30_8_NABS_FOS
531 | 2986426982,Social stream,30_8_NABS_FOS
532 | 2986444337,Near video on demand,30_8_NABS_FOS
533 | 2987005673,Broadcast scheduling,30_8_NABS_FOS
534 | 2987034934,Earth crust,30_8_NABS_FOS
535 | 2987043902,Broadcasting algorithms,30_8_NABS_FOS
536 | 2987325470,Social commerce,30_8_NABS_FOS
537 | 2987348774,Broadcast data,30_8_NABS_FOS
538 | 2987376390,Electronic word of mouth,30_8_NABS_FOS
539 | 2987442367,Tv viewer,30_8_NABS_FOS
540 | 2987586235,Multimedia broadcasting,30_8_NABS_FOS
541 | 2987800000,Crisis mapping,30_8_NABS_FOS
542 | 2988327197,Online harassment,30_8_NABS_FOS
543 | 2988338654,News sharing,30_8_NABS_FOS
544 | 2988529969,Cold storage,30_8_NABS_FOS
545 | 2988547615,Speech summarization,30_8_NABS_FOS
546 | 2988622424,Social multimedia,30_8_NABS_FOS
547 | 2988676352,Rural development,30_8_NABS_FOS
548 | 2988833398,Social event detection,30_8_NABS_FOS
549 | 2988996608,Online engagement,30_8_NABS_FOS
550 | 2989393167,Personal learning network,30_8_NABS_FOS
551 | 2989409935,Crop production,30_8_NABS_FOS
552 | 2989465874,Broadcast channels,30_8_NABS_FOS
553 | 2991660179,Broadcast packet,30_8_NABS_FOS
554 | 2991667299,Capra hircus,30_8_NABS_FOS
555 | 2991862235,Animal health,30_8_NABS_FOS
556 | 2991870026,Social media network,30_8_NABS_FOS
557 | 2991920864,Community radio,30_8_NABS_FOS
558 | 2991922516,Rock pressure,30_8_NABS_FOS
559 | 2992067306,Mineral potential,30_8_NABS_FOS
560 | 2992202738,Mobile broadcasting,30_8_NABS_FOS
561 | 2992211155,Grain yield,30_8_NABS_FOS
562 | 2992306869,Satellite television,30_8_NABS_FOS
563 | 2992330363,Urban geology,30_8_NABS_FOS
564 | 2992406196,Waste dump,30_8_NABS_FOS
565 | 2992407798,Iron mining,30_8_NABS_FOS
566 | 2992481583,Broadcast time,30_8_NABS_FOS
567 | 2992547679,Land mine,30_8_NABS_FOS
568 | 2992637229,Michel foucault,30_8_NABS_FOS
569 | 2992647939,Consumer engagement,30_8_NABS_FOS
570 | 2992730755,Agricultural development,30_8_NABS_FOS
571 | 2992974802,Geological exploration,30_8_NABS_FOS
572 | 2992981300,Salt deposit,30_8_NABS_FOS
573 | 2992990004,Solid rock,30_8_NABS_FOS
574 | 2993003885,Land area,30_8_NABS_FOS
575 | 2993054622,Core formation,30_8_NABS_FOS
576 | 2993102984,Mine planning,30_8_NABS_FOS
577 | 2993134977,Mineral deposit,30_8_NABS_FOS
578 | 2993139054,Water buffalo,30_8_NABS_FOS
579 | 2993172631,Social news,30_8_NABS_FOS
580 | 2993199473,Plant biochemistry,30_8_NABS_FOS
581 | 2993240939,Online activism,30_8_NABS_FOS
582 | 2993252152,Rock body,30_8_NABS_FOS
583 | 2993273313,Chemical control,30_8_NABS_FOS
584 | 2993323123,Rock slope,30_8_NABS_FOS
585 | 2993426613,Arabic sentiment analysis,30_8_NABS_FOS
586 | 2993437602,Rock structure,30_8_NABS_FOS
587 | 2993492720,Gold production,30_8_NABS_FOS
588 | 2993527706,Salt mine,30_8_NABS_FOS
589 | 2993531722,Zea mays,30_8_NABS_FOS
590 | 2993555337,Personal branding,30_8_NABS_FOS
591 | 2993676337,Broadcast transmission,30_8_NABS_FOS
592 | 2993808335,Seismic velocity,30_8_NABS_FOS
593 | 2993865493,Social data analytics,30_8_NABS_FOS
594 | 2994012208,Upper crust,30_8_NABS_FOS
595 | 2994012208,Upper crust,30_8_NABS_FOS
596 | 2994104004,Broadcast service,30_8_NABS_FOS
597 | 2994289516,Geological investigation,30_8_NABS_FOS
598 | 2994381574,Broadcast system,30_8_NABS_FOS
599 | 2994460426,Pig farms,30_8_NABS_FOS
600 | 2994466296,Radio program,30_8_NABS_FOS
601 | 2994534981,Television channel,30_8_NABS_FOS
602 | 2994537864,Human medicine,30_8_NABS_FOS
603 | 3017754109,Companion animal,30_8_NABS_FOS
604 | 3017803470,Crust formation,30_8_NABS_FOS
605 | 3017937595,Dog owners,30_8_NABS_FOS
606 | 3018078696,Dwarf goats,30_8_NABS_FOS
607 | 3018846106,Fear of missing out,30_8_NABS_FOS
608 | 3019217387,Crime news,30_8_NABS_FOS
609 | 3020113513,Small ruminant,30_8_NABS_FOS
610 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | #To use the API you will need to have docker installed on your system.
5 | #See the instructions for installing Docker on your operating system: https://docs.docker.com/get-docker/
6 |
--------------------------------------------------------------------------------
/sampleAPICall.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Fri Apr 3 18:37:11 2020
5 |
6 | @author: lukas-pkl
7 | """
8 |
9 | """
10 | To use the API, please download and run the Docker container
11 |
12 | in bash :
13 |
14 | docker pull technoteai/osdg
15 | docker run --name my-open-sdg -p 5000:5000 technoteai/osdg:lattest
16 |
17 |
18 | """
19 |
20 |
21 |
22 |
23 | import requests
24 |
25 |
26 |
# Request payload: the text to be classified is sent under the 'query' key.
# The value here is a sample research abstract.
data = { 'query': """Using satellite data on deforestation and weather in Malawi and
linking those datasets with household survey datasets, we estimate the causal
effect of deforestation on access to clean drinking water. In the existing
literature on forest science and hydrology, the consensus is that
deforestation increases water yield. In this study, we directly examine the
causal effect of deforestation on households’ access to clean drinking water.
Results of the two-stage least-squares (2SLS) with cluster and time fixed-effect
estimations illustrate strong empirical evidence that deforestation decreases
access to clean drinking water. Falsification tests show that the possibility of
our instrumental variable picking up an unobserved time trend is very unlikely.
We find that a 1.0-percentage-point increase in deforestation decreases access
to clean drinking water by 0.93 percentage points. With this estimated impact,
deforestation in the last decade in Malawi (14%) has had the same magnitude of
effect on access to clean drinking water as that of a 9% decrease in rainfall.
""" }


# POST the payload to the /search endpoint on localhost:5000, the port
# published by the `docker run` command shown in the docstring above.
# NOTE(review): no timeout is passed and the HTTP status is not checked.
response = requests.post('http://localhost:5000/search', data=data)

# Raw response body as text; its format is not shown in this script.
result = response.text
47 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import re
3 |
# Lower-case English function words and contraction fragments, in both
# straight-apostrophe and curly-apostrophe spellings.
# NOTE(review): presumably the stop-word list ("sws") used when processing
# terms; its consumers are outside this file.
sws = {
    'ourselves', 'should', 'often', 'does', 'this', 'beside', 'well',
    'among', 'throughout', 'being', 'become', 'yourselves', 'namely',
    'whom', 'nothing', 'thus', 'many', '’re', 'had', 'somewhere', 'made',
    'still', "'re", 'eight', 'of', 'yours', 'further', 'again', 'by',
    'anyhow', 'whenever', 'both', 'first', 'third', 'whither', 'all',
    'whether', 'amount', 'afterwards', 'alone', 'she', 'where', 'seemed',
    'something', 'mine', 'whatever', 'most', 'doing', 'behind',
    'thereupon', 'whole', 'hers', 'ca', 'a', 'before', 'forty', '’d',
    '‘s', 'three', 'anything', 'via', 'hereafter', 'him', 'as', 'those',
    'here', 'around', '’ve', 'much', 'some', 'whereas', 'several', 'has',
    'done', 'besides', 'am', 'hereby', '‘d', 'yet', 'make', 'none',
    'while', 'just', 'towards', 'sometimes', 'his', 'into', 'various',
    'their', 'thence', 'so', 'either', 'about', 'once', 'onto', 'thru',
    "'m", 'one', 'seems', 'between', 'say', 'mostly', 'otherwise',
    'herself', 'might', 'and', 'least', 'did', 'hence', 'any', 'do',
    'each', 'whereupon', 'becoming', 'thereby', "'ll", 'two', 'yourself',
    'these', 'through', 'four', "'s", 'last', 'on', 'along', 'could',
    "n't", 'front', 'not', 'quite', '’m', 'at', 'he', 'ten', 'very',
    'himself', 'although', 'now', 'it', 'move', 'bottom', 'within',
    'can', 'sometime', 'out', 'elsewhere', 'empty', 'such', 'after',
    'seeming', 'put', 'us', 'upon', 'please', 'used', 'except', 'n‘t',
    'ours', 'six', 'though', 'without', 'why', 'however', 'above',
    'herein', 'else', 'them', 'formerly', 'since', 'take', 'beyond',
    'whence', 'n’t', 'been', 'nor', 'wherever', 'everywhere', 'hundred',
    'but', 'latterly', 'really', 'is', 'with', 'hereupon', 'we',
    'someone', 'whereby', 'in', 'because', 'latter', 'eleven', 'serious',
    'twenty', 'name', 'may', 'itself', 'to', 'there', "'ve", 'whereafter',
    'ever', 'perhaps', 'everyone', 'sixty', 'seem', 'which', 'almost',
    'anywhere', 'the', 'wherein', 'its', 'cannot', 'keep', 'twelve',
    'moreover', 'they', 'more', 'regarding', 'next', 'you', 'your',
    'own', 'enough', 'side', 're', 'neither', 'have', 'during', 'under',
    'will', 'would', 'over', 'therein', 'became', 'beforehand', 'using',
    'part', 'my', 'that', 'themselves', '’ll', 'myself', 'somehow',
    'together', 'top', 'from', 'then', 'are', 'give', 'back', 'less',
    'always', 'never', 'becomes', 'until', "'d", 'go', 'i', 'whose',
    'below', 'former', 'our', 'be', 'even', 'due', 'fifteen', 'every',
    'than', 'rather', 'how', 'an', 'across', '‘ve', 'another', 'must',
    'noone', 'against', '’s', 'others', 'per', 'already', 'off', 'too',
    'was', 'when', 'also', 'other', 'therefore', 'see', 'up', 'indeed',
    'what', '‘re', 'down', 'nobody', 'everything', 'whoever', 'five',
    'me', 'nevertheless', 'toward', 'same', 'meanwhile', 'call', 'if',
    'anyone', 'or', 'nowhere', 'were', 'unless', 'get', 'nine', 'her',
    'for', '‘ll', 'who', 'fifty', 'few', 'only', 'anyway', 'no',
    'amongst', 'show', '‘m', 'full', 'thereafter',
}
50 |
51 |
def levenshtein_ratio(s, t):
    """Return the Levenshtein similarity ratio between two strings.

    An edit-distance matrix is built in which ``distance[i, j]`` holds the
    distance between the first ``i`` characters of ``s`` and the first ``j``
    characters of ``t``.  Insertions and deletions cost 1; substitutions
    cost 2, which aligns the result with the ratio reported by the Python
    Levenshtein package.  The ratio is then

        (len(s) + len(t) - distance) / (len(s) + len(t))

    so identical strings give 1.0 and strings with nothing in common give
    0.0.

    Original code adapted from:
    https://www.datacamp.com/community/tutorials/fuzzy-string-python

    Args:
        s: First string.
        t: Second string.

    Returns:
        The similarity ratio in ``[0, 1]``.  Two empty strings are treated
        as identical (1.0); one empty and one non-empty string give 0.0.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    total_len = len(s) + len(t)

    # Both strings empty: nothing differs, and the ratio formula below
    # would otherwise divide by zero.
    if total_len == 0:
        return 1.0

    distance = np.zeros((rows, cols), dtype=int)

    # Borders: turning a prefix of length i into the empty string (or the
    # reverse) takes i deletions (insertions).  Filled independently so they
    # are correct even when one of the strings is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # Fill the matrix with the cheapest deletion/insertion/substitution.
    for col in range(1, cols):
        for row in range(1, rows):
            # Matching characters cost nothing; a substitution costs 2
            # (equivalent to one deletion plus one insertion).
            cost = 0 if s[row - 1] == t[col - 1] else 2
            distance[row][col] = min(
                distance[row - 1][col] + 1,         # deletion
                distance[row][col - 1] + 1,         # insertion
                distance[row - 1][col - 1] + cost)  # substitution

    # Read the bottom-right cell by explicit index rather than via the loop
    # variables, which are undefined when either loop does not run.
    return (total_len - distance[rows - 1][cols - 1]) / total_len
92 |
93 |
def process_fosname(string):
    """Normalize a FOS name for comparison.

    The name is lower-cased, hyphens are turned into spaces, every character
    other than ``a-z``, ``0-9`` and space is dropped, runs of spaces are
    collapsed to a single space and leading/trailing spaces are removed.

    Args:
        string: The raw FOS name.

    Returns:
        The normalized name; an empty string if no allowed character
        remains (e.g. for ``""`` or punctuation-only input).
    """
    # Full lower-case alphabet plus digits and space.  The previous
    # whitelist omitted "q", so e.g. "quality" was normalized to "uality".
    # NOTE(review): if stored data was normalized with the old whitelist,
    # regenerate it so names containing "q" keep matching.
    good_chars = "abcdefghijklmnopqrstuvwxyz0123456789 "
    string = string.lower().replace("-", " ")
    string = "".join(ch for ch in string if ch in good_chars)
    # split()/join collapses any run of spaces and trims both ends, and is
    # safe on an empty string (indexing string[-1] was not).
    return " ".join(string.split())
106 |
107 |
def sdg_label_sort(sdg_label):
    """Return the first integer found in ``sdg_label``.

    Going by its name, this is meant as a sort key so that labels order
    numerically rather than lexicographically.

    Args:
        sdg_label: Label text to search for a run of digits.

    Returns:
        The first run of digits in the label as an ``int``, or the label
        itself, unchanged, when it contains no digits.
    """
    first_number = re.search(r'\d+', sdg_label)
    if first_number is None:
        return sdg_label
    return int(first_number.group())
114 |
--------------------------------------------------------------------------------