├── corpus
│   ├── __init__.py
│   ├── createGoldDocuments
│   │   ├── __init__.py
│   │   ├── script
│   │   │   ├── __init__.py
│   │   │   ├── generate-single_post.py
│   │   │   ├── final_processing.py
│   │   │   ├── remove_link.py
│   │   │   └── pre_processing.py
│   │   ├── file.py
│   │   ├── README.md
│   │   └── calculate_position.py
│   ├── README.md
│   ├── goldDocumentsPre
│   │   ├── myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json
│   │   └── forum.ubuntuusers.de.topic.appimage-programm-in-alle-programme-als-icon-a..json
│   └── goldDocuments
│       └── myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json
├── tests
│   ├── __init__.py
│   ├── unit
│   │   └── harvest
│   │       ├── metadata
│   │       │   └── test_username.py
│   │       ├── test_date_search.py
│   │       ├── test_utils.py
│   │       └── cleanup
│   │           └── test_forum_post.py
│   ├── test_webservice.py
│   └── integration
│       └── harvest
│           ├── test_posts_xpath.py
│           └── test_extract_data.py
├── src
│   ├── harvest
│   │   ├── cleanup
│   │   │   ├── __init__.py
│   │   │   └── forum_post.py
│   │   ├── metadata
│   │   │   ├── __init__.py
│   │   │   ├── .cache
│   │   │   │   └── v
│   │   │   │       └── cache
│   │   │   │           └── lastfailed
│   │   │   ├── usertext.py
│   │   │   ├── link.py
│   │   │   ├── date.py
│   │   │   └── username.py
│   │   ├── config.py
│   │   ├── date_search.py
│   │   ├── post_text.py
│   │   ├── __init__.py
│   │   ├── similarity_calculator.py
│   │   ├── utils.py
│   │   ├── extract.py
│   │   └── posts.py
│   └── test_dummy.py
├── requirements.txt
├── data
│   └── forum
│       ├── https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz
│       ├── http%3A%2F%2Fblog.angelman-asa.org%2Fread.php%3F2%2C736.json.gz
│       ├── https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz
│       ├── https%3A%2F%2Fforums.maladiesraresinfo.org%2Fpost11011.html%23p11011.json.gz
│       ├── https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz
│       ├── https%3A%2F%2Fmyparkinsons.org%2Fcgi-bin%2Fforum%2Ftopic_show.pl%3Fid%3D5231.json.gz
│       ├── https%3A%2F%2Fforum.statcounter.com%2Fthreads%2Fcustom-tags-examples.44340%2F.json.gz
│       ├── https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-Sort-of-new-here.json.gz
│       ├── https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz
│       ├── https%3A%2F%2Fhealthunlocked.com%2Fparkinsonsmovement%2Fposts%2F142058845%2Fartane-anyone.json.gz
│       ├── https%3A%2F%2Fwww.medhelp.org%2Fposts%2FMultiple-Sclerosis%2FPositive-ANA-Test%2Fshow%2F1123552.json.gz
│       ├── https%3A%2F%2Fwww.medhelp.org%2Fposts%2FInfectious-Diseases%2FNoro-or-other-virus%2Fshow%2F1881254.json.gz
│       ├── https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-White-Syndrome%2Fshow%2F250747.json.gz
│       ├── https%3A%2F%2Fwww.amsel.de%2Fmultiple-sklerose-forum%2F%3Ftnr%3D1%26mnr%3D217239%26archiv_flag%3D2%26fv%3D1.json.gz
│       ├── https%3A%2F%2Fwww.mumsnet.com%2FTalk%2Fpregnancy%2F3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json.gz
│       ├── https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-is-too-long-to-wait-for-an-initial-con.json.gz
│       ├── https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm%3Fsrcq%3Dcopd.json.gz
│       └── test-urls.lst
├── .gitignore
├── publish.sh
├── scripts
│   ├── serialize_test_data.py
│   ├── webservice.py
│   ├── extract_to_csv.py
│   └── test-urls.lst
├── .github
│   └── workflows
│       └── main.yml
├── setup.py
├── README.md
└── LICENSE

/corpus/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | 
-------------------------------------------------------------------------------- /src/harvest/cleanup/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/harvest/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/script/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/harvest/metadata/.cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /src/harvest/config.py: -------------------------------------------------------------------------------- 1 | LANGUAGES = ('en', 'de', 'es') -------------------------------------------------------------------------------- /corpus/README.md: -------------------------------------------------------------------------------- 1 | # Todo description of corpus 2 | ``` 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | requests 3 | dateparser 4 | numpy 5 | inscriptis 6 | flask 7 | fuzzywuzzy 8 | pytest -------------------------------------------------------------------------------- /src/test_dummy.py: -------------------------------------------------------------------------------- 1 | """ 2 | This dummy test is needed for pytest to detect the src directory 3 | """ 4 | 5 | 6 | def test_dummy(): 7 | pass 8 | -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | .*.swp 4 | __pycache__/ 5 | .cache/ 6 | debug/ 7 | venv 8 | dist 9 | build 10 | results.json 11 | *.egg-info 12 | .coverage 13 | coverage.xml 14 | -------------------------------------------------------------------------------- /data/forum/http%3A%2F%2Fblog.angelman-asa.org%2Fread.php%3F2%2C736.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/http%3A%2F%2Fblog.angelman-asa.org%2Fread.php%3F2%2C736.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz 
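The serialized snapshots above are gzip-compressed JSON files that store the crawled page under the keys `url`, `crawled` and `html` (see `scripts/serialize_test_data.py` below). A minimal sketch for loading one of them and running harvest over it; the file name is just one example from the list above:

```python
import gzip
from json import load

from harvest import extract_data

# each snapshot is a gzipped JSON document with 'url', 'crawled' and 'html' keys
with gzip.open('data/forum/https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz') as f:
    forum = load(f)

# extract posts and metadata from the serialized HTML
result = extract_data(forum['html'], forum['url'])
# result['posts'] is a list of dictionaries; 'post_text' is always present
print(result['posts'][0]['post_text'])
```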
-------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fforums.maladiesraresinfo.org%2Fpost11011.html%23p11011.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fforums.maladiesraresinfo.org%2Fpost11011.html%23p11011.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fmyparkinsons.org%2Fcgi-bin%2Fforum%2Ftopic_show.pl%3Fid%3D5231.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fmyparkinsons.org%2Fcgi-bin%2Fforum%2Ftopic_show.pl%3Fid%3D5231.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fforum.statcounter.com%2Fthreads%2Fcustom-tags-examples.44340%2F.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fforum.statcounter.com%2Fthreads%2Fcustom-tags-examples.44340%2F.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-Sort-of-new-here.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-Sort-of-new-here.json.gz -------------------------------------------------------------------------------- /tests/unit/harvest/metadata/test_username.py: -------------------------------------------------------------------------------- 1 | from harvest.metadata.username import get_user_name 2 | 3 | 4 | def test_get_user_name(): 5 | assert get_user_name('Therese Kurz', 'http://www.heise.de/security') == 'Therese.Kurz@www.heise.de' 6 | -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fhealthunlocked.com%2Fparkinsonsmovement%2Fposts%2F142058845%2Fartane-anyone.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fhealthunlocked.com%2Fparkinsonsmovement%2Fposts%2F142058845%2Fartane-anyone.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FMultiple-Sclerosis%2FPositive-ANA-Test%2Fshow%2F1123552.json.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FMultiple-Sclerosis%2FPositive-ANA-Test%2Fshow%2F1123552.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FInfectious-Diseases%2FNoro-or-other-virus%2Fshow%2F1881254.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FInfectious-Diseases%2FNoro-or-other-virus%2Fshow%2F1881254.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-White-Syndrome%2Fshow%2F250747.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-White-Syndrome%2Fshow%2F250747.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.amsel.de%2Fmultiple-sklerose-forum%2F%3Ftnr%3D1%26mnr%3D217239%26archiv_flag%3D2%26fv%3D1.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.amsel.de%2Fmultiple-sklerose-forum%2F%3Ftnr%3D1%26mnr%3D217239%26archiv_flag%3D2%26fv%3D1.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.mumsnet.com%2FTalk%2Fpregnancy%2F3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.mumsnet.com%2FTalk%2Fpregnancy%2F3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-is-too-long-to-wait-for-an-initial-con.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-is-too-long-to-wait-for-an-initial-con.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm%3Fsrcq%3Dcopd.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm%3Fsrcq%3Dcopd.json.gz -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO 4 | # - check release version number! 
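# (one possible check - a sketch, assuming the version string lives in
#  src/harvest/__init__.py, as it does for this package:
#    python3 -c "import sys; sys.path.insert(0, 'src'); from harvest import __version__; print(__version__)"
#  the printed version should match the release tag before uploading)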
 5 | 
 6 | # publish the latest package to pypi
 7 | # sources:
 8 | # - https://packaging.python.org/guides/distributing-packages-using-setuptools/#packaging-your-project
 9 | # - https://packaging.python.org/guides/making-a-pypi-friendly-readme/
10 | 
11 | # cleanup dist
12 | rm -rf ./dist
13 | 
14 | # build and verify packages
15 | python3 setup.py sdist bdist_wheel; twine check dist/*
16 | 
17 | # upload
18 | twine upload dist/*
--------------------------------------------------------------------------------
/src/harvest/date_search.py:
--------------------------------------------------------------------------------
 1 | import dateparser.search
 2 | import datetime
 3 | from harvest.config import LANGUAGES
 4 | 
 5 | 
 6 | def search_dates(text):
 7 |     results = dateparser.search.search_dates(text, languages=LANGUAGES, settings={'RETURN_AS_TIMEZONE_AWARE': False})
 8 |     valid_dates = []
 9 |     if results is not None:
10 |         for result in results:
11 |             if result[1] > datetime.datetime(1993, 4, 30):  # discard dates too old to be valid forum posts
12 |                 valid_dates.append(result)
13 | 
14 |     return valid_dates
15 | 
--------------------------------------------------------------------------------
/tests/unit/harvest/test_date_search.py:
--------------------------------------------------------------------------------
 1 | from harvest.date_search import search_dates
 2 | import datetime
 3 | 
 4 | 
 5 | def test_date_found_by_external_library():
 6 |     result = search_dates("asdfad 25-February-2012 21:46 afd adsf")
 7 |     assert len(result) == 1
 8 |     assert result[0][0] == "25-February-2012 21:46"
 9 |     assert result[0][1] == datetime.datetime(2012, 2, 25, 21, 46)
10 | 
11 | 
12 | def test_date_found_by_external_library_is_too_old():
13 |     result = search_dates("asdfad 29-April-1993 21:46 afd adsf")
14 |     assert len(result) == 0
15 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/file.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import logging
 4 | import os
 5 | 
 6 | from json import dump
 7 | from urllib.parse import urlparse
 8 | 
 9 | logging.getLogger().setLevel(logging.INFO)
10 | 
11 | 
12 | def get_file_path(url, result_directory):
13 |     url = urlparse(url).netloc + urlparse(url).path + urlparse(url).params
14 |     return os.path.join(result_directory, f'{url.replace("/", ".")}.json')
15 | 
16 | 
17 | def write_to_json(url, result_directory, document):
18 |     result_fname = get_file_path(url, result_directory)
19 |     if not os.path.exists(result_directory):
20 |         os.makedirs(result_directory)
21 |     if not os.path.isfile(result_fname):
22 |         with open(result_fname, "w") as f2:
23 |             dump(document, f2, indent=True)
24 | 
--------------------------------------------------------------------------------
/src/harvest/post_text.py:
--------------------------------------------------------------------------------
 1 | from inscriptis import get_text
 2 | 
 3 | WORDS_TO_IGNORE_DE = {'cookies', 'startseite', 'datenschutzerklärung', 'impressum', 'nutzungsbedingungen',
 4 |                       'registrieren'}
 5 | WORDS_TO_IGNORE_EN = {'forum home', 'sign in', 'sign up'}
 6 | WORDS_TO_IGNORE = WORDS_TO_IGNORE_DE.union(WORDS_TO_IGNORE_EN)
 7 | 
 8 | 
 9 | def get_cleaned_text(html):
10 |     text_sections = []
11 |     text = get_text(html)
12 |     for comment in (c for c in text.split("\n") if c.strip()):
13 |         if [word for word in WORDS_TO_IGNORE if word in comment.lower()]:
14 |             continue
15 |         elif 'copyright' not in comment.lower() and '©' not in comment.lower() and 'powered by' not in comment.lower():
16 |             text_sections.append(comment.strip())
17 |         else:
18 |             break
19 |     return text_sections
--------------------------------------------------------------------------------
/src/harvest/metadata/usertext.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Get the xpath to extract only the text of a post
 3 | '''
 4 | import re
 5 | import logging
 6 | 
 7 | 
 8 | def get_text_xpath_pattern(dom, post_xpath, posts):
 9 |     """
10 |     Get the xpath to extract only the text of a post
11 | 
12 |     Args:
13 |       - dom: the forum's DOM object
14 |       - post_xpath: the determined post xpath
15 |       - posts: the extracted posts
16 |     """
17 | 
18 |     text_xpath = re.sub(r"\/\.\.", "", post_xpath)
19 |     while True:
20 |         text_elements = dom.xpath(text_xpath)
21 |         if len(text_elements) == len(posts):
22 |             return text_xpath
23 |         if len(text_elements) < len(posts) or len(text_elements) <= 1:
24 |             logging.warning(f'text xPath not found for {post_xpath}')
25 |             return post_xpath
26 |         text_xpath = text_xpath + '/..'
27 | 
--------------------------------------------------------------------------------
/tests/test_webservice.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | 
 4 | 
 5 | def query():
 6 |     service_url = 'http://localhost:5000/dragnet_extract_from_html'
 7 | 
 8 |     with open('./corpus/goldDocuments/blog.angelman-asa.org.read.php.json') as gold_document:
 9 |         data = json.load(gold_document)
10 |         test_url = data['url']
11 |         test_html = data['html']
12 |         test_text = data['text']
13 |         test_annotations = data['gold_standard_annotation']
14 | 
15 |     data = {'url': test_url, 'html': test_html, 'text': test_text, 'annotations': test_annotations}
16 | 
17 |     try:
18 |         response = requests.post(service_url, json=data)
19 |     except Exception as exception:
20 |         print(f"Query failed: {exception}")
21 |         return None
22 | 
23 |     response_dict = json.loads(response.text)
24 |     print(f"Response: {response_dict['entities']}")
25 |     return response_dict
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     query()
30 | 
--------------------------------------------------------------------------------
/scripts/serialize_test_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | from json import dump
 4 | 
 5 | from os.path import exists
 6 | from urllib.request import urlopen, Request
 7 | from urllib.parse import quote_plus
 8 | 
 9 | import datetime
10 | import gzip
11 | import shutil
12 | 
13 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
14 | 
15 | with open("test-urls.lst") as f:
16 |     for url in (u.strip() for u in f):
17 |         dst = quote_plus(url)
18 |         if exists("../data/" + dst + ".json") or url.startswith('#') or not url.strip():  # skip comments, blanks and already serialized URLs
19 |             continue
20 | 
21 |         print("Retrieving", url)
22 |         try:
23 |             req = Request(url, data=None, headers={'User-Agent': USER_AGENT})
24 |             http = urlopen(req)
25 |             content_type = http.getheader('content-type')
26 |             if content_type and 'charset=' in content_type:
27 |                 encoding = content_type.split('charset=')[1]
28 |             else:
29 |                 encoding = 'utf8'
30 |             html = http.read().decode(encoding)
31 | 
32 |             with open("../data/" + dst + ".json", 'w') as f:
33 |                 dump({'url': url, 'crawled': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'html': html}, f)
34 |             with open("../data/" + dst + ".json", 'rb') as f, \
35 |                     gzip.open('../data/forum/' + dst + ".json.gz", 'wb') as fgzip:
36 |                 shutil.copyfileobj(f, fgzip)
37 |         except IOError:
38 | with open("failed.lst", "a") as f: 39 | f.write(url + "\n") 40 | -------------------------------------------------------------------------------- /tests/unit/harvest/test_utils.py: -------------------------------------------------------------------------------- 1 | from harvest.utils import get_merged_xpath 2 | 3 | 4 | def test_get_merge_xpath(): 5 | xpaths = [r'//div[@class="post post-even"]/a[not(*) and string-length(text()) = 0]', 6 | r'//div[@class="post-odd"]/a[not(*) and string-length(text()) = 0]', 7 | r'//a[@class="user-name"][not(*) and string-length(text()) > 0]'] 8 | merged_xpath = get_merged_xpath(xpaths) 9 | assert len(merged_xpath) == 1 10 | assert merged_xpath[0] == r"//div[(contains(@class, 'post') and contains(@class, 'post-even')) or " \ 11 | r"(contains(@class, 'post-odd'))]" \ 12 | r"/a[not(*) and string-length(text()) = 0]" 13 | 14 | 15 | def test_get_merge_xpath_same_classes(): 16 | xpaths = [r'//div[@class="post post-even"]/a[not(*) and string-length(text()) = 0]', 17 | r'//div[@class="post post-odd"]/a[not(*) and string-length(text()) = 0]', 18 | r'//a[@class="user-name"][not(*) and string-length(text()) > 0]'] 19 | merged_xpath = get_merged_xpath(xpaths) 20 | assert len(merged_xpath) == 1 21 | assert merged_xpath[0] == r"//div[(contains(@class, 'post'))]" \ 22 | r"/a[not(*) and string-length(text()) = 0]" 23 | 24 | 25 | def test_get_merge_xpath_with_no_merges(): 26 | xpaths = [r'//div[@class="post post-odd"]/a[not(*) and string-length(text()) = 0]', 27 | r'//a[@class="user-name"][not(*) and string-length(text()) > 0]'] 28 | merged_xpath = get_merged_xpath(xpaths) 29 | assert not merged_xpath 30 | -------------------------------------------------------------------------------- /tests/unit/harvest/cleanup/test_forum_post.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Test classes 5 | ''' 6 | 7 | from harvest.cleanup.forum_post import remove_suffix, remove_prefix, remove_boilerplate 8 | 9 | 10 | def test_remove_suffix(): 11 | post_list = ['Good day', 'Good Saturday', 'Good Wednesday'] 12 | assert remove_suffix(post_list) == post_list 13 | 14 | post_list2 = ['Good day [Reply - to]', 'Good Saturday [Reply - to]', 'Good Wednesday [Reply - to]'] 15 | assert remove_suffix(post_list2) == post_list 16 | 17 | assert remove_prefix(post_list) == ['day', 'Saturday', 'Wednesday'] 18 | 19 | 20 | # 21 | # tests based on reported errors 22 | # 23 | 24 | def test_missing_message(): 25 | ''' 26 | the following string got completely removed by cleaning. 27 | ''' 28 | s = [ 29 | "Add message | Report paperplant Thu 21-Nov-19 11:07:27 Following as non-white woman - sounds really interesting, thanks for posting. Can't say much as I've only experienced the booking/sickle cell test and in my hospital we're offered the BCG vaccine as routine. My area is about 50% South Asian ethnicity though.", 30 | "Add message | Report Lweji Thu 21-Nov-19 11:27:03 Is that actually true? Anatomy, etc? Have you found evidence other than being told about it by a midwife?"] 31 | 32 | assert remove_boilerplate(s) == [ 33 | "paperplant Thu 21-Nov-19 11:07:27 Following as non-white woman - sounds really interesting, thanks for posting. Can't say much as I've only experienced the booking/sickle cell test and in my hospital we're offered the BCG vaccine as routine. My area is about 50% South Asian ethnicity though.", 34 | "Lweji Thu 21-Nov-19 11:27:03 Is that actually true? Anatomy, etc? 
Have you found evidence other than being told about it by a midwife?"]
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # This is a basic workflow to help you get started with Actions
 2 | 
 3 | name: build
 4 | 
 5 | # Controls when the action will run. Triggers the workflow on push or pull request
 6 | # events but only for the main branch
 7 | on:
 8 |   push:
 9 |     branches: [ main ]
10 |   pull_request:
11 |     branches: [ main ]
12 | 
13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
14 | jobs:
15 |   # This workflow contains a single job called "build"
16 |   build:
17 |     runs-on: ubuntu-latest
18 |     strategy:
19 |       matrix:
20 |         python-version: [3.6, 3.7, 3.8]
21 | 
22 |     steps:
23 |     - uses: actions/checkout@v2
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v2
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Install dependencies
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         pip install flake8
32 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
33 |     - name: Lint with flake8
34 |       run: |
35 |         # stop the build if there are Python syntax errors or undefined names
36 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pip install pytest-cov
42 |         python -m pytest --rootdir tests --cov=harvest --cov-report=xml
43 |     - name: Upload coverage to Codecov
44 |       uses: codecov/codecov-action@v1
45 |       with:
46 |         token: ${{ secrets.CODECOV_TOKEN }}
47 |         file: ./coverage.xml
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import sys
 5 | from setuptools import setup, find_packages
 6 | from os import path
 7 | 
 8 | here = path.abspath(path.dirname(__file__))
 9 | sys.path.insert(0, path.join(here, 'src'))
10 | 
11 | from harvest import (__version__, __author__, __author_email__, __license__)
12 | 
13 | this_directory = path.abspath(path.dirname(__file__))
14 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
15 |     long_description = f.read()
16 | 
17 | setup(
18 |     # Metadata
19 |     name="harvest-webforum",
20 |     version=__version__,
21 |     description='A toolkit for extracting posts and post metadata from web forums',
22 |     long_description=long_description,
23 |     long_description_content_type='text/markdown',
24 |     author=__author__,
25 |     author_email=__author_email__,
26 |     python_requires='>=3.6',
27 |     classifiers=[
28 |         'Intended Audience :: Developers',
29 |         'License :: OSI Approved :: Apache Software License',
30 |         'Topic :: Text Processing',
31 |         'Topic :: Text Processing :: Markup :: HTML',
32 |         'Topic :: Utilities',
33 |         'Programming Language :: Python :: 3',
34 |         'Programming Language :: Python :: 3.6',
35 |         'Programming Language :: Python :: 3.7',
36 |         'Programming Language :: Python :: 3.8',
37 |     ],
38 |     license=__license__,
39 |     package_dir={'': 'src'},
40 | 
41 |     # Package List
42 |     packages=find_packages('src'),
43 | 
44 |     # Scripts
45 |     scripts=[
46 |         './scripts/extract_to_csv.py',
47 |         
'./scripts/serialize_test_data.py' 48 | ], 49 | 50 | # Requirements 51 | install_requires=[ 52 | 'lxml', 53 | 'requests', 54 | 'dateparser', 55 | 'numpy', 56 | 'inscriptis', 57 | 'flask', 58 | 'fuzzywuzzy' 59 | ] 60 | ) 61 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/script/generate-single_post.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import logging 5 | import os 6 | from collections import defaultdict 7 | from glob import glob 8 | from json import load 9 | 10 | from corpus.createGoldDocuments.file import write_to_json, get_file_path 11 | 12 | logging.getLogger().setLevel(logging.INFO) 13 | 14 | parser = argparse.ArgumentParser(description='Forum harvester - generate gold standard documents with only one post') 15 | parser.add_argument('gold_document_path', metavar='gold_document_path', help='Path to the gold documents') 16 | parser.add_argument('--result-directory', dest='result_directory', help='Optional directory for storing final results.') 17 | parser.add_argument('--corpus-include-string', dest='corpus_include_string', 18 | help='Optionally restrict the input corpus to URLs that match the corpus include string.') 19 | 20 | args = parser.parse_args() 21 | 22 | result = defaultdict(list) 23 | for no, fname in enumerate(glob(args.gold_document_path + "*.json")): 24 | with open(fname) as f: 25 | forum = load(f) 26 | if (args.corpus_include_string and args.corpus_include_string not in forum['url']) \ 27 | or os.path.isfile(get_file_path(forum['url'], args.result_directory)): 28 | continue 29 | 30 | logging.info("Start creating final gold standard document with only one post for " + forum['url']) 31 | 32 | single_post = " ".join([a['post_text']['surface_form'] for a in forum['gold_standard_annotation']]) 33 | start_index = forum['gold_standard_annotation'][0]['post_text']['start'] 34 | end_index = forum['gold_standard_annotation'][-1]['post_text']['end'] 35 | 36 | forum['gold_standard_annotation'] = [{ 37 | "post_text": { 38 | "surface_form": single_post, 39 | "start": start_index, 40 | "end": end_index 41 | } 42 | }] 43 | 44 | write_to_json(os.path.basename(fname), args.result_directory, forum) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Harvest - A toolkit for extracting posts and post metadata from web forums 2 | 3 | [![Actions Status](https://github.com/fhgr/harvest/workflows/build/badge.svg)](https://github.com/fhgr/harvest/actions) 4 | [![codecov](https://codecov.io/gh/fhgr/harvest/branch/main/graph/badge.svg)]( 5 | https://codecov.io/gh/fhgr/harvest) 6 | [![PyPI version](https://badge.fury.io/py/harvest-webforum.svg)](https://badge.fury.io/py/harvest-webforum) 7 | 8 | Automatic extraction of forum posts and metadata is a challenging task since forums do not expose their content in a standardized structure. Harvest performs this task reliably for many web forums and offers an easy way to extract data from web forums. 
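The result of an extraction is a plain dictionary. A sketch of its shape, as built by `extract_data` (see the example in the Python library section below); the metadata keys are only present when the corresponding pattern could be located:

```python
# shape of the dictionary returned by harvest.extract_data (values shortened)
{'posts': [
    {'post_text': '...',   # the text of the extracted post
     'datetime': '...',    # the post's date, if found
     'user': '...',        # the post's author, if found
     'post_link': '...'}   # the URL of the individual post, if found
]}
```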
 9 | 
10 | ## Installation
11 | 
12 | At the command line:
13 | ```bash
14 | $ pip install harvest-webforum
15 | ```
16 | 
17 | If you want to install from the latest sources, you can do:
18 | ```bash
19 | $ git clone https://github.com/fhgr/harvest.git
20 | $ cd harvest
21 | $ python3 setup.py install
22 | ```
23 | 
24 | ## Python library
25 | Embedding harvest into your code is easy, as outlined below:
26 | ```python
27 | from urllib.request import urlopen, Request
28 | from harvest import extract_data
29 | 
30 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
31 | 
32 | url = "https://forum.videolan.org/viewtopic.php?f=14&t=145604"
33 | req = Request(url, headers={'User-Agent': USER_AGENT})
34 | html = urlopen(req).read().decode('utf-8')
35 | 
36 | result = extract_data(html, url)
37 | print(result)
38 | ```
39 | 
40 | ## WEB-FORUM-52 gold standard
41 | The [corpus](corpus/goldDocuments) currently contains gold standard documents from 52 different web forums. These documents are also used by harvest's integration tests.
42 | 
43 | ## Publication
44 | 
45 | * Weichselbraun, Albert, Brasoveanu, Adrian M. P., Waldvogel, Roger and Odoni, Fabian. (2020). ["Harvest - An Open Source Toolkit for Extracting Posts and Post Metadata from Web Forums"](https://arxiv.org/abs/2102.02240). IEEE/WIC/ACM International Joint Conference on Web Intelligence and Intelligent Agent Technology (WI-IAT 2020), Melbourne, Australia, Accepted 27 October 2020.
46 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/script/final_processing.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import logging
 5 | import os
 6 | 
 7 | from glob import glob
 8 | from json import load
 9 | from collections import defaultdict
10 | from corpus.createGoldDocuments.file import write_to_json, get_file_path
11 | from corpus.createGoldDocuments.calculate_position import get_start_end_for_post
12 | 
13 | logging.getLogger().setLevel(logging.INFO)
14 | 
15 | parser = argparse.ArgumentParser(description='Forum harvester - generate final gold standard documents')
16 | parser.add_argument('pre_gold_document_path', metavar='pre_gold_document_path',
17 |                     help='Path to the pre processed gold documents')
18 | parser.add_argument('--result-directory', dest='result_directory', help='Optional directory for storing final results.')
19 | parser.add_argument('--corpus-include-string', dest='corpus_include_string',
20 |                     help='Optionally restrict the input corpus to URLs that match the corpus include string.')
21 | 
22 | args = parser.parse_args()
23 | 
24 | result = defaultdict(list)
25 | for no, fname in enumerate(glob(args.pre_gold_document_path + "*.json")):
26 |     with open(fname) as f:
27 |         forum = load(f)
28 |         if (args.corpus_include_string and args.corpus_include_string not in forum['url']) \
29 |                 or os.path.isfile(get_file_path(forum['url'], args.result_directory)):
30 |             continue
31 | 
32 |         logging.info("Start creating final gold standard document for " + forum['url'])
33 |         search_start_index = 0
34 |         all_indexes_found = True
35 |         for post in forum['gold_standard_annotation']:
36 |             max_index = get_start_end_for_post(post, forum['text'], search_start_index)
37 |             if max_index > -1:
38 |                 search_start_index = max_index
39 |             else:
40 |                 all_indexes_found = False
41 |         if all_indexes_found:
42 |             write_to_json(forum['url'], args.result_directory, forum)
43 |             logging.info('Gold standard document successfully created')
44 |         else:
45 |             logging.warning('Not all indexes found. Check the pre-processed file again.')
46 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/README.md:
--------------------------------------------------------------------------------
 1 | # Instructions for creating gold standard documents
 2 | ## Gold standard document format
 3 | ```
 4 | {
 5 |    "id": "",
 6 |    "url": "",
 7 |    "html": "...",
 8 |    "text": "Only the text of the HTML. Referenced below as the full text.",
 9 |    "gold_standard_annotation": [{
10 |        "post_text": {"surface_form": "...", "start": 200, "end": 555},
11 |        "datetime": {"surface_form": "02-March-2012 00:58", "start": 10, "end": 29},
12 |        "user": {"surface_form": "http://blog.angelman-asa.org/profile.php?2,1606", "start": 30, "end": 77},
13 |        "post_link": {"surface_form": "msg-772", "start": 100, "end": 107}
14 |    }]
15 | }
16 | ```
17 | ## Instructions
18 | 1. First, download the forum pages for which you want to create a gold document by running `scripts/serialize_test_data.py`. Before doing so, add the URL of the forum page to the file `data/forum/test-urls.lst` or uncomment it (remove the `#` at the beginning of the line).
19 | 2. Next, create a first version of the gold document with `script/pre_processing.py`. Example command:
20 | `python3 pre_processing.py ./data/forum/ --result-directory ./goldDocumentsPre/`
21 | 3. The next step is to clean up the following elements of the document:
22 | `datetime.surface_form, user.surface_form, post_link.surface_form`
23 | Correct or add any elements that were not recognized correctly. For the user, the profile link is used if available, otherwise the displayed name.
24 | 4. Now run `python3 remove_link.py ./goldDocumentsPre/`. This removes all links from the full text of the gold document except those of `user` and `post_link`.
25 | 5. Now clean up the `post_text.surface_form` elements in the gold document. These must be found in the full text and must match the correct post text.
26 | 6. Now run `python3 final_processing.py ./goldDocumentsPre/ --result-directory ./goldDocuments/`. If all elements are prepared correctly, a start and end position is found for each element. If this is not the case, the log shows the message "Not found in text"; correct the pre-processed document accordingly and generate the final document again.
27 | 7. Finally, commit the documents with git and push them to the repository.
28 | 
29 | **Note**: the `final_processing.py` and `pre_processing.py` scripts do not overwrite existing documents.
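A finished gold standard document can be sanity-checked with a few lines of Python; a sketch, using one of the documents shipped in `goldDocuments`:

```python
from json import load

with open('corpus/goldDocuments/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json') as f:
    gold = load(f)

for annotation in gold['gold_standard_annotation']:
    for name, entity in annotation.items():
        # every surface form must match the span it annotates in the full text
        assert gold['text'][entity['start']:entity['end']] == entity['surface_form'], name
```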
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/script/remove_link.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import logging
 5 | import re
 6 | 
 7 | from glob import glob
 8 | from json import load, dump
 9 | from collections import defaultdict
10 | 
11 | logging.getLogger().setLevel(logging.INFO)
12 | 
13 | 
14 | def remove_unused_links(text, links_to_keep):
15 |     pattern = re.compile(r'( \* )?\[[^\]]*\]\((http(s)?:\/)?\/[^\)]*\)')
16 |     start_index = 0
17 |     while start_index > -1:
18 |         link_match = pattern.search(text, start_index)
19 |         if link_match:
20 |             link_extracted = re.search(r'(http(s)?:\/)?\/[^\)]*', link_match.group(0))
21 |             if link_extracted and link_extracted.group(0) not in links_to_keep:
22 |                 logging.info(f'Removed {link_match.group(0)}')
23 |                 only_text = re.search(r'\[.*\]', link_match.group(0))
24 |                 text = text[:link_match.start()] + only_text.group(0)[1:-1] + text[link_match.end():]
25 |             else:
26 |                 start_index = link_match.end()
27 |         else:
28 |             start_index = -1
29 |     return text
30 | 
31 | 
32 | parser = argparse.ArgumentParser(description='Forum harvester - remove unused links from the text')
33 | parser.add_argument('pre_gold_document_path', metavar='pre_gold_document_path',
34 |                     help='Path to the pre processed gold documents')
35 | parser.add_argument('--corpus-include-string', dest='corpus_include_string',
36 |                     help='Optionally restrict the input corpus to URLs that match the corpus include string.')
37 | 
38 | args = parser.parse_args()
39 | 
40 | result = defaultdict(list)
41 | for no, fname in enumerate(glob(args.pre_gold_document_path + "*.json")):
42 |     with open(fname, "r") as f:
43 |         forum = load(f)
44 |         if args.corpus_include_string and args.corpus_include_string not in forum['url']:
45 |             continue
46 | 
47 |         logging.info("Remove unused links for " + forum['url'])
48 |         link_user = set(x['user']['surface_form'] for x in forum['gold_standard_annotation'] if
49 |                         'user' in x)
50 |         link_post = set(x['post_link']['surface_form'] for x in forum['gold_standard_annotation'] if
51 |                         'post_link' in x)
52 | 
53 |         forum['text'] = remove_unused_links(forum['text'], link_user.union(link_post))
54 |         with open(fname, "w") as f2:
55 |             dump(forum, f2, indent=True)
56 | 
--------------------------------------------------------------------------------
/src/harvest/cleanup/forum_post.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | '''
 4 | Removes common prefixes and suffixes from forum posts.
 5 | '''
 6 | 
 7 | import logging
 8 | 
 9 | def compute_common_suffix_count(post_list):
10 |     '''
11 |     Returns:
12 |         int: The number of common suffix terms.
13 |     '''
14 |     confirmed_suffix_terms = []
15 |     for suffix_term in reversed(post_list[0].split(' ')):
16 |         new_suffix = ' ' + ' '.join([suffix_term] + confirmed_suffix_terms)
17 |         for post in post_list:
18 |             if not post.endswith(new_suffix):
19 |                 return len(confirmed_suffix_terms)
20 |         confirmed_suffix_terms.insert(0, suffix_term)
21 | 
22 |     return len(confirmed_suffix_terms)
23 | 
24 | 
25 | def remove_suffix(post_list):
26 |     '''
27 |     Removes common suffixes from the posts in the list.
28 |     '''
29 |     suffix_count = compute_common_suffix_count(post_list)
30 |     if suffix_count == 0:
31 |         return post_list
32 |     return [' '.join(posts.split(' ')[:-suffix_count]) for posts in post_list]
33 | 
34 | 
35 | def compute_common_prefix_count(post_list):
36 |     '''
37 |     Returns:
38 |         int: The number of common prefix terms.
39 |     '''
40 |     confirmed_prefix_terms = []
41 |     for prefix_term in post_list[0].split(' '):
42 |         new_prefix = ' '.join(confirmed_prefix_terms + [prefix_term]) + ' '
43 |         for post in post_list:
44 |             if not post.startswith(new_prefix):
45 |                 return len(confirmed_prefix_terms)
46 |         confirmed_prefix_terms.append(prefix_term)
47 | 
48 |     return len(confirmed_prefix_terms)
49 | 
50 | 
51 | def remove_prefix(post_list):
52 |     '''
53 |     Removes common prefixes from the posts in the list.
54 |     '''
55 |     prefix_count = compute_common_prefix_count(post_list)
56 |     if prefix_count == 0:
57 |         return post_list
58 |     return [' '.join(posts.split(' ')[prefix_count:]) for posts in post_list]
59 | 
60 | 
61 | def remove_boilerplate(post_list):
62 |     '''
63 |     Removes common prefixes and suffixes from the posts in the list.
64 |     '''
65 |     prefix_count = compute_common_prefix_count(post_list)
66 |     suffix_count = compute_common_suffix_count(post_list)
67 |     logging.info(f'Removing {prefix_count} prefix and {suffix_count} suffix terms.')
68 |     if prefix_count == 0 and suffix_count == 0:
69 |         return post_list
70 |     suffix_count = -suffix_count if suffix_count != 0 else None
71 |     return [' '.join(posts.split(' ')[prefix_count:suffix_count]) for posts in post_list]
72 | 
--------------------------------------------------------------------------------
/data/forum/test-urls.lst:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## approach: mirror all - extract content from forum posts
 3 | ##           provide flags to obtain views that are accessible to the crawler
 4 | ##           - problems: recrawls
 5 | ##           - disambiguation and URL assignment (just number posts and add a #n)?
 6 | 
 7 | 
 8 | ##
 9 | ## multiple sclerosis
10 | ##
11 | 
12 | # overview pages and comments on forum posts
13 | https://www.medhelp.org/posts/Multiple-Sclerosis/Positive-ANA-Test/show/1123552
14 | # overview pages
15 | https://www.msconnection.org/Discussions/f33/t77364/tp1/How-long-is-too-long-to-wait-for-an-initial-con
16 | https://shift.ms/topic/cbd-oil-11
17 | # not a real forum, but rather a blog with single posts (!) - very hard, since the layout is absolutely non-standard
18 | https://www.uninterrupted.org.au/blog-category/my-ms-journey
19 | # nested overview pages
20 | https://www.msworld.org/forum/showthread.php?145403-Sort-of-new-here
21 | # easily parseable view; the default is a tree structure that is hardly parseable
22 | https://www.amsel.de/multiple-sklerose-forum/?tnr=1&mnr=217239&archiv_flag=2&fv=1
23 | 
24 | ##
25 | ## emerging viruses
26 | ##
27 | https://www.medhelp.org/posts/Infectious-Diseases/Noro-or-other-virus/show/1881254
28 | 
29 | ##
30 | ## parkinson
31 | ##
32 | 
33 | # provides json-ld
34 | https://healthunlocked.com/parkinsonsmovement/posts/142058845/artane-anyone
35 | # post overview page
36 | https://www.healingwell.com/community/default.aspx?f=34&m=4099304
37 | # distinguishes between questions and posts >>> problem: questions are not part of the forum structure
38 | https://www.medhelp.org/posts/Heart-Disease/Wolfe-Parkinson-White-Syndrome/show/250747
39 | # mailing list
40 | https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5231
41 | 
42 | 
43 | ##
44 | ## angelman
45 | ##
46 | 
47 | # implementation via google search not feasible; but direct mirroring of the forum's content // first post missing
48 | https://www.mumsnet.com/Talk/pregnancy/3749275-Pregnant-with-a-black-mixed-race-with-black-baby
49 | # DONE & works - overview pages
50 | http://blog.angelman-asa.org/read.php?2,736
51 | # beautiful standard forum :)))
52 | https://forums.maladiesraresinfo.org/post11011.html#p11011
53 | 
54 | ##
55 | ## COPD - not yet classified; choosing some samples only
56 | ##
57 | 
58 | https://www.medschat.com/Discuss/how-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm?srcq=copd
59 | # search interface has completely changed
60 | https://community.scope.org.uk/discussion/57774/copd
61 | 
--------------------------------------------------------------------------------
/src/harvest/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Automatic extraction of forum posts and metadata is a challenging task since forums do not expose their content in a
 3 | standardized structure. Harvest performs this task reliably for many web forums and offers an easy way to extract data
 4 | from web forums.
 5 | Example::
 6 |     from urllib.request import urlopen
 7 |     from harvest import extract_data
 8 |     url = 'https://forum.videolan.org/viewtopic.php?f=14&t=145604'
 9 |     html = urlopen(url).read().decode('utf-8')
10 |     result = extract_data(html, url)
11 |     print(result)
12 | """
13 | 
14 | __author__ = 'Albert Weichselbraun, Roger Waldvogel'
15 | __author_email__ = 'albert.weichselbraun@fhgr.ch, roger.waldvogel@fhgr.ch'
16 | __copyright__ = '2019-2020 Albert Weichselbraun, Roger Waldvogel'
17 | __license__ = 'Apache-2.0'
18 | __version__ = '1.1.0'
19 | __status__ = 'Prototype'
20 | 
21 | try:
22 |     import re
23 |     from lxml.html import fromstring
24 | 
25 |     from harvest import posts
26 |     from harvest.extract import extract_posts
27 | 
28 | except ImportError:
29 |     import warnings
30 | 
31 |     warnings.warn(
32 |         "Missing dependencies - harvest has not been properly installed")
33 | 
34 | RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')
35 | 
36 | 
37 | def extract_data(html, url):
38 |     """
39 |     Extracts posts and their metadata from a forum page.
40 |     Args:
41 |         html (string): the HTML of the web forum page
42 |         url (string): the URL of the page
43 |     Returns:
44 |         Dictionary: the extracted posts with their metadata
45 |     """
46 |     extract_post_result = posts.extract_posts(html, url)
47 |     extraction_results = extract_posts(html, url, extract_post_result['text_xpath_pattern'],
48 |                                        extract_post_result['url_xpath_pattern'],
49 |                                        extract_post_result['date_xpath_pattern'],
50 |                                        extract_post_result['user_xpath_pattern'],
51 |                                        result_as_datetime=False)
52 | 
53 |     final_results = []
54 |     for extraction_result in extraction_results:
55 |         entity = {'post_text': extraction_result.post}
56 |         if hasattr(extraction_result, 'date'):
57 |             entity['datetime'] = extraction_result.date
58 |         if hasattr(extraction_result, 'url'):
59 |             entity['post_link'] = extraction_result.url
60 |         if hasattr(extraction_result, 'user'):
61 |             entity['user'] = extraction_result.user
62 |         final_results.append(entity)
63 | 
64 |     return {"posts": final_results}
65 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/calculate_position.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from fuzzywuzzy import fuzz
 3 | 
 4 | logging.getLogger().setLevel(logging.INFO)
 5 | 
 6 | 
 7 | def _add_start_end_fuzzy_search(element_to_add, text, sub_text, start_index, move_text_left=0):
 8 |     # anchor the fuzzy match on the first 60 characters of the post text
 9 |     # (shifted right by move_text_left) and on its last 60 characters
10 |     sub_text_start = sub_text[move_text_left:move_text_left + 60]
11 |     sub_text_end = sub_text[-60:]
12 |     sub_text_start_index = text.find(sub_text_start, start_index)
13 |     if sub_text_start_index > -1:
14 |         sub_text_end_index = text.find(sub_text_end, sub_text_start_index)
15 |         if sub_text_end_index > -1:
16 |             sub_text_end_index = sub_text_end_index + len(sub_text_end)
17 |             matched_text = text[sub_text_start_index:sub_text_end_index]
18 |             if fuzz.ratio(matched_text, sub_text) > 70:
19 |                 element_to_add['start'] = sub_text_start_index
20 |                 element_to_add['end'] = sub_text_end_index
21 |                 return element_to_add['end']
22 |     elif move_text_left < len(sub_text) - 60:
23 |         # the start anchor was not found; shift it to the right and retry
24 |         return _add_start_end_fuzzy_search(element_to_add, text, sub_text, start_index, move_text_left + 20)
25 |     logging.warning(f'Not found in text:\n{sub_text}')
26 |     return -1
27 | 
28 | 
29 | def _add_start_end(element_to_add, text, sub_text, start_index, fuzzy_search=False):
30 |     if isinstance(sub_text, str):
31 |         found_start_index = text.find(sub_text, start_index)
32 |         if found_start_index > -1:
33 |             element_to_add['start'] = found_start_index
34 |             element_to_add['end'] = found_start_index + len(sub_text)
35 |             return found_start_index + len(sub_text)
36 |         elif fuzzy_search and len(sub_text) > 150:
37 |             return _add_start_end_fuzzy_search(element_to_add, text, sub_text, start_index)
38 |         else:
39 |             logging.warning(f'Not found in text:\n{sub_text}')
40 |             return -1
41 |     return -1
42 | 
43 | 
44 | def get_start_end_for_post(post, full_text, search_start_index, fuzzy_search=False):
45 |     index_post_text = _add_start_end(post['post_text'], full_text, post['post_text']['surface_form'],
46 |                                      search_start_index, fuzzy_search)
47 | 
48 |     if 'datetime' in post:
49 |         _add_start_end(post['datetime'], full_text,
50 |                        post['datetime']['surface_form'], search_start_index)
51 |     if 'user' in post:
52 |         _add_start_end(post['user'], full_text,
53 |                        post['user']['surface_form'], search_start_index)
54 |     if 'post_link' in post:
55 |         _add_start_end(post['post_link'], full_text,
56 |                        post['post_link']['surface_form'], search_start_index)
57 |     # _add_start_end already returns the end offset of the post text (or -1)
58 |     return index_post_text
59 | 
--------------------------------------------------------------------------------
/scripts/webservice.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module provides a web interface for orbis-eval [https://github.com/orbis-eval],
 3 | which computes the recall, precision and F1 scores of the extraction.
4 | """ 5 | 6 | from flask import Flask 7 | from flask import request 8 | from flask import jsonify 9 | 10 | import harvest.posts as posts 11 | import harvest.extract as extract 12 | from corpus.createGoldDocuments.calculate_position import get_start_end_for_post 13 | 14 | app = Flask('harvest') 15 | 16 | 17 | @app.route('/extract_from_html', methods=['POST']) 18 | def events(): 19 | forum = request.json 20 | post_0 = posts.extract_posts(forum['html'], forum['url']) 21 | 22 | if 'gold_standard_format' in forum and forum['gold_standard_format']: 23 | results = [] 24 | else: 25 | results = {'entities': {}} 26 | if post_0['text_xpath_pattern']: 27 | search_start_index = 0 28 | for post_1 in extract.extract_posts( 29 | forum['html'], 30 | forum['url'], 31 | post_0['text_xpath_pattern'], 32 | post_0['url_xpath_pattern'], 33 | post_0['date_xpath_pattern'], 34 | post_0['user_xpath_pattern'], result_as_datetime=False): 35 | 36 | post_dict = { 37 | 'user': {'surface_form': post_1.user}, 38 | 'datetime': {'surface_form': post_1.date}, 39 | 'post_link': {'surface_form': post_1.url}, 40 | 'post_text': {'surface_form': post_1.post} 41 | } 42 | 43 | doc_id = forum['url'] 44 | 45 | if 'gold_standard_format' in forum and forum['gold_standard_format']: 46 | results.append(post_dict) 47 | else: 48 | if 'text' in forum: 49 | new_search_start_index = get_start_end_for_post(post_dict, forum['text'], search_start_index, 50 | fuzzy_search=True) 51 | if new_search_start_index > 0: 52 | search_start_index = new_search_start_index 53 | 54 | results['entities'][doc_id] = results['entities'].get(doc_id, []) 55 | for item in ['user', 'datetime', 'post_link', 'post_text']: 56 | result = { 57 | 'doc_id': doc_id, 58 | 'type': item, 59 | 'surface_form': post_dict[item]['surface_form'] 60 | } 61 | if 'start' in post_dict[item] and 'end' in post_dict[item]: 62 | result['start'] = post_dict[item]['start'] 63 | result['end'] = post_dict[item]['end'] 64 | 65 | results['entities'][doc_id].append(result) 66 | 67 | return jsonify(results) 68 | 69 | 70 | def get_flask_app(): 71 | return app 72 | 73 | 74 | if __name__ == '__main__': 75 | app.run(port=5000, debug=True) 76 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/script/pre_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import gzip 5 | import logging 6 | import hashlib 7 | 8 | from glob import glob 9 | from json import load 10 | from inscriptis import get_text 11 | from inscriptis.model.config import ParserConfig 12 | from collections import defaultdict 13 | from harvest import posts 14 | from harvest.extract import extract_posts 15 | from urllib.parse import urlparse 16 | 17 | from corpus.createGoldDocuments.file import write_to_json 18 | 19 | logging.getLogger().setLevel(logging.INFO) 20 | 21 | parser = argparse.ArgumentParser(description='Forum harvester - generate gold standard document for further processing') 22 | parser.add_argument('corpus_path', metavar='corpus_path', help='Path to the input corpus') 23 | parser.add_argument('--result-directory', dest='result_directory', help='Optional directory for storing json results.') 24 | parser.add_argument('--corpus-include-string', dest='corpus_include_string', 25 | help='Optionally restrict the input corpus to URLs that match the corpus include string.') 26 | 27 | args = parser.parse_args() 28 | 29 | result = defaultdict(list) 30 | for no, fname in 
enumerate(glob(args.corpus_path + "*.json.gz")):
31 |     opener = gzip.open if fname.endswith(".gz") else open
32 |     with opener(fname) as f:
33 |         forum = load(f)
34 |         domain = urlparse(forum['url']).netloc
35 |         if args.corpus_include_string and args.corpus_include_string not in forum['url']:
36 |             continue
37 | 
38 |         logging.info("Processing " + forum['url'])
39 |         postXPath = posts.extract_posts(forum['html'], forum['url'])
40 |         if postXPath['text_xpath_pattern']:
41 |             config = ParserConfig(display_links=True, display_anchors=True)
42 |             text = get_text(forum['html'], config)
43 |             text = " ".join([c.strip() for c in text.split("\n") if c.strip()])
44 |             document = {"id": f"i{int(hashlib.md5(forum['url'].encode('utf-8')).hexdigest(), 16)}",
45 |                         "url": forum['url'], "html": forum['html'], "text": text, "gold_standard_annotation": []}
46 | 
47 |             if args.result_directory:
48 |                 for post in extract_posts(forum['html'], forum['url'],
49 |                                           postXPath['text_xpath_pattern'],
50 |                                           postXPath['url_xpath_pattern'],
51 |                                           postXPath['date_xpath_pattern'],
52 |                                           postXPath['user_xpath_pattern'], result_as_datetime=False):
53 |                     post_element = {"post_text": {"surface_form": post.post},
54 |                                     "datetime": {"surface_form": post.date},
55 |                                     "user": {"surface_form": post.user}}
56 |                     if postXPath['url_xpath_pattern']:
57 |                         post_element["post_link"] = {"surface_form": post.url}
58 |                     document["gold_standard_annotation"].append(post_element)
59 | 
60 |                 write_to_json(forum['url'], args.result_directory, document)
61 |         else:
62 |             logging.error(f'Could not process {forum["url"]}')
63 | 
--------------------------------------------------------------------------------
/scripts/extract_to_csv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import gzip
 5 | import logging
 6 | import os
 7 | 
 8 | from glob import glob
 9 | from json import load, dump
10 | from csv import writer
11 | from collections import defaultdict
12 | from harvest import posts
13 | from harvest.extract import extract_posts
14 | from urllib.parse import urlparse
15 | 
16 | logging.getLogger().setLevel(logging.INFO)
17 | 
18 | 
19 | def extract_to_csv():
20 |     parser = argparse.ArgumentParser(
21 |         description='Forum harvester - extracts and harvests posts + metadata from Web forums')
22 |     parser.add_argument('corpus_path', metavar='corpus_path', help='Path to the input corpus')
23 |     parser.add_argument('output_file', metavar='output_file', help='Output file for the parser\'s results.')
24 | 
25 |     parser.add_argument('--result-directory', dest='result_directory',
26 |                         help='Optional directory for storing CSV results.')
27 |     parser.add_argument('--debug-directory', dest='debug_directory', help='Optional directory for debug information.')
28 |     parser.add_argument('--corpus-include-string', dest='corpus_include_string',
29 |                         help='Optionally restrict the input corpus to URLs that match the corpus include string.')
30 | 
31 |     args = parser.parse_args()
32 | 
33 |     result = defaultdict(list)
34 | 
35 |     for no, fname in enumerate(glob(args.corpus_path + "*.json.gz")):
36 |         logging.info(fname)
37 |         opener = gzip.open if fname.endswith(".gz") else open
38 |         with opener(fname) as f:
39 |             forum = load(f)
40 |             domain = urlparse(forum['url']).netloc
41 |             if args.corpus_include_string and args.corpus_include_string not in forum['url']:
42 |                 continue
43 | 
44 |             if args.debug_directory:
45 |                 debug_fname = os.path.join(args.debug_directory, "{}-{}.html".format(no, domain))
46 |                 with open(debug_fname, "w") as g:
47 |                     
g.write(forum['html']) 48 | 49 | logging.info("Processing " + forum['url']) 50 | extract_post_result = posts.extract_posts(forum['html'], forum['url']) 51 | result[domain].append(extract_post_result) 52 | 53 | if args.result_directory and extract_post_result['text_xpath_pattern']: 54 | result_fname = os.path.join(args.result_directory, f'{domain}.csv') 55 | with open(result_fname, 'a+') as g: 56 | csvwriter = writer(g) 57 | if os.stat(result_fname).st_size == 0: 58 | csvwriter.writerow(['forum_link', 'post_link', 'user', 'date', 'post']) 59 | for post in extract_posts(forum['html'], forum['url'], 60 | extract_post_result['text_xpath_pattern'], 61 | extract_post_result['url_xpath_pattern'], 62 | extract_post_result['date_xpath_pattern'], 63 | extract_post_result['user_xpath_pattern']): 64 | csvwriter.writerow([forum['url'], post.url, post.user, post.date, post.post]) 65 | 66 | with open(args.output_file, "w") as f: 67 | dump(result, f, indent=True) 68 | 69 | 70 | if __name__ == '__main__': 71 | extract_to_csv() 72 | -------------------------------------------------------------------------------- /src/harvest/similarity_calculator.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | from harvest.utils import get_xpath_tree_text 4 | import logging 5 | import re 6 | import numpy as np 7 | 8 | VSM_MODEL_SIZE = 5000 9 | 10 | # tags that are not allowed to be part of a forum xpath (lowercase) 11 | BLACKLIST_TAGS = ('option', 'footer', 'form', 'head', 'tfoot') 12 | REWARDED_CLASSES = ('content', 'message', 'post', 'wrapper') 13 | 14 | 15 | def _text_to_vsm(text): 16 | ''' 17 | translates a text into the vector space model 18 | using the hashing trick. 19 | 20 | VSM_MODEL_SIZE determines the size of the vsm. 21 | ''' 22 | vms = np.full(VSM_MODEL_SIZE, 0) 23 | for word in text.split(): 24 | index = word.__hash__() % VSM_MODEL_SIZE 25 | vms[index] += 1 26 | return vms 27 | 28 | 29 | def _descendants_contain_blacklisted_tag(xpath, dom, blacklisted_tags): 30 | descendants = set([t.tag for t in chain(*[e.iterdescendants() for e in dom.xpath(xpath)])]) 31 | for tag in blacklisted_tags: 32 | if tag in descendants: 33 | return True 34 | return False 35 | 36 | 37 | def _ancestors_contains_blacklisted_tag(xpath_string, blacklisted_tags): 38 | """ 39 | returns 40 | ------- 41 | True, if the xpath_string (i.e. the ancestors) contains any blacklisted_tag 42 | """ 43 | xpath = xpath_string.split("/") 44 | for tag in blacklisted_tags: 45 | if tag in xpath: 46 | return True 47 | return False 48 | 49 | 50 | def _ancestors_contains_class(xpath, rewarded_classes): 51 | classes_x_path = re.findall(r"(?!.*\[)@class=\".*\"", xpath) 52 | if classes_x_path: 53 | classes = [x.lower() for x in list(filter(None, re.sub(r"@class=|\"", "", classes_x_path[-1]).split(" ")))] 54 | for html_class in classes: 55 | for rewarded_class in rewarded_classes: 56 | if rewarded_class in html_class: 57 | return True 58 | 59 | 60 | def assess_node(reference_content, dom, xpath, reward_classes=False): 61 | """ 62 | returns 63 | ------- 64 | a metric that is based on 65 | (i) the vector space model and 66 | (ii) the number of returned elements 67 | (iii) whether the descendants contain any blacklisted tags 68 | to assess whether the node is likely to be part of a forum post. 
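    The metric is returned as a tuple: (similarity score, number of elements matching the given xpath).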
69 | """ 70 | if xpath == "//" or _descendants_contain_blacklisted_tag(xpath, dom, BLACKLIST_TAGS): 71 | return 0., 1 72 | 73 | xpath_content_list = get_xpath_tree_text(dom, xpath) 74 | xpath_element_count = len(xpath_content_list) 75 | 76 | reference_vsm = _text_to_vsm(reference_content) 77 | xpath_vsm = _text_to_vsm(' '.join(xpath_content_list)) 78 | 79 | divisor = (np.linalg.norm(reference_vsm) * np.linalg.norm(xpath_vsm)) 80 | if not divisor: 81 | logging.warning("Cannot compute similarity - empty reference (%s) or xpath (%ss) text.", reference_content, 82 | ' '.join(xpath_content_list)) 83 | return 0., 1 84 | similarity = np.dot(reference_vsm, xpath_vsm) / divisor 85 | 86 | # discount any node that contains BLACKLIST_TAGS 87 | if _ancestors_contains_blacklisted_tag(xpath, BLACKLIST_TAGS): 88 | similarity /= 10 89 | elif reward_classes and _ancestors_contains_class(xpath, REWARDED_CLASSES): 90 | similarity += 0.1 91 | return similarity, xpath_element_count 92 | -------------------------------------------------------------------------------- /src/harvest/metadata/link.py: -------------------------------------------------------------------------------- 1 | """ 2 | link 3 | ---- 4 | 5 | Tries to obtain the URL of the given post 6 | determine post URL 7 | ------------------ 8 | * relevant tags: (href or name) 9 | * point to the same domain, or even better also to the same page (without parameters) 10 | * appear always in the same element 11 | """ 12 | import logging 13 | import re 14 | 15 | from collections import defaultdict 16 | from urllib.parse import urlparse, urljoin 17 | 18 | from harvest.utils import get_xpath_expression, get_xpath_expression_child_filter, get_merged_xpath, extract_text 19 | 20 | 21 | def _get_without_post_link(path): 22 | """ 23 | Used to handle case cases like post link /threads/deviantart-horrors.2366/post-145153 with forum link 24 | /threads/deviantart-horrors.2366 25 | Args: 26 | path: 27 | 28 | Returns: 29 | 30 | """ 31 | path_elements = path.split('/') 32 | if len([x for x in path_elements if x.strip() != '']) > 2: 33 | new_path = "/".join(path_elements[:-1]) 34 | return new_path 35 | 36 | return path 37 | 38 | 39 | def _get_link_representation(element): 40 | if extract_text(element): 41 | return extract_text(element) 42 | elif 'href' in element.attrib: 43 | return element.attrib['href'] 44 | return '' 45 | 46 | 47 | def _is_counting_up(candidates): 48 | for xpath, matches in candidates.items(): 49 | post_ids = [re.search(r'\d+', _get_link_representation(x)) for x in matches['elements']] 50 | if all(post_ids): 51 | post_ids = [int(x.group(0)) for x in post_ids] 52 | if all(x < y for x, y in zip(post_ids, post_ids[1:])): 53 | matches['score'] += 1 54 | 55 | 56 | def _get_link(dom, post_elements, base_url, forum_posts): 57 | ''' 58 | Obtains the URL to the given post. 
59 | ''' 60 | url_candidates = defaultdict(lambda: {'elements': [], 61 | 'has_anchor_tag': False, 'score': 0}) 62 | 63 | # collect candidate paths 64 | for element in post_elements: 65 | for tag in element.iterdescendants(): 66 | if tag.tag == 'a': 67 | xpath = get_xpath_expression(tag) 68 | xpath += get_xpath_expression_child_filter(tag) 69 | # anchor tags with the name attribute will 70 | # lead to the post 71 | attributes = list(attr.lower() for attr in tag.attrib) 72 | if 'name' in attributes: 73 | url_candidates[xpath]['has_anchor_tag'] = True 74 | if 'name' in attributes or 'href' in attributes: 75 | url_candidates[xpath]['elements'].append(tag) 76 | 77 | # merge xpath 78 | for merged_xpath in get_merged_xpath(url_candidates.keys()): 79 | merged_elements = dom.xpath(merged_xpath) 80 | if merged_elements: 81 | url_candidates[merged_xpath]['elements'] = merged_elements 82 | if 'name' in (attr.lower() for attr in merged_elements[0].attrib): 83 | url_candidates[merged_xpath]['has_anchor_tag'] = True 84 | 85 | # filter candidate paths 86 | for xpath, matches in list(url_candidates.items()): 87 | # consider the number of posts or the number of posts + 2 spare for possible header elements 88 | if len(forum_posts) - len(matches['elements']) not in range(0, 3): 89 | del url_candidates[xpath] 90 | 91 | # filter candidates that contain URLs to other domains and 92 | # record the urls' targets 93 | forum_url = urlparse(base_url) 94 | for xpath, matches in list(url_candidates.items()): 95 | for match in matches['elements']: 96 | parsed_url = urlparse(urljoin(forum_url.scheme + "://" + forum_url.netloc, match.attrib.get('href', ''))) 97 | if parsed_url.netloc != forum_url.netloc: 98 | del url_candidates[xpath] 99 | break 100 | 101 | if _get_without_post_link(parsed_url.path) not in forum_url.path: 102 | del url_candidates[xpath] 103 | break 104 | 105 | _is_counting_up(url_candidates) 106 | 107 | # obtain the most likely url path 108 | for xpath, _ in sorted(url_candidates.items(), 109 | key=lambda x: (x[1]['has_anchor_tag'], x[1]['score']), 110 | reverse=True): 111 | return xpath 112 | 113 | return None 114 | 115 | 116 | def get_link(dom, post_xpath, base_url, forum_posts): 117 | ''' 118 | Args: 119 | dom: The DOM tree to analyze. 120 | post_xpath (str): xpath of the post to search dates. 121 | base_url (str): URL of the forum. 122 | Returns: 123 | str: the xpath to the post date. 124 | ''' 125 | 126 | logging.info('Start finding post link') 127 | post_elements = dom.xpath(post_xpath) 128 | while True: 129 | result = _get_link(dom, post_elements, base_url, forum_posts) 130 | if result or len(post_elements) <= 1: 131 | logging.info(f'Post link xpath: {result}') 132 | return result 133 | post_xpath = post_xpath + "/.." 
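        # no link xpath found yet - widen the scope to the posts' parent elements and retry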
134 |         post_elements = dom.xpath(post_xpath) 135 | -------------------------------------------------------------------------------- /src/harvest/metadata/date.py: -------------------------------------------------------------------------------- 1 | ''' 2 | date 3 | ---- 4 | 5 | Tries to obtain the date of the given post 6 | ''' 7 | import logging 8 | 9 | from collections import defaultdict 10 | from datetime import datetime 11 | from harvest.date_search import search_dates 12 | from dateutil import parser 13 | from lxml import etree 14 | 15 | from harvest.utils import (get_xpath_expression, get_cleaned_element_text, get_xpath_expression_child_filter, 16 |                            get_merged_xpath) 17 | 18 | MAX_DATE_LEN = 120 19 | 20 | 21 | def _get_date(dom, post_elements, base_url, forum_posts): 22 |     date_candidates = defaultdict(lambda: {'elements': [], 23 |                                            'most_recent_date': datetime.fromtimestamp(0),  # 1970 24 |                                            'lowermost_date': datetime.fromtimestamp(1E11),  # >5000 25 |                                            'chronological_order': True, 26 |                                            'same_size_posts': False, 27 |                                            'multiple_dates': False}) 28 |     # collect candidate paths 29 |     for element in post_elements: 30 |         for tag in element.iterdescendants(): 31 |             text = get_cleaned_element_text(tag) 32 |             # do not consider text longer than MAX_DATE_LEN relevant for date extraction 33 | 34 |             if (len(text) > MAX_DATE_LEN or not search_dates(text) or 35 |                     tag.tag is etree.Comment) and not (tag.tag == 'time' and 'datetime' in tag.attrib): 36 |                 continue 37 | 38 |             xpath = get_xpath_expression(tag, parent_element=element, single_class_filter=True) 39 |             xpath += get_xpath_expression_child_filter(tag) 40 |             date_candidates[xpath]['elements'].append(tag) 41 | 42 |     # merge xpath 43 |     for merged_xpath in get_merged_xpath(date_candidates.keys()): 44 |         merged_elements = dom.xpath(merged_xpath) 45 |         if merged_elements: 46 |             date_candidates[merged_xpath]['elements'] = merged_elements 47 | 48 |     # filter candidate paths that do not yield a date for every post 49 |     for xpath, matches in list(date_candidates.items()): 50 |         # consider the number of posts or the number of posts + 2 spare for possible header elements 51 |         if len(forum_posts) - len(matches['elements']) not in range(0, 3): 52 |             del date_candidates[xpath] 53 | 54 |     # Set if same length as posts 55 |     for xpath, matches in list(date_candidates.items()): 56 |         if len(forum_posts) == len(matches['elements']): 57 |             matches['same_size_posts'] = True 58 | 59 |     # rank candidates based on the following criteria 60 |     # - they must yield a date for every post 61 |     # - we choose the candidate with the most recent date 62 |     #   (to distinguish between "post" and "member since" dates) 63 |     for xpath, matches in list(date_candidates.items()): 64 |         previous_date = datetime.min 65 |         for match in matches['elements']: 66 |             if match.tag == 'time': 67 |                 time = match.attrib.get('datetime', '') 68 |                 extracted_dates = [(time, parser.parse(time, ignoretz=True))] 69 |             else: 70 |                 extracted_dates = search_dates(get_cleaned_element_text(match)) 71 | 72 |             if not extracted_dates: 73 |                 del date_candidates[xpath] 74 |                 break 75 | 76 |             if len(extracted_dates) > 1: 77 |                 date_candidates[xpath]['multiple_dates'] = True 78 |                 date_candidates[xpath]['most_recent_date'] = max(date_candidates[xpath]['most_recent_date'], 79 |                                                                  max([date[1] for date in extracted_dates])) 80 |                 date_candidates[xpath]['lowermost_date'] = min(date_candidates[xpath]['lowermost_date'], 81 |                                                                min([date[1] for date in extracted_dates])) 82 | 83 |                 if previous_date > max([date[1] for
date in extracted_dates]): 84 | date_candidates[xpath]['chronological_order'] = False 85 | else: 86 | date_candidates[xpath]['most_recent_date'] = max(date_candidates[xpath]['most_recent_date'], 87 | extracted_dates[0][1]) 88 | date_candidates[xpath]['lowermost_date'] = min(date_candidates[xpath]['lowermost_date'], 89 | extracted_dates[0][1]) 90 | if previous_date > extracted_dates[0][1]: 91 | date_candidates[xpath]['chronological_order'] = False 92 | 93 | previous_date = date_candidates[xpath]['most_recent_date'] 94 | 95 | # obtain the most likely url path 96 | for xpath, _ in sorted(date_candidates.items(), 97 | key=lambda x: (x[1]['same_size_posts'], x[1]['chronological_order'], 98 | x[1]['most_recent_date']), 99 | reverse=True): 100 | return xpath 101 | 102 | return None 103 | 104 | 105 | # strategy 106 | # -------- 107 | # * obtain all xpaths that have date information 108 | # - extract the one which contains most likely the date (otherwise no date-xpath is returned) 109 | 110 | # * extract all dates from the date-xpath 111 | # * select the one that 112 | # - uses the same format and 113 | # - are newer (!= join date) 114 | 115 | def get_date(dom, post_xpath, base_url, forum_posts): 116 | ''' 117 | Args: 118 | dom: The DOM tree to analyze. 119 | post_xpath (str): xpath of the post to search dates. 120 | base_url (str): URL of the forum. 121 | Returns: 122 | str: the xpath to the post date. 123 | ''' 124 | logging.info('Start finding post date') 125 | post_elements = dom.xpath(post_xpath) 126 | while True: 127 | result = _get_date(dom, post_elements, base_url, forum_posts) 128 | if result or len(post_elements) <= 1: 129 | logging.info(f'Post date xpath: {result}') 130 | return result 131 | post_xpath = post_xpath + "/.." 132 | post_elements = dom.xpath(post_xpath) 133 | -------------------------------------------------------------------------------- /tests/integration/harvest/test_posts_xpath.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import gzip 4 | from json import load 5 | from harvest.posts import extract_posts 6 | 7 | 8 | @pytest.fixture 9 | def load_test_data(): 10 | def _load_test_data(file_name): 11 | file_path = os.path.join(os.path.dirname(__file__), '../../../data/forum', file_name) 12 | with gzip.open(file_path) as f: 13 | return load(f) 14 | 15 | return _load_test_data 16 | 17 | 18 | def test_extract_posts_forum_shift_ms(load_test_data): 19 | forum_test_data = load_test_data("https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz") 20 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 21 | 22 | assert post['url'] == 'https://shift.ms/topic/cbd-oil-11' 23 | assert post['xpath_pattern'] == '//div[@class="bbp-reply-content"]/../..' 24 | assert post['url_xpath_pattern'] is None 25 | assert post['date_xpath_pattern'] == '//div/div/div[@class="bbp-reply-date"][not(*) and string-length(text()) > 0]' 26 | assert post['user_xpath_pattern'] == \ 27 | '//div/div/a[@class="bbp-author-name"][not(*) and string-length(text()) > 0]' 28 | 29 | 30 | def test_extract_posts_forum_healingwell(load_test_data): 31 | forum_test_data = load_test_data( 32 | "https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz") 33 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 34 | 35 | assert post['url'] == 'https://www.healingwell.com/community/default.aspx?f=34&m=4099304' 36 | assert post['xpath_pattern'] == '//div/div[@class="post-body"]/../../..' 
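    # the expected url xpath merges the forum's alternating post-even / post-odd row classes into one expression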
37 | assert post['url_xpath_pattern'] == \ 38 | '//div[(contains(@class, \'post-even\')) or (contains(@class, \'post-odd\'))]/a[not(*) and string-length(text()) = 0]' 39 | assert post['date_xpath_pattern'] == '//div/div/div[@class="posted"][not(*) and string-length(text()) > 0]' 40 | assert post['user_xpath_pattern'] == \ 41 | '//div/div/div/div/div/div/a[@class="user-name"][not(*) and string-length(text()) > 0]' 42 | 43 | 44 | def test_extract_posts_forum_medhelp(load_test_data): 45 | forum_test_data = load_test_data("https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-" 46 | "White-Syndrome%2Fshow%2F250747.json.gz") 47 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 48 | 49 | assert post['url'] == 'https://www.medhelp.org/posts/Heart-Disease/Wolfe-Parkinson-White-Syndrome/show/250747' 50 | assert post['xpath_pattern'] == '//div/div[@class="resp_body "]/..' 51 | assert post['url_xpath_pattern'] is None 52 | assert post[ 53 | 'date_xpath_pattern'] == '//div/div/div/time[@class="mh_timestamp"][not(*) and string-length(text()) = 0]' 54 | assert post['user_xpath_pattern'] == '//div/div/div[@class="username"]/a[span]' 55 | 56 | 57 | def test_extract_posts_forum_medschat(load_test_data): 58 | forum_test_data = load_test_data("https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-" 59 | "Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090" 60 | ".htm%3Fsrcq%3Dcopd.json.gz") 61 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 62 | 63 | assert post['url'] == 'https://www.medschat.com/Discuss/how-important-is-this-medician-G-E-Sulfamethoxazole-' \ 64 | 'TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm?srcq=copd' 65 | assert post['xpath_pattern'] == '//div/span[@class="search_results"]/../..' 66 | assert post['url_xpath_pattern'] == '//a[@class="action_bar_blue"][not(*) and string-length(text()) > 0]' 67 | assert post['date_xpath_pattern'] == '//div/span[@class="small soft"]/time[not(*) and string-length(text()) > 0]' 68 | assert post[ 69 | 'user_xpath_pattern'] == '//div[@class="list_item_b_content"]/strong[not(*) and string-length(text()) > 0]' 70 | 71 | 72 | def test_extract_posts_forum_msconnection(load_test_data): 73 | forum_test_data = load_test_data("https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-" 74 | "is-too-long-to-wait-for-an-initial-con.json.gz") 75 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 76 | 77 | assert post['url'] == 'https://www.msconnection.org/Discussions/f33/t77364/tp1/How-long-is-too-long-to-wait-' \ 78 | 'for-an-initial-con' 79 | assert post['xpath_pattern'] == '//li/div[@class="discussion-post-body"]' 80 | assert post['url_xpath_pattern'] == None 81 | assert post['date_xpath_pattern'] == \ 82 | '//header/div/div[@class="discussion-post-meta-info"]/br[not(*) and string-length(text()) = 0]' 83 | assert post['user_xpath_pattern'] == '//header/div/div/a[@class="PostUser"][not(*) and string-length(text()) > 0]' 84 | 85 | 86 | def test_extract_posts_forum_msworld(load_test_data): 87 | forum_test_data = load_test_data("https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-" 88 | "Sort-of-new-here.json.gz") 89 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 90 | 91 | assert post['url'] == 'https://www.msworld.org/forum/showthread.php?145403-Sort-of-new-here' 92 | assert post['xpath_pattern'] == '//div/blockquote[@class="postcontent restore"]/../../../../..' 
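    # the /../../../../.. chain above climbs from the quoted post content up to the enclosing post container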
93 | assert post['url_xpath_pattern'] == '//a[@class="postcounter"][not(*) and string-length(text()) > 0]' 94 | assert post['date_xpath_pattern'] == '//div/div/span/span[@class="date"][span]' 95 | assert post['user_xpath_pattern'] == \ 96 | '//div/div/div/div/div/a[(contains(@class, \'popupctrl\') and contains(@class, \'username\'))][strong]' 97 | 98 | 99 | def test_extract_posts_forum_uninterrupted(load_test_data): 100 | forum_test_data = load_test_data("https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz") 101 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 102 | 103 | assert post['url'] == 'https://www.uninterrupted.org.au/blog-category/my-ms-journey' 104 | assert post['xpath_pattern'] == '//div[@class="field-content"]/../..' 105 | assert post['url_xpath_pattern'] is None 106 | assert post['date_xpath_pattern'] is None 107 | assert post['user_xpath_pattern'] == '//div/span/a[@class="username"][not(*) and string-length(text()) > 0]' 108 | -------------------------------------------------------------------------------- /src/harvest/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions that are shared across modules. 3 | ''' 4 | 5 | import re 6 | 7 | from lxml import etree 8 | 9 | VALID_NODE_TYPE_QUALIFIERS = ('class',) 10 | RE_FILTER_XML_HEADER = re.compile("<\\?xml version=\".*? encoding=.*?\\?>") 11 | 12 | 13 | def get_html_dom(html_content): 14 | ''' 15 | Params: 16 | html_content: the HTML page to retrieve the DOM from. 17 | 18 | Returns: 19 | The corresponding lxml document object model (DOM). 20 | ''' 21 | html = RE_FILTER_XML_HEADER.sub("", html_content) 22 | return etree.HTML(html) 23 | 24 | 25 | def extract_text(element): 26 | ''' 27 | Returns: 28 | str -- The text for the given element. 29 | ''' 30 | return ' '.join([t.strip() for t in element.itertext() if t.strip()]) 31 | 32 | 33 | def get_xpath_expression_child_filter(element): 34 | """ 35 | Returns: 36 | str -- The xpath expression to filter because of child element 37 | """ 38 | child_filter = "" 39 | children = element.getchildren() 40 | if len(children) == 1 and type(children[0].tag) == str: 41 | child_filter = "[" + children[0].tag + "]" 42 | elif element.text and element.text.strip() and not children: 43 | child_filter = "[not(*) and string-length(text()) > 0]" 44 | elif not element.text and not children: 45 | child_filter = "[not(*) and string-length(text()) = 0]" 46 | return child_filter 47 | 48 | 49 | def get_xpath_combinations_for_classes(x_path): 50 | """ 51 | Returns: 52 | array -- Possible xpath combinations of classes 53 | """ 54 | classes_x_path = re.findall(r"(?!.*\[)@class=\".*\"", x_path) 55 | xpath_combinations = [] 56 | if classes_x_path: 57 | classes = list(filter(None, re.sub(r"@class=|\"", "", classes_x_path[-1]).split(" "))) 58 | for html_class in classes: 59 | xpath_combinations.append( 60 | re.sub(r"(?!.*\[)@class=\".*\"\]", r"contains(concat(' ',@class,' '),' " + html_class + r" ')]", 61 | x_path)) 62 | if len(classes) > 1: 63 | new_classes = " and ".join(["contains(@class, \'" + x + "\')" for x in classes]) + "]" 64 | xpath_combinations.append(re.sub(r"(?!.*\[)@class=\".*\"\]", new_classes, x_path)) 65 | if not xpath_combinations: 66 | xpath_combinations = [x_path] 67 | return xpath_combinations 68 | 69 | 70 | def get_xpath_expression(element, parent_element=None, single_class_filter=False): 71 | ''' 72 | Returns: 73 | str -- The xpath expression for the given comment. 
74 | ''' 75 | xpath_list = [] 76 | has_class_filter = False 77 | 78 | while (not has_class_filter or parent_element is not None and element is not parent_element) \ 79 | and element is not None: 80 | without_class_filter = single_class_filter and has_class_filter 81 | xpath_expression = _get_xpath_element_expression(element, without_class_filter=without_class_filter) 82 | if not has_class_filter and "[" in xpath_expression: 83 | has_class_filter = True 84 | # Todo does this improve the detection overall? 85 | # if not has_class_filter: 86 | # xpath_expression = xpath_expression + "[not(@class)]" 87 | xpath_list.append(xpath_expression) 88 | 89 | element = element.getparent() 90 | 91 | xpath_list.reverse() 92 | return "//" + "/".join(xpath_list) 93 | 94 | 95 | def _get_xpath_element_expression(element, without_class_filter=False): 96 | ''' 97 | Returns: 98 | str -- The xpath expression for the given element. 99 | ''' 100 | attr_filter = None 101 | if not without_class_filter: 102 | attr_filter = " & ".join(['@%s="%s"' % (key, value) 103 | for key, value in element.attrib.items() 104 | if key in VALID_NODE_TYPE_QUALIFIERS]) 105 | return element.tag + "[%s]" % attr_filter if attr_filter else element.tag 106 | 107 | 108 | def get_xpath_tree_text(dom, xpath): 109 | ''' 110 | Args: 111 | xpath (str): The xpath to extract. 112 | Returns: 113 | list -- A list of text obtained by all elements matching the given 114 | xpath. 115 | ''' 116 | return [re.sub(r'\s\s+', ' ', extract_text(element)) for element in dom.xpath(xpath)] 117 | 118 | 119 | def get_cleaned_element_text(element): 120 | ''' 121 | Returns: 122 | str -- the text of the given element (without its children and 123 | punctuation). 124 | ''' 125 | return f'{element.text or ""} {element.tail or ""}'.replace(",", " ") \ 126 | .replace(";", " ").strip() 127 | 128 | 129 | def _get_classes_concat_with_and_condition(classes): 130 | return "(" + " and ".join(["contains(@class, \'" + x + "\')" for x in classes]) + ")" 131 | 132 | 133 | def _get_merged_classes_xpath_condition(classes, classes2): 134 | return "[" + _get_classes_concat_with_and_condition(classes) + " or " + \ 135 | _get_classes_concat_with_and_condition(classes2) + "]" 136 | 137 | 138 | def _get_classes(regex_class_detection, xpath): 139 | """ 140 | Args: 141 | regex_class_detection: regex to detect class 142 | xpath: xpath string to get classes 143 | 144 | Returns: list of classes 145 | """ 146 | classes = re.findall(regex_class_detection, xpath) 147 | return list(filter(None, re.sub(r"@class=|\"|\[|\]", "", classes[0]).split(" "))) 148 | 149 | 150 | def _get_merged_xpath(regex_class_detection, xpath, xpath_to_compare, merged_xpath): 151 | """ 152 | Args: 153 | regex_class_detection: Regex expression to look for class attributes 154 | xpath: xpath string 155 | xpath_to_compare: xpath string to compare with param xpath 156 | merged_xpath: dictionary with already merged xpath 157 | 158 | Returns: merged xpath if possible. 
If no match is found, None is returned 159 | 160 |     """ 161 |     xpath_without_class = re.sub(regex_class_detection, "", xpath) 162 |     xpath_to_compare_without_class = re.sub(regex_class_detection, "", xpath_to_compare) 163 |     if xpath_without_class == xpath_to_compare_without_class and xpath_to_compare not in merged_xpath: 164 |         classes = _get_classes(regex_class_detection, xpath) 165 |         classes_to_compare = _get_classes(regex_class_detection, xpath_to_compare) 166 |         same_classes = list(set(classes).intersection(classes_to_compare)) 167 |         if same_classes: 168 |             same_classes.sort() 169 |             return re.sub(regex_class_detection, "[" + _get_classes_concat_with_and_condition(same_classes) + "]", 170 |                           xpath) 171 | 172 |         if classes and classes_to_compare: 173 |             merged_xpath_classes = _get_merged_classes_xpath_condition(classes, classes_to_compare) 174 |             return re.sub(regex_class_detection, merged_xpath_classes, xpath) 175 | 176 | 177 | def get_merged_xpath(xpaths): 178 |     """ 179 |     Args: 180 |         xpaths: List of xpaths to look for xpaths which can be merged 181 | 182 |     Returns: A list with the merged xpath 183 |     """ 184 |     merged_xpaths = dict() 185 |     regex_class_detection = r"\[@class=\".*\"\]" 186 |     for xpath in xpaths: 187 |         if re.search(regex_class_detection, xpath): 188 |             for xpath_to_compare in [x for x in xpaths if x != xpath]: 189 |                 if re.search(regex_class_detection, xpath_to_compare): 190 |                     merged_xpath = _get_merged_xpath(regex_class_detection, xpath, xpath_to_compare, merged_xpaths) 191 |                     if merged_xpath: 192 |                         merged_xpaths[xpath] = merged_xpath 193 | 194 |     return list(merged_xpaths.values()) 195 | 196 | 197 | def get_grandparent(element): 198 |     if etree.iselement(element) and etree.iselement(element.getparent()) and \ 199 |             etree.iselement(element.getparent().getparent()): 200 |         return element.getparent().getparent() 201 | 202 | 203 | def elements_have_no_overlap(elements): 204 |     for element in elements: 205 |         for element_to_compare in [x for x in elements if x is not element]: 206 |             for child_element in element.iterdescendants(): 207 |                 if element_to_compare is child_element: 208 |                     return False 209 |     return True 210 | -------------------------------------------------------------------------------- /src/harvest/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Extracts posts and metadata from Web forums based on the xpaths provided 5 | by the machine learning component. 6 | 7 | - posts are extracted "as is" and sent through a boilerplate removal component 8 | - URL and user metadata is extracted 9 | - the post date is extracted based on the provided xpath + simple 10 |   pre-processing 11 | ''' 12 | 13 | from collections import namedtuple 14 | from datetime import datetime 15 | from operator import itemgetter 16 | from urllib.parse import urljoin, urlparse 17 | from dateparser.search import search_dates 18 | from dateutil import parser 19 | 20 | from harvest.utils import get_html_dom, get_xpath_tree_text, get_cleaned_element_text, extract_text 21 | 22 | from harvest.cleanup.forum_post import remove_boilerplate 23 | from harvest.config import LANGUAGES 24 | 25 | ExtractionResult = namedtuple('ExtractionResult', ('post', 'url', 'date', 26 |                                                    'user')) 27 | 28 | 29 | def _get_reference_url(element): 30 |     ''' 31 |     Returns either the URL to the given element (if the `name` attribute is 32 |     set) or the URL the element points to (if the `href` attribute is present).
33 | 34 |     Args: 35 |         element: The lxml element from which to extract the URL. 36 | 37 |     Returns: 38 |         str -- The URL that points to the element (name) or that the element 39 |         is pointing to (href) 40 |     ''' 41 |     if 'name' in element.attrib: 42 |         return f"#{element.attrib['name']}" 43 | 44 |     if 'href' in element.attrib: 45 |         return f"{element.attrib['href']}" 46 | 47 |     return None 48 | 49 | 50 | def _get_user_name(element): 51 |     ''' 52 |     Returns the user name for the given element: for anchor elements the 53 |     reference URL (`name` or `href` attribute), otherwise the element's cleaned text. 54 | 55 |     Args: 56 |         element: The lxml element from which to extract the user name. 57 | 58 |     Returns: 59 |         str -- the user name (for anchor elements, the user URL obtained 60 |         via _get_reference_url) 61 |     ''' 62 |     if element.tag == 'a': 63 |         return _get_reference_url(element) 64 |     else: 65 |         return extract_text(element) 66 | 67 | 68 | def _get_date_text(time_element, time_element_as_datetime=True): 69 |     is_tag_time = time_element.tag == 'time' 70 |     if is_tag_time and 'datetime' in time_element.attrib: 71 |         if time_element_as_datetime: 72 |             time = time_element.attrib.get('datetime', '') 73 |             parsed_time = parser.parse(time, ignoretz=True) 74 |             return is_tag_time, parsed_time 75 | 76 |     return is_tag_time, get_cleaned_element_text(time_element) 77 | 78 | 79 | def get_forum_date(dom, post_date_xpath, result_as_datetime=True): 80 |     ''' 81 |     Selects the date present in the given post_date_xpath. Future dates are 82 |     automatically filtered. If no date has been identified for a post, a None 83 |     value is inserted. 84 | 85 |     Args: 86 |         dom: the DOM representation of the forum page. 87 |         post_date_xpath (str): The xpath of the forum date. 88 |         result_as_datetime (bool): If True, the dates are returned as datetime objects; otherwise they are returned as strings. 89 | 90 |     Returns: 91 |         list -- A list of dates for every forum post. 92 |     ''' 93 |     result = [] 94 |     date_mentions = (_get_date_text(e, time_element_as_datetime=result_as_datetime) 95 |                      for e in dom.xpath(post_date_xpath) if 96 |                      e.tag == 'time' or search_dates(_get_date_text(e)[1], languages=LANGUAGES)) 97 |     for is_time_element, date_mention in date_mentions: 98 |         found = None 99 |         if is_time_element: 100 |             found = date_mention 101 |         else: 102 |             for data_as_string, date in sorted( 103 |                     search_dates(date_mention, settings={'RETURN_AS_TIMEZONE_AWARE': False}, languages=LANGUAGES), 104 |                     key=itemgetter(1), reverse=True): 105 |                 if date <= datetime.now(): 106 |                     if result_as_datetime: 107 |                         found = date 108 |                     else: 109 |                         found = data_as_string 110 |                     break 111 |         result.append(found) 112 | 113 |     return result 114 | 115 | 116 | def get_forum_url(dom, post_url_xpath): 117 |     ''' 118 |     Args: 119 |         dom: The DOM representation of the forum page. 120 |         post_url_xpath (str): The xpath to the post URL. 121 | 122 | 123 |     Returns: 124 |         list -- A list of all forum URLs. 125 |     ''' 126 |     return [_get_reference_url(element) 127 |             for element in dom.xpath(post_url_xpath)] 128 | 129 | 130 | def get_forum_user(dom, post_user_xpath): 131 |     ''' 132 |     Args: 133 |         dom: The DOM representation of the forum page. 134 |         post_user_xpath (str): The xpath to the post user name. 135 | 136 | 137 |     Returns: 138 |         list -- A list of all forum user names.
139 |     ''' 140 |     return [_get_user_name(element) 141 |             for element in dom.xpath(post_user_xpath)] 142 | 143 | 144 | def generate_forum_url(url, num_posts): 145 |     ''' 146 |     Generates forum URLs based on the forum base URL and the number of 147 |     posts. 148 | 149 |     Args: 150 |         url (str): the forum URL 151 |         num_posts (int): the number of posts for which to generate a URL 152 |     Returns: 153 |         list -- a list of URLs for the posts. 154 |     ''' 155 |     return [urljoin(url, f'#{no}') for no in range(1, num_posts + 1)] 156 | 157 | 158 | def _get_same_size_as_posts(length_forum_post, forum_element): 159 |     result = forum_element[-length_forum_post:] 160 |     if len(forum_element) != length_forum_post: 161 |         for _ in range(0, length_forum_post - len(forum_element)): 162 |             result.append(forum_element[0]) 163 |     return result 164 | 165 | 166 | def _get_container_elements(dom, xpath, number_of_posts): 167 |     post_elements = dom.xpath(xpath) 168 |     while True: 169 |         xpath = xpath + "/.." 170 |         new_post_elements = dom.xpath(xpath) 171 |         if new_post_elements is None or len(new_post_elements) < number_of_posts: 172 |             return post_elements 173 |         post_elements = new_post_elements 174 | 175 | 176 | def add_anonymous_user(dom, users, post_xpath, post_user_xpath): 177 |     posts = dom.xpath(post_xpath) 178 |     if len(posts) > len(users): 179 |         user_elements = dom.xpath(post_user_xpath) 180 |         posts = _get_container_elements(dom, post_xpath, len(posts)) 181 |         for index in range(len(posts)): 182 |             contains_user = False 183 |             for tag in posts[index].iterdescendants(): 184 |                 if tag in user_elements: 185 |                     contains_user = True 186 |                     break 187 |             if not contains_user: 188 |                 users.insert(index, "Anonymous") 189 |                 if len(posts) == len(users): 190 |                     break 191 | 192 | 193 | def extract_posts(html_content, url, post_xpath, post_url_xpath, 194 |                   post_date_xpath, post_user_xpath, result_as_datetime=True): 195 |     ''' 196 |     Returns: 197 |         list -- the extracted forum posts and their corresponding metadata (ExtractionResult tuples).
198 | ''' 199 | dom = get_html_dom(html_content) 200 | 201 | forum_posts = remove_boilerplate(get_xpath_tree_text(dom, post_xpath)) 202 | forum_urls = get_forum_url(dom, post_url_xpath) \ 203 | if post_url_xpath else generate_forum_url(url, len(forum_posts)) 204 | forum_dates = get_forum_date(dom, post_date_xpath, result_as_datetime=result_as_datetime) \ 205 | if post_date_xpath else len(forum_posts) * [''] 206 | forum_users = get_forum_user(dom, post_user_xpath) \ 207 | if post_user_xpath else len(forum_posts) * [''] 208 | 209 | add_anonymous_user(dom, forum_users, post_xpath, post_user_xpath) 210 | forum_urls = _get_same_size_as_posts(len(forum_posts), forum_urls) 211 | forum_dates = _get_same_size_as_posts(len(forum_posts), forum_dates) 212 | forum_users = _get_same_size_as_posts(len(forum_posts), forum_users) 213 | 214 | return [ExtractionResult(post, url, date, user) 215 | for post, url, date, user in zip(forum_posts, forum_urls, 216 | forum_dates, forum_users)] 217 | -------------------------------------------------------------------------------- /scripts/test-urls.lst: -------------------------------------------------------------------------------- 1 | #Various random forums 2 | 3 | #https://bbs.archlinux.org/viewtopic.php?id=249553 4 | #https://forum.ubuntuusers.de/topic/ubuntu-lst-18-04-newbie/ 5 | #https://forum.ubuntuusers.de/topic/appimage-programm-in-alle-programme-als-icon-a/ 6 | #https://forum.odroid.com/viewtopic.php?f=139&t=22170 7 | #https://forum.odroid.com/viewtopic.php?f=139&t=19897 8 | 9 | #Medical forums 10 | #http://blog.angelman-asa.org/read.php?2,736 11 | #http://blog.angelman-asa.org/read.php?2,132 12 | #https://community.scope.org.uk/discussion/57774/copd 13 | #https://community.scope.org.uk/discussion/68941/disabled-mum 14 | #https://forums.maladiesraresinfo.org/post11011.html#p11011 15 | #https://forums.maladiesraresinfo.org/credit-immobilier-maladie-rare-t2720.html 16 | #https://healthunlocked.com/parkinsonsmovement/posts/142058845/artane-anyone 17 | #https://healthunlocked.com/parkinsonsmovement/posts/143660160/the-radiograph-shows-calcium-deposits-in-the-joint-capsule.-the-other-radiograph-is-the-podiatrists-own-ankle-for-comparison. 
18 | #https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5231 19 | #https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5256 20 | #https://shift.ms/topic/cbd-oil-11 21 | #https://shift.ms/topic/news-on-myelin-repair 22 | #https://www.amsel.de/multiple-sklerose-forum/?tnr=1&mnr=217239&archiv_flag=2&fv=1 23 | #https://www.amsel.de/multiple-sklerose-forum/?tnr=1&mnr=221323&archiv_flag=2&fv=1 24 | #https://www.medhelp.org/posts/Heart-Disease/Wolfe-Parkinson-White-Syndrome/show/250747 25 | #https://www.medhelp.org/posts/Heart-Rhythm/Tikosyn-load-ablation/show/1640925 26 | #https://www.medschat.com/Discuss/how-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm?srcq=copd 27 | #https://www.medschat.com/Discuss/Nexium-drug-information-159060.htm 28 | #https://www.msconnection.org/Discussions/f33/t77364/tp1/How-long-is-too-long-to-wait-for-an-initial-con 29 | #https://www.msconnection.org/Discussions/f27/t79421/tp1/Does-this-sound-like-MS 30 | #https://www.msworld.org/forum/showthread.php?145403-Sort-of-new-here 31 | #https://www.msworld.org/forum/showthread.php?143493-FDA-Approes-Generic-20mg-AND-40MG 32 | #https://www.mumsnet.com/Talk/pregnancy/3749275-Pregnant-with-a-black-mixed-race-with-black-baby 33 | #https://www.mumsnet.com/Talk/adoptions/3940686-Siblings 34 | #http://www.paradisi.de/Health_und_Ernaehrung/Erkrankungen/Parkinson/Forum/120167.php 35 | #http://www.paradisi.de/Health_und_Ernaehrung/Erkrankungen/Parkinson/Forum/186517.php 36 | 37 | #Random Forums from https://en.wikipedia.org/wiki/List_of_Internet_forums 38 | 39 | #https://www.airliners.net/forum/viewtopic.php?f=3&t=1437935 40 | #https://www.airliners.net/forum/viewtopic.php?f=3&t=1428699 41 | #https://bpdfamily.com/message_board/index.php?topic=343886.0 42 | #https://bpdfamily.com/message_board/index.php?topic=344165.0 43 | #https://talk.collegeconfidential.com/student-here-ask-me-anything/2183693-got-into-nyu-pre-med-intention-ask-me-anything.html 44 | #https://talk.collegeconfidential.com/ivy-league/2184314-application-to-ivy-leagues.html 45 | #http://forum.ebaumsworld.com/viewtopic.php?f=14&t=42095&start=470 46 | #http://forum.ebaumsworld.com/viewtopic.php?f=14&t=78519 47 | #https://www.fanfiction.net/topic/146535/168685925/1/The-OC-Creation-and-Minor-Character-Information-Topic 48 | #https://www.fanfiction.net/topic/146535/108548484/1/The-About-the-World-Topic 49 | #https://www.gtplanet.net/forum/threads/f1-2018-general-discussion.378195/ 50 | #https://www.gtplanet.net/forum/threads/historic-cars-in-f1-2018-feel-slugish.387294/ 51 | #https://kiwifarms.net/threads/deviantart-horrors.2366/ 52 | #https://kiwifarms.net/threads/the-twitter-pedo-hunter-loli-crusader-community.64404/ 53 | #https://forums.macrumors.com/threads/se-or-11.2231616/ 54 | #https://forums.macrumors.com/threads/x-vs-8.2183765/ 55 | #https://forums.moneysavingexpert.com/discussion/5567669/unsure-whether-to-consolidate-please-advise 56 | #https://forums.moneysavingexpert.com/discussion/6100693/how-do-0-credit-card-balances-work-when-you-have-borrowed-twice 57 | #https://www.nairaland.com/5813456/how-snakes-get-into-toilet 58 | #https://www.nairaland.com/5812914/akeredolu-rejects-plot-impeach-deputy 59 | #https://forum.nationstates.net/viewtopic.php?f=4&t=170098 60 | #https://forum.nationstates.net/viewtopic.php?f=12&t=419 61 | #https://www.neowin.net/forum/topic/1393830-hello-everyone/ 62 | #https://www.neowin.net/forum/topic/1391546-hello-im-dion/ 63 | 
#https://www.pistonheads.com/gassing/topic.asp?h=0&f=239&t=1858583 64 | #https://www.pistonheads.com/gassing/topic.asp?h=0&f=156&t=1866139 65 | #http://skyscraperpage.com/forum/showthread.php?t=242327 66 | #http://skyscraperpage.com/forum/showthread.php?t=242165 67 | #https://forums.sherdog.com/threads/all-time-goat-poll.3916359/ 68 | #https://forums.sherdog.com/threads/free-fight-nick-diaz-debut.4102395/ 69 | 70 | 71 | #http://www.beliebte-foren.de/ 72 | 73 | #https://www.computerbase.de/forum/threads/ram-upgrade-auf-32-gb-fuer-3700x.1940201/ 74 | #https://www.computerbase.de/forum/threads/ram-empfehlung-fuer-ryzen.1940441/ 75 | #https://proxer.me/forum/142-anime/386798-kann-keine-anime-mehr-abspielen 76 | #https://proxer.me/forum/213-allgemein/386665-grammatikfehler-auf-der-seite 77 | #http://www.hifi-forum.de/viewthread-84-87.html 78 | #http://www.hifi-forum.de/viewthread-84-29928.html 79 | #https://www.android-hilfe.de/forum/samsung-allgemein.423/faq-diskussion-zum-kauf-samsung-galaxy-s10-s10e-s10-snapdragon-variante.904645.html 80 | #https://www.android-hilfe.de/forum/samsung-galaxy-s10-s10-s10e-s10-5g.3478/samsung-galaxy-s10e-s10-s10-zeigt-her-eure-homescreens.905512.html 81 | #https://www.drwindows.de/windows-7-allgemein/16340-zufall-entdeckte-problemlsungen.html 82 | #https://www.drwindows.de/windows-7-allgemein/167371-windows-7-dvd-iso-datei-umwandel.html 83 | #https://www.dslr-forum.de/showthread.php?t=1847412 84 | #https://www.dslr-forum.de/showthread.php?t=2016951 85 | #https://forum.mein-schoener-garten.de/viewtopic.php?f=1&t=4825193&sid=0e26c5b6c7cd9b067a6a5dc32896eebb 86 | #https://forum.mein-schoener-garten.de/viewtopic.php?f=1&t=4829305&sid=0e26c5b6c7cd9b067a6a5dc32896eebb 87 | #https://www.med1.de/forum/beruf-alltag-und-umwelt/corona-eine-gehypde-apokalypse-972190/ 88 | #https://www.med1.de/forum/blut-gefaesse-herz-lunge/sauerstoffsaettigung-nachts-969551/ 89 | #https://forum.digitalfernsehen.de/threads/df-hilferuf.416785/ 90 | #https://forum.digitalfernsehen.de/threads/erneuerbare-energie.413489/ 91 | #https://www.juraforum.de/forum/t/bettlaegerige-person-ohne-pflege-aus-krankenhaus-entlassen.678903/ 92 | #https://www.juraforum.de/forum/t/fahrtkostenerstattung-bei-falschen-rezepten.675629/ 93 | #https://www.musiker-board.de/threads/baubericht-0-14-ital-fichte-palisander.689167/ 94 | #https://www.musiker-board.de/threads/kopfplattenbruch-reparatur-lakewood-m48-custom.706841/ 95 | #https://forum.worldofplayers.de/forum/threads/1553036-Wie-aufwendig-ist-die-Arbeit-mit-vBulletin 96 | #https://forum.worldofplayers.de/forum/threads/1548322-Welchen-Blog-benutzt-man-in-2020 97 | #https://www.klamm.de/forum/f42/klamm-treff-jeder-lernt-die-stadt-von-jedem-kennen-327507.html 98 | #https://www.klamm.de/forum/f42/conventioncamp-in-hannover-341612.html 99 | #https://uhrforum.de/threads/der-yema-fotothread-und-nicht-nur-das.414009/ 100 | #https://uhrforum.de/threads/schachtel-fuer-mauthe-nr-50-322.432114/ 101 | #https://www.wohnmobilforum.de/w-t141583.html 102 | #https://www.wohnmobilforum.de/w-t141863.html 103 | #https://forum.glamour.de/t/nebenwirkungen-aknenormin/345148/2 104 | #https://forum.glamour.de/t/designertaschen-laber-laber/18136 105 | 106 | 107 | #Top forum examples from https://www.wpressblog.com/free-forum-posting-sites-list/ 108 | 109 | #https://www.cnet.com/forums/discussions/welcome-to-the-digital-camera-forum-315232/ 110 | #https://www.cnet.com/forums/discussions/select-camera/ 111 | #https://forum.wordreference.com/threads/attuned-to-the-reiki-symbols.3691417/ 112 | 
#https://forum.wordreference.com/threads/adding-accent-marks-accent-marks-are-mandatory-in-french.557434/ 113 | #https://forum.utorrent.com/topic/86747-help-us-build-the-next-great-bittorrent-product/ 114 | #https://forum.utorrent.com/topic/23012-check-on-startup/ 115 | #https://forum.xda-developers.com/showthread.php?t=2326393 116 | #https://forum.xda-developers.com/android/software/tool-tool-one-driversunlocktwrpfactory-t3358711 117 | #https://us.forums.blizzard.com/en/wow/t/layers-and-character-creation-adjustments-on-select-realms/499760 118 | #https://us.forums.blizzard.com/en/wow/t/can-i-transfer-back-to-locked-server-if-i-have-existing-character/505388 119 | #https://forum.videolan.org/viewtopic.php?f=14&t=92075 120 | #https://forum.videolan.org/viewtopic.php?f=14&t=145604 121 | #https://community.kaspersky.com/kaspersky-security-cloud-11/rootkit-scan-not-executed-6849 122 | #https://community.kaspersky.com/kaspersky-security-cloud-11/portuguese-in-free-version-8313 123 | #https://forum.statcounter.com/threads/custom-tags-examples.44340/ 124 | #https://forum.statcounter.com/threads/best-android-apps-in-uk-2019.79812/ 125 | #https://forums.futura-sciences.com/annonces-officielles/78761-moderateurs.html 126 | #https://forums.futura-sciences.com/annonces-officielles/12735-latex-debarque-fsg-explications-mode-demploi.html 127 | #https://forum.openoffice.org/en/forum/viewtopic.php?f=5&t=63160 128 | #https://forum.openoffice.org/en/forum/viewtopic.php?f=5&t=82202 129 | #https://community.bitdefender.com/en/discussion/82059/i-noticed-that-the-bitdefender-process-can-be-easily-killed 130 | #https://community.bitdefender.com/en/discussion/81455/how-to-disable-notification 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/harvest/posts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Forum Extraction AI and heuristic 4 | # --------------------------------- 5 | # (C)opyrights 2020 Albert Weichselbraun 6 | 7 | # simplifications: 8 | # ================ 9 | # - only consider tags with a class attribute 10 | # - vsm based on the hashing trick 11 | 12 | # algorithm 13 | # ========= 14 | # - match text to xpath nodes 15 | # - extract the text based on the xpath nodes and determine the best match 16 | # based on the node + its children 17 | # - from the best match that yields multiple results (i.e. forum posts) 18 | # select node parent elements as long as we still get the same number of 19 | # results. 20 | # - constraints 21 | # - blocked tags are not allowed to appear down- or upstream of the selected 22 | # path (e.g. it is not possible that a forum post contains a 'form' or 23 | # 'input' element :) 24 | # - there are forums that are contained in a form tag .... 25 | 26 | # cleanup posts 27 | # ------------- 28 | # * remove repeated elements 29 | # * appear at the beginning or end of a post 30 | # * may contain information on 31 | # - user 32 | # - date (subscription versus post date) => always compare dates within a page for computing the date extraction rule 33 | # - replies, likes, etc. 
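# usage sketch (illustrative only; `html` holds a fetched forum page and the
# URL below is an assumed placeholder):
#
#     >>> from harvest.posts import extract_posts
#     >>> result = extract_posts(html, 'https://example-forum.org/thread/1')
#     >>> result['xpath_pattern']   # xpath that matches one node per post
#     >>> result['forum_posts']     # boilerplate-free post texts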
34 | 35 | import logging 36 | import re 37 | 38 | from lxml import etree 39 | 40 | from harvest.cleanup.forum_post import remove_boilerplate 41 | from harvest.metadata.date import get_date 42 | from harvest.metadata.link import get_link 43 | from harvest.metadata.username import get_user 44 | from harvest.metadata.usertext import get_text_xpath_pattern 45 | from harvest.post_text import get_cleaned_text 46 | from harvest.similarity_calculator import assess_node 47 | from harvest.utils import (get_xpath_expression, get_html_dom, get_xpath_combinations_for_classes, 48 | get_xpath_tree_text, get_grandparent, elements_have_no_overlap) 49 | 50 | CORPUS = "./data/forum/" 51 | 52 | # number of characters required for a match 53 | MATCH_PREFIX_SIZE = 30 54 | 55 | BLACKLIST_POST_TEXT_TAG = ('h1', 'h2', 'h3', 'h4', 'h5', 'a') 56 | 57 | # minimum number of posts we suspect on the page 58 | MIN_POST_COUNT = 3 59 | 60 | 61 | def _get_matching_element(comment, dom): 62 | """ 63 | returns 64 | ------- 65 | the element that matches the given comment 66 | """ 67 | if not comment.strip(): 68 | return None 69 | 70 | for e in dom.iter(): 71 | text = (e.text or "").strip() 72 | min_length_of_text = len(comment[:MATCH_PREFIX_SIZE]) 73 | if text and comment.startswith(text[:MATCH_PREFIX_SIZE]) and len(text) >= min_length_of_text and \ 74 | e.tag is not etree.Comment: 75 | return e 76 | 77 | return None 78 | 79 | 80 | def _get_xpath_tree(comment, dom, tree): 81 | element = _get_matching_element(comment, dom) 82 | return (None, None) if element is None else (element, tree.getpath(element)) 83 | 84 | 85 | def _remove_trailing_p_element(xpath_score, xpath_element_count, xpath, reference_text, dom): 86 | """ 87 | The p elements at the end can be removed. Some posts have several p elements and some have none at all. 88 | Those without p element can then not be detected. As Example, leading post can not be detected: 89 | https://us.forums.blizzard.com/en/wow/t/layers-and-character-creation-adjustments-on-select-realms/499760 90 | 91 | Args: 92 | xpath: the xpath to remove the p element from 93 | 94 | Returns: 95 | 96 | """ 97 | cleaned_xpath = re.sub(r'(? 1: 119 | candidate_xpaths.append((xpath_score, xpath_element_count, xpath_pattern)) 120 | 121 | return candidate_xpaths 122 | 123 | 124 | def _get_post_frame(xpath_pattern, xpath_score, reference_text, dom): 125 | while True: 126 | new_xpath_pattern = xpath_pattern + "/.." 127 | new_xpath_score, new_xpath_element_count = assess_node(reference_content=reference_text, dom=dom, 128 | xpath=new_xpath_pattern) 129 | if new_xpath_element_count < MIN_POST_COUNT: 130 | return xpath_pattern, xpath_score 131 | 132 | xpath_pattern = new_xpath_pattern 133 | xpath_score = new_xpath_score 134 | 135 | 136 | def _get_combination_of_posts(xpath_pattern, xpath_score, xpath_element_count, reference_text, dom): 137 | """ 138 | Check if combinations of classes result in detecting leading post 139 | Args: 140 | xpath_pattern: 141 | xpath_score: 142 | xpath_element_count: 143 | reference_text: 144 | dom: 145 | 146 | Returns: 147 | Combination of classes if they resulting in a better score. Otherwise the parameters xpath_patter, xpath_score and 148 | xpath_element_count are returned. 
149 |     """ 150 |     candidate_xpaths = [] 151 |     for final_xpath in get_xpath_combinations_for_classes(xpath_pattern): 152 |         new_xpath_score, new_xpath_element_count = assess_node(reference_content=reference_text, dom=dom, 153 |                                                                xpath=final_xpath) 154 |         if (xpath_element_count < new_xpath_element_count <= xpath_element_count + 2 or 155 |                 xpath_element_count * 2 - new_xpath_element_count in range(-1, 2)) and new_xpath_score > xpath_score: 156 |             if elements_have_no_overlap(dom.xpath(final_xpath)): 157 |                 candidate_xpaths.append((new_xpath_score, new_xpath_element_count, final_xpath)) 158 | 159 |     if candidate_xpaths: 160 |         candidate_xpaths.sort() 161 |         return candidate_xpaths.pop() 162 |     return xpath_score, xpath_element_count, xpath_pattern 163 | 164 | 165 | def extract_posts(html, url): 166 |     dom = get_html_dom(html) 167 |     tree = etree.ElementTree(dom) 168 |     result = {'url': url, 'dragnet': None, 'url_xpath_pattern': None, 'xpath_pattern': None, 169 |               'xpath_score': None, 'forum_posts': None, 'date_xpath_pattern': None, 'user_xpath_pattern': None, 170 |               'text_xpath_pattern': None} 171 | 172 |     text_sections = get_cleaned_text(html) 173 |     logging.debug(f"Extracted {len(text_sections)} lines of comments.") 174 |     reference_text = " ".join(text_sections) 175 | 176 |     candidate_xpaths = _get_xpaths_candidates(text_sections, dom, tree, reference_text) 177 | 178 |     if not candidate_xpaths: 179 |         logging.warning("Couldn't identify any candidate posts for forum %s", url) 180 |         return result 181 | 182 |     # obtain anchor node 183 |     candidate_xpaths.sort() 184 |     xpath_score, xpath_element_count, xpath_pattern = candidate_xpaths.pop() 185 |     xpath_score, xpath_element_count, xpath_pattern = _remove_trailing_p_element(xpath_score, xpath_element_count, 186 |                                                                                  xpath_pattern, reference_text, dom) 187 | 188 |     xpath_pattern, xpath_score = _get_post_frame(xpath_pattern, xpath_score, reference_text, dom) 189 | 190 |     xpath_score, xpath_element_count, xpath_pattern = _get_combination_of_posts(xpath_pattern, xpath_score, 191 |                                                                                 xpath_element_count, reference_text, 192 |                                                                                 dom) 193 | 194 |     logging.info( 195 |         f"Obtained most likely forum xpath for forum {url}: {xpath_pattern} with a score of {xpath_score}.") 196 |     if xpath_pattern: 197 |         forum_posts = get_xpath_tree_text(dom, xpath_pattern) 198 |         forum_posts = remove_boilerplate(forum_posts) 199 | 200 |         result['xpath_pattern'] = xpath_pattern 201 |         result['xpath_score'] = xpath_score 202 |         result['forum_posts'] = forum_posts 203 | 204 |     if xpath_pattern: 205 |         result['text_xpath_pattern'] = get_text_xpath_pattern(dom, xpath_pattern, forum_posts) 206 | 207 |         # add the post URL 208 |         url_xpath_pattern = get_link(dom, xpath_pattern, url, forum_posts) 209 |         if url_xpath_pattern: 210 |             result['url_xpath_pattern'] = url_xpath_pattern 211 | 212 |         # add the post Date 213 |         date_xpath_pattern = get_date(dom, xpath_pattern, url, forum_posts) 214 |         if date_xpath_pattern: 215 |             result['date_xpath_pattern'] = date_xpath_pattern 216 | 217 |         # add the post user 218 |         user_xpath_pattern = get_user(dom, xpath_pattern, url, forum_posts) 219 |         if user_xpath_pattern: 220 |             result['user_xpath_pattern'] = user_xpath_pattern 221 |     return result 222 | -------------------------------------------------------------------------------- /src/harvest/metadata/username.py: -------------------------------------------------------------------------------- 1 | ''' 2 | username 3 | -------- 4 | 5 | Tries to obtain the name of the post's author 6 | ''' 7 | import logging 8 | import re 9 | import numpy as np 10 | 11 | from
harvest.config import LANGUAGES 12 | from itertools import combinations 13 | from collections import defaultdict 14 | from dateparser.search import search_dates 15 | from urllib.parse import urlparse, urljoin 16 | 17 | from harvest.utils import (get_xpath_expression, get_xpath_expression_child_filter, get_merged_xpath, 18 | get_cleaned_element_text) 19 | 20 | USER_PAGE_HINTS = ('user', 'member', 'person', 'profile') 21 | FORBIDDEN_TERMS = ('terms of use', 'privacy policy', 'add message', 'reply', 'answer', 'share', 'report', 'registered', 22 | 'setting') 23 | 24 | SCORE_INCREMENT = 1 25 | SCORE_TEXT_CHANCE_INCREMENT = 3 26 | 27 | 28 | def _set_user_hint_exits_for_attribute(matches, attribute_value): 29 | for user_hint in USER_PAGE_HINTS: 30 | if re.search(user_hint, attribute_value, re.IGNORECASE): 31 | matches['score'] += SCORE_INCREMENT 32 | return True 33 | 34 | 35 | def _set_user_hint_exits(url_candidates): 36 | for xpath, matches in [x for x in url_candidates.items()]: 37 | _set_user_hint_exits_for_attribute(matches, xpath) 38 | for match in [m.get('href') for m in matches['elements'] if m.get('href')]: 39 | if _set_user_hint_exits_for_attribute(matches, match.lower()): 40 | break 41 | 42 | 43 | def _set_text_changes(url_candidates): 44 | for xpath, matches in list(url_candidates.items()): 45 | if len(np.unique([e.text for e in matches['elements'] if e.text])) > 1: 46 | matches['score'] += SCORE_TEXT_CHANCE_INCREMENT 47 | else: 48 | text_in_sub_elements = [] 49 | for tag in [e for e in matches['elements']]: 50 | for subTag in tag.iterdescendants('span', 'div', 'b', 'strong'): 51 | if subTag.text and subTag.text not in text_in_sub_elements: 52 | text_in_sub_elements.append(subTag.text) 53 | if len(text_in_sub_elements) > 1: 54 | matches['score'] += SCORE_TEXT_CHANCE_INCREMENT 55 | 56 | 57 | def _filter_items_with_forbidden_words(url_candidates): 58 | for xpath, matches in list(url_candidates.items()): 59 | for tag in matches['elements']: 60 | if tag.text and tag.text.strip().lower() in FORBIDDEN_TERMS: 61 | del url_candidates[xpath] 62 | break 63 | 64 | 65 | def _filter_user_name_without_link_includes_date(url_candidates): 66 | for xpath, candidate in [x for x in url_candidates.items() if not x[1]['is_link']]: 67 | for element in candidate['elements']: 68 | text = element.text.strip() 69 | if search_dates(text, languages=LANGUAGES) or text in FORBIDDEN_TERMS: 70 | del url_candidates[xpath] 71 | break 72 | 73 | 74 | def _filter_user_name_without_link_and_no_text_changes(url_candidates): 75 | for xpath, candidate in [x for x in url_candidates.items() 76 | if not x[1]['is_link'] and x[1]['score'] == 0]: 77 | previous_element = None 78 | has_changed = False 79 | for element in candidate['elements']: 80 | text = element.text.strip() 81 | if previous_element is not None and previous_element.text.strip() != text: 82 | has_changed = True 83 | break 84 | previous_element = element 85 | 86 | if not has_changed and url_candidates[xpath]: 87 | del url_candidates[xpath] 88 | 89 | 90 | def _filter_more_than_one_element_per_post(url_candidates, post_elements): 91 | if len(post_elements) > 1: 92 | for xpath, candidate in [x for x in url_candidates.items()]: 93 | for post_element in post_elements: 94 | if len([x for x in post_element.iterdescendants() if x in candidate['elements']]) > 1 and \ 95 | url_candidates[xpath]: 96 | del url_candidates[xpath] 97 | break 98 | 99 | 100 | def _filter_post_links(url_candidates): 101 | for xpath, candidate in list(url_candidates.items()): 102 | sequence = 
re.findall(r"\d+", " ".join(get_cleaned_element_text(x) for x in candidate['elements'])) 103 | sequence = [int(s) for s in sequence] 104 | if len(sequence) > 2 and all(x + 1 == y for x, y in zip(sequence, sequence[1:])): 105 | del url_candidates[xpath] 106 | 107 | 108 | def _filter_post_to_candidate_length(url_candidates, posts): 109 | for xpath, matches in list(url_candidates.items()): 110 | if len(matches['elements']) > len(posts) or len(matches['elements']) < len(posts) - 2: 111 | del url_candidates[xpath] 112 | 113 | 114 | def _filter_url_other_domain(url_candidates, base_url): 115 | forum_url = urlparse(base_url) 116 | for xpath, matches in [x for x in url_candidates.items() if x[1]['is_link']]: 117 | for match in matches['elements']: 118 | parsed_url = urlparse(urljoin(base_url, match.attrib.get('href', ''))) 119 | if parsed_url.netloc and parsed_url.netloc != forum_url.netloc or parsed_url.path == forum_url.path: 120 | del url_candidates[xpath] 121 | break 122 | 123 | 124 | def _is_user_name_pattern(text): 125 | return text and text.strip() and 3 < len(text.strip()) < 100 and len( 126 | text.strip().split(" ")) <= 4 and not re.findall('http[s]?://', text) 127 | 128 | 129 | def _contains_user_name_pattern(tag): 130 | if tag.getchildren(): 131 | return _is_user_name_pattern(" ".join([get_cleaned_element_text(x) for x in tag.iterdescendants()])) 132 | return _is_user_name_pattern(tag.text) 133 | 134 | 135 | def _combine_xpath_candidates(url_candidates, number_of_posts): 136 | candidates_less_then_posts = [x for x in url_candidates.items() if len(x[1]['elements']) < number_of_posts] 137 | if number_of_posts > 1 and len(candidates_less_then_posts) > 1: 138 | valid_combinations = [] 139 | for comb in combinations(candidates_less_then_posts, 2): 140 | if len(comb[0][1]['elements']) + len(comb[1][1]['elements']) == number_of_posts: 141 | valid_combinations.append(comb) 142 | for elements1, elements2 in sorted(valid_combinations, 143 | key=lambda x: (x[0][1]['score'] + x[1][1]['score']), reverse=True): 144 | combined_xpath = elements1[0] + "|" + elements2[0] 145 | url_candidates[combined_xpath]['elements'] = elements1[1]['elements'] + elements2[1]['elements'] 146 | url_candidates[combined_xpath]['is_link'] = elements1[1]['is_link'] or elements2[1]['is_link'] 147 | url_candidates[combined_xpath]['score'] = min(elements1[1]['score'], elements2[1]['score']) 148 | break 149 | 150 | 151 | def _collect_candidates_paths(post_elements): 152 | url_candidates = defaultdict(lambda: {'elements': [], 'is_link': True, 'score': 0}) 153 | for element in post_elements: 154 | for tag in element.iterdescendants(): 155 | if ((tag.tag == 'a' and 'href' in tag.attrib and not [x for x in list(tag) if x.tag == 'time']) or 156 | (tag.tag in ['span', 'strong', 'div', 'b'] and not list(tag))) and \ 157 | _contains_user_name_pattern(tag): 158 | xpath = get_xpath_expression(tag, parent_element=element, single_class_filter=True) 159 | xpath += get_xpath_expression_child_filter(tag) 160 | url_candidates[xpath]['elements'].append(tag) 161 | if tag.tag != 'a': 162 | url_candidates[xpath]['is_link'] = False 163 | 164 | return url_candidates 165 | 166 | 167 | def get_user_name(name, base_url): 168 | ''' 169 | returns 170 | ------- 171 | A standardized representation of the user's URL. 
172 | ''' 173 | return ".".join(name.split()) + '@' + urlparse(base_url).netloc 174 | 175 | 176 | def _get_user(dom, post_elements, base_url, posts): 177 | url_candidates = _collect_candidates_paths(post_elements) 178 | 179 | for merged_xpath in get_merged_xpath(url_candidates.keys()): 180 | merged_elements = dom.xpath(merged_xpath) 181 | if merged_elements: 182 | url_candidates[merged_xpath]['elements'] = merged_elements 183 | 184 | _filter_url_other_domain(url_candidates, base_url) 185 | _filter_items_with_forbidden_words(url_candidates) 186 | _filter_user_name_without_link_includes_date(url_candidates) 187 | _filter_post_links(url_candidates) 188 | _filter_more_than_one_element_per_post(url_candidates, post_elements) 189 | 190 | _set_user_hint_exits(url_candidates) 191 | _set_text_changes(url_candidates) 192 | 193 | _combine_xpath_candidates(url_candidates, len(posts)) 194 | _filter_user_name_without_link_and_no_text_changes(url_candidates) 195 | 196 | _filter_post_to_candidate_length(url_candidates, posts) 197 | 198 | # obtain the most likely user name xpath 199 | 200 | for xpath, _ in sorted(url_candidates.items(), 201 | key=lambda x: (x[1]['is_link'], x[1]['score'], len(x[1]['elements'])), reverse=True): 202 | return xpath 203 | 204 | return None 205 | 206 | 207 | # strategy 208 | # -------- 209 | # * consider descendants as well as elements at the same level 210 | # * the number of user name candidates must be identical to the number of posts 211 | # or may have at most two elements fewer than the posts 212 | # * assign points for URLs that contain 'user', 'member', 'person', 'profile', 213 | # etc. 214 | 215 | 216 | def get_user(dom, post_xpath, base_url, posts): 217 | """ 218 | Obtains the xpath pattern that selects the user names of the given posts. 219 | 220 | Args: 221 | - dom: the forum's DOM object 222 | - post_xpath: the determined post xpath 223 | - base_url: URL of the given forum 224 | - posts: the extracted posts 225 | """ 226 | logging.info('Start finding user name') 227 | post_elements = dom.xpath(post_xpath) 228 | while True: 229 | result = _get_user(dom, post_elements, base_url, posts) 230 | if result or len(post_elements) <= 1: 231 | logging.info(f'User name xpath: {result}') 232 | return result 233 | post_xpath = post_xpath + "/.." 234 | post_elements = dom.xpath(post_xpath) 235 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /corpus/goldDocumentsPre/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "i18885220280728907651545511619171969200", 3 | "url": "https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5256", 4 | "html": "\n\n\n\nParkinsons disease Caregiver Help | Discuss Parkinson's disease: Diagnosis through Advanced Parkinson Care @MyParkinsons.org\n\n\n\n\n\n\n
\n
\n\n
\n
\n
\nFor those who care for someone with Parkinson's disease
\n
\n
\n
\n\n
\n\n\n
\n\n\n
[Home]\n[Forum]\n[Help]\n[Search]\n[Register]\n[Login]\n[Donate]\n
\nYou are not logged in\n

\n\n
\n\n\n
\n\n\n
\nTopic Coronavirus and PD?\n\nGo to previous topic\nGo to next topic\nGo to higher level\n
\n
\n\n
\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 13:17\n\n
\n
\nWe don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy junipersage\nOn 2020.03.12 15:21\n\n
\n
\nWe haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: HREF='https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/'>https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 17:31\n\n
\n
\nBless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!\n
\n\n

\n\n\n\n

\n\n
\n\n\n
© MyParkinsons.org · Published by jAess Media\n · Privacy Policy & Terms of Use
\n Sponsorship Assistance for this website and Forum has been provided by by people\n like you

\n\n
\n
\n\n\n\n\n\n", 5 | "text": " For those who care for someone with Parkinson's disease [Home] [[Forum](forum_show.pl)] [[Help](forum_help.pl)] [[Search](forum_search.pl)] [Register] [[Login](user_login.pl)] [Donate] You are not logged in Topic Coronavirus and PD? [](topic_show.pl?id=5257) [](topic_show.pl?id=5247) [](32677) [ ](#0) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 13:17 We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they? [](32678) [ ](#32677) By [junipersage](user_info.pl?id=27725) On 2020.03.12 15:21 We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/ [](32679) [ ](#32678) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 17:31 Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you! \u00a9 MyParkinsons.org \u00b7 Published by jAess Media \u00b7 Privacy Policy & Terms of Use Sponsorship Assistance for this website and Forum has been provided by by people like you", 6 | "gold_standard_annotation": [ 7 | { 8 | "post_text": { 9 | "surface_form": "We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?" 10 | }, 11 | "datetime": { 12 | "surface_form": "2020.03.12 13:17" 13 | }, 14 | "user": { 15 | "surface_form": "user_info.pl?id=16417" 16 | }, 17 | "post_link": { 18 | "surface_form": "#0" 19 | } 20 | }, 21 | { 22 | "post_text": { 23 | "surface_form": "We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. 
Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/" 24 | }, 25 | "datetime": { 26 | "surface_form": "2020.03.12 15:21" 27 | }, 28 | "user": { 29 | "surface_form": "user_info.pl?id=27725" 30 | }, 31 | "post_link": { 32 | "surface_form": "#32677" 33 | } 34 | }, 35 | { 36 | "post_text": { 37 | "surface_form": "Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!" 38 | }, 39 | "datetime": { 40 | "surface_form": "2020.03.12 17:31" 41 | }, 42 | "user": { 43 | "surface_form": "user_info.pl?id=16417" 44 | }, 45 | "post_link": { 46 | "surface_form": "#32678" 47 | } 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /corpus/goldDocuments/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "i18885220280728907651545511619171969200", 3 | "url": "https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5256", 4 | "html": "\n\n\n\nParkinsons disease Caregiver Help | Discuss Parkinson's disease: Diagnosis through Advanced Parkinson Care @MyParkinsons.org\n\n\n\n\n\n\n
\n
\n\n
\n
\n
\nFor those who care for someone with Parkinson's disease
\n
\n
\n
\n
\n
\n\n\n
\n\n\n
[Home]\n[Forum]\n[Help]\n[Search]\n[Register]\n[Login]\n[Donate]\n
\nYou are not logged in\n

\n\n
\n\n\n
\n\n\n
\nTopic Coronavirus and PD?\n\nGo to previous topic\nGo to next topic\nGo to higher level\n
\n
\n\n
\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 13:17\n\n
\n
\nWe don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy junipersage\nOn 2020.03.12 15:21\n\n
\n
\nWe haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: HREF='https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/'>https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 17:31\n\n
\n
\nBless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!\n
\n\n

\n\n\n\n

\n\n
\n\n\n
© MyParkinsons.org · Published by jAess Media\n · Privacy Policy & Terms of Use
\n Sponsorship Assistance for this website and Forum has been provided by by people\n like you

\n\n
\n
\n\n\n\n\n\n", 5 | "text": " For those who care for someone with Parkinson's disease [Home] [[Forum](forum_show.pl)] [[Help](forum_help.pl)] [[Search](forum_search.pl)] [Register] [[Login](user_login.pl)] [Donate] You are not logged in Topic Coronavirus and PD? [](topic_show.pl?id=5257) [](topic_show.pl?id=5247) [](32677) [ ](#0) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 13:17 We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they? [](32678) [ ](#32677) By [junipersage](user_info.pl?id=27725) On 2020.03.12 15:21 We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/ [](32679) [ ](#32678) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 17:31 Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you! \u00a9 MyParkinsons.org \u00b7 Published by jAess Media \u00b7 Privacy Policy & Terms of Use Sponsorship Assistance for this website and Forum has been provided by by people like you", 6 | "gold_standard_annotation": [ 7 | { 8 | "post_text": { 9 | "surface_form": "We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?", 10 | "start": 362, 11 | "end": 639 12 | }, 13 | "datetime": { 14 | "surface_form": "2020.03.12 13:17", 15 | "start": 345, 16 | "end": 361 17 | }, 18 | "user": { 19 | "surface_form": "user_info.pl?id=16417", 20 | "start": 319, 21 | "end": 340 22 | }, 23 | "post_link": { 24 | "surface_form": "#0", 25 | "start": 301, 26 | "end": 303 27 | } 28 | }, 29 | { 30 | "post_text": { 31 | "surface_form": "We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. 
Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/", 32 | "start": 722, 33 | "end": 1378 34 | }, 35 | "datetime": { 36 | "surface_form": "2020.03.12 15:21", 37 | "start": 705, 38 | "end": 721 39 | }, 40 | "user": { 41 | "surface_form": "user_info.pl?id=27725", 42 | "start": 679, 43 | "end": 700 44 | }, 45 | "post_link": { 46 | "surface_form": "#32677", 47 | "start": 654, 48 | "end": 660 49 | } 50 | }, 51 | { 52 | "post_text": { 53 | "surface_form": "Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!", 54 | "start": 1458, 55 | "end": 1850 56 | }, 57 | "datetime": { 58 | "surface_form": "2020.03.12 17:31", 59 | "start": 1441, 60 | "end": 1457 61 | }, 62 | "user": { 63 | "surface_form": "user_info.pl?id=16417", 64 | "start": 1415, 65 | "end": 1436 66 | }, 67 | "post_link": { 68 | "surface_form": "#32678", 69 | "start": 1393, 70 | "end": 1399 71 | } 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /tests/integration/harvest/test_extract_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from json import load 3 | 4 | import pytest 5 | from fuzzywuzzy import fuzz 6 | 7 | from harvest import extract_data 8 | 9 | 10 | # @Todo lead post not detected-> test_forum_healthunlocked 11 | # @Todo not recognized because of inscriptis -> test_forum_proxer 12 | # @Todo lead post not detected -> test_forum_shift_ms 13 | # @Todo lead post not detected- -> test_forum_medhelp 14 | # @Todo text not recognized -> test_forum_medschat 15 | # @Todo text not recognized because to many other threads recommendations -> test_forum_paradisi 16 | 17 | @pytest.fixture 18 | def compare(): 19 | def _compare(gold_annotations, response, ignored_element=[], ratio=95): 20 | for index, gold_annotation in enumerate(gold_annotations, start=0): 21 | for element in gold_annotation: 22 | if element not in ignored_element: 23 | if element == 'post_text': 24 | assert fuzz.ratio(gold_annotation[element]['surface_form'], 25 | response[index][element]) > ratio 26 | else: 27 | assert gold_annotation[element]['surface_form'] == response[index][element] 28 | 29 | return _compare 30 | 31 | 32 | @pytest.fixture 33 | def remove_index(): 34 | def _remove_index(response, indexes_to_remove): 35 | final_response = [] 36 | for index, response_element in enumerate(response, start=0): 37 | if index not in indexes_to_remove: 38 | final_response.append(response_element) 39 | return final_response 40 | 41 | return _remove_index 42 | 43 | 44 | @pytest.fixture 45 | def load_test_data(): 46 | def _load_test_data(file_name): 47 | file_path = os.path.join(os.path.dirname(__file__), '../../../corpus/goldDocuments', file_name) 48 | with open(file_path) as f: 49 | return load(f) 50 | 51 | return _load_test_data 52 | 53 | 54 | def test_forum_angelman(load_test_data, compare): 55 | forum_test_data = load_test_data("blog.angelman-asa.org.read.php.json") 56 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 57 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 58 | 59 | 60 | def 
test_forum_bpdfamily(load_test_data, compare): 61 | forum_test_data = load_test_data("bpdfamily.com.message_board.index.php.json") 62 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 63 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link']) 64 | 65 | 66 | def test_forum_bitdefender(load_test_data, compare): 67 | forum_test_data = load_test_data( 68 | "community.bitdefender.com.en.discussion.82059.i-noticed-that-the-bitdefender-process-can-be-easily-killed.json") 69 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 70 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 71 | 72 | 73 | def test_forum_kaspersky(load_test_data, compare): 74 | forum_test_data = load_test_data( 75 | "community.kaspersky.com.kaspersky-security-cloud-11.rootkit-scan-not-executed-6849.json") 76 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 77 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 78 | 79 | 80 | def test_forum_community_scope(load_test_data, compare): 81 | forum_test_data = load_test_data("community.scope.org.uk.discussion.68941.disabled-mum.json") 82 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 83 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 84 | 85 | 86 | def test_forum_digitalfernsehen(load_test_data, compare): 87 | forum_test_data = load_test_data("forum.digitalfernsehen.de.threads.df-hilferuf.416785..json") 88 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 89 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 90 | 91 | 92 | def test_forum_ebaumsworld(load_test_data, compare): 93 | forum_test_data = load_test_data("forum.ebaumsworld.com.viewtopic.php.42095.json") 94 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 95 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 96 | 97 | 98 | def test_forum_glamour(load_test_data, compare): 99 | forum_test_data = load_test_data("forum.glamour.de.t.designertaschen-laber-laber.18136.json") 100 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 101 | compare(forum_test_data['gold_standard_annotation'], response, []) 102 | 103 | 104 | def test_forum_mein_schoener_garten(load_test_data, compare): 105 | forum_test_data = load_test_data("forum.mein-schoener-garten.de.viewtopic.php.4825193.json") 106 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 107 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 108 | 109 | 110 | def test_forum_nationstates(load_test_data, compare): 111 | forum_test_data = load_test_data("forum.nationstates.net.viewtopic.php.419.json") 112 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 113 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 114 | 115 | 116 | def test_forum_openoffice(load_test_data, compare): 117 | forum_test_data = load_test_data("forum.openoffice.org.en.forum.viewtopic.php.json") 118 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 119 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 120 | 121 | 122 | def test_forum_statcounter(load_test_data, compare): 123 | forum_test_data = 
load_test_data("forum.statcounter.com.threads.best-android-apps-in-uk-2019.79812..json") 124 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 125 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link']) 126 | 127 | 128 | def test_forum_ubuntuusers(load_test_data, compare): 129 | forum_test_data = load_test_data("forum.ubuntuusers.de.topic.appimage-programm-in-alle-programme-als-icon-a..json") 130 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 131 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link'], ratio=78) 132 | 133 | 134 | def test_forum_utorrent(load_test_data, compare): 135 | forum_test_data = load_test_data("forum.utorrent.com.topic.23012-check-on-startup..json") 136 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 137 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 138 | 139 | 140 | def test_forum_videolan(load_test_data, compare): 141 | forum_test_data = load_test_data("forum.videolan.org.viewtopic.php.json") 142 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 143 | compare(forum_test_data['gold_standard_annotation'], response, []) 144 | 145 | 146 | def test_forum_wordreference(load_test_data, compare): 147 | forum_test_data = load_test_data("forum.wordreference.com.threads.attuned-to-the-reiki-symbols.3691417..json") 148 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 149 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 150 | 151 | 152 | def test_forum_worldofplayers(load_test_data, compare): 153 | forum_test_data = load_test_data( 154 | "forum.worldofplayers.de.forum.threads.1548322-Welchen-Blog-benutzt-man-in-2020.json") 155 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 156 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 157 | 158 | 159 | def test_forum_futura_sciences(load_test_data, compare, remove_index): 160 | forum_test_data = load_test_data("forums.futura-sciences.com.annonces-officielles.78761-moderateurs.html.json") 161 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 162 | # Remove the advertisement slots 163 | response = remove_index(response, [1, 6, 8, 15, 22, 29]) 164 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 165 | 166 | 167 | def test_forum_macrumors(load_test_data, compare): 168 | forum_test_data = load_test_data("forums.macrumors.com.threads.se-or-11.2231616..json") 169 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 170 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 171 | 172 | 173 | def test_forum_maladiesraresinfo(load_test_data, compare): 174 | forum_test_data = load_test_data("forums.maladiesraresinfo.org.post11011.html.json") 175 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 176 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 177 | 178 | 179 | def test_forum_moneysavingexpert(load_test_data, compare): 180 | forum_test_data = load_test_data( 181 | "forums.moneysavingexpert.com.discussion.6100693.how-do-0-credit-card-balances-work-when-you-have-borrowed-twice.json") 182 | response = extract_data(forum_test_data['html'], 
forum_test_data['url'])['posts'] 183 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 184 | 185 | 186 | def test_forum_sherdog(load_test_data, compare): 187 | forum_test_data = load_test_data("forums.sherdog.com.threads.all-time-goat-poll.3916359..json") 188 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 189 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 190 | 191 | 192 | def test_forum_kiwifarms(load_test_data, compare): 193 | forum_test_data = load_test_data( 194 | "kiwifarms.net.threads.the-twitter-pedo-hunter-loli-crusader-community.64404..json") 195 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 196 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 197 | 198 | 199 | def test_forum_myparkinsons(load_test_data, compare, remove_index): 200 | forum_test_data = load_test_data("myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json") 201 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 202 | # Remove header that looks exactly like the posts 203 | response = remove_index(response, [0, 1]) 204 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link'], ratio=90) 205 | 206 | 207 | def test_forum_skyscraperpage(load_test_data, compare): 208 | forum_test_data = load_test_data("skyscraperpage.com.forum.showthread.php.json") 209 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 210 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 211 | 212 | 213 | def test_forum_collegeconfidential(load_test_data, compare): 214 | forum_test_data = load_test_data( 215 | "talk.collegeconfidential.com.student-here-ask-me-anything.2183693-got-into-nyu-pre-med-intention-ask-me-anything.html.json") 216 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 217 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 218 | 219 | 220 | def test_forum_uhrforum(load_test_data, compare): 221 | forum_test_data = load_test_data("uhrforum.de.threads.der-yema-fotothread-und-nicht-nur-das.414009..json") 222 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 223 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 224 | 225 | 226 | def test_forum_blizzard(load_test_data, compare): 227 | forum_test_data = load_test_data( 228 | "us.forums.blizzard.com.en.wow.t.can-i-transfer-back-to-locked-server-if-i-have-existing-character.505388.json") 229 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 230 | compare(forum_test_data['gold_standard_annotation'], response) 231 | 232 | 233 | def test_forum_airliners(load_test_data, compare): 234 | forum_test_data = load_test_data("www.airliners.net.forum.viewtopic.php.1428699.json") 235 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 236 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime'], ratio=85) 237 | 238 | 239 | def test_forum_amsel(load_test_data, compare): 240 | forum_test_data = load_test_data("www.amsel.de.multiple-sklerose-forum..json") 241 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 242 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 243 | 244 | 245 | def 
test_forum_android_hilfe(load_test_data, compare): 246 | forum_test_data = load_test_data( 247 | "www.android-hilfe.de.forum.samsung-allgemein.423.faq-diskussion-zum-kauf-samsung-galaxy-s10-s10e-s10-snapdragon-variante.904645.html.json") 248 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 249 | compare(forum_test_data['gold_standard_annotation'], response, ['user']) 250 | 251 | 252 | def test_forum_computerbase(load_test_data, compare): 253 | forum_test_data = load_test_data( 254 | "www.computerbase.de.forum.threads.ram-empfehlung-fuer-ryzen.1940441..json") 255 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 256 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime'], ratio=85) 257 | 258 | 259 | def test_forum_drwindows(load_test_data, compare): 260 | forum_test_data = load_test_data( 261 | "www.drwindows.de.windows-7-allgemein.16340-zufall-entdeckte-problemlsungen.html.json") 262 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 263 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link'], ratio=89) 264 | 265 | 266 | def test_forum_fanfiction(load_test_data, compare): 267 | forum_test_data = load_test_data( 268 | "www.fanfiction.net.topic.146535.108548484.1.The-About-the-World-Topic.json") 269 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 270 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user'], ratio=75) 271 | 272 | 273 | def test_forum_gtplanet(load_test_data, compare): 274 | forum_test_data = load_test_data("www.gtplanet.net.forum.threads.f1-2018-general-discussion.378195..json") 275 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 276 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 277 | 278 | 279 | def test_forum_hifi(load_test_data, compare): 280 | forum_test_data = load_test_data("www.hifi-forum.de.viewthread-84-87.html.json") 281 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 282 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 283 | 284 | 285 | def test_forum_juraforum(load_test_data, compare): 286 | forum_test_data = load_test_data( 287 | "www.juraforum.de.forum.t.fahrtkostenerstattung-bei-falschen-rezepten.675629..json") 288 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 289 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link']) 290 | 291 | 292 | def test_forum_med1(load_test_data, compare): 293 | forum_test_data = load_test_data( 294 | "www.med1.de.forum.beruf-alltag-und-umwelt.corona-eine-gehypde-apokalypse-972190..json") 295 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 296 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 297 | 298 | 299 | def test_forum_msworld(load_test_data, compare): 300 | forum_test_data = load_test_data("www.msworld.org.forum.showthread.php.json") 301 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 302 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 303 | 304 | 305 | def test_forum_msconnection(load_test_data, compare): 306 | forum_test_data = load_test_data("www.msconnection.org.Discussions.f27.t79421.tp1.Does-this-sound-like-MS.json") 307 | response = 
extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 308 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 309 | 310 | 311 | def test_forum_mumsnet(load_test_data, compare): 312 | forum_test_data = load_test_data( 313 | "www.mumsnet.com.Talk.pregnancy.3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json") 314 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 315 | compare(forum_test_data['gold_standard_annotation'], response) 316 | 317 | 318 | def test_forum_musiker_board(load_test_data, compare): 319 | forum_test_data = load_test_data( 320 | "www.musiker-board.de.threads.baubericht-0-14-ital-fichte-palisander.689167..json") 321 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 322 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime'], ratio=93) 323 | 324 | 325 | def test_forum_nairaland(load_test_data, compare): 326 | forum_test_data = load_test_data("www.nairaland.com.5812914.akeredolu-rejects-plot-impeach-deputy.json") 327 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 328 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 329 | 330 | 331 | def test_forum_neowin(load_test_data, compare): 332 | forum_test_data = load_test_data("www.neowin.net.forum.topic.1391546-hello-im-dion..json") 333 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 334 | compare(forum_test_data['gold_standard_annotation'], response, ['user']) 335 | 336 | 337 | def test_forum_pistonheads(load_test_data, compare): 338 | forum_test_data = load_test_data("www.pistonheads.com.gassing.topic.asp.1858583.json") 339 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 340 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link', 'datetime']) 341 | -------------------------------------------------------------------------------- /corpus/goldDocumentsPre/forum.ubuntuusers.de.topic.appimage-programm-in-alle-programme-als-icon-a..json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "i239851449887439968677754102219662807017", 3 | "url": "https://forum.ubuntuusers.de/topic/appimage-programm-in-alle-programme-als-icon-a/", 4 | "html": "\n\n\n\n\n\n\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n\n Appimage Programm in \"alle Programme\" als Icon anzeigen? \u203a GNOME (Ubuntu ab 17.10) \u203a Grafische Oberfl\u00e4che \u203a Forum \u203a ubuntuusers.de\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \n \n \n \n \n\n \n \n\n \n \n\n \n \n \n \n\n
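Usage sketch (illustrative, not part of the repository): a minimal example of how the extraction API exercised by the integration tests above can be applied to one of the gold documents. It assumes the harvest package from src/ is installed and that the script is run from the repository root.

from json import load

from harvest import extract_data

# Load one of the gold documents shipped with the corpus.
with open('corpus/goldDocuments/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json') as f:
    gold = load(f)

# extract_data() returns the extracted posts under the 'posts' key; each post
# carries the 'post_text', 'datetime', 'user' and 'post_link' fields that the
# integration tests compare against the 'gold_standard_annotation' entries.
posts = extract_data(gold['html'], gold['url'])['posts']
for post in posts:
    print(post['user'], post['datetime'], post['post_text'][:60])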
\n

ubuntuusers.de

\n \n
\n\n \n\n \n \n
\n \n \n\n \n \n \n \n \n \n\n \n\n via DuckDuckGo\n
\n \n\n \n
\n \n
\n\n
\n \n \n
\n\n\n \n\n \n \n \n\n \n\n
\n \n\n \n\n \n
\n

Appimage Programm in "alle Programme" als Icon anzeigen?

\n
\n
\u00ab Vorherige1N\u00e4chste \u00bb\n
\n Status:\n \n Ungel\u00f6st\n \n |\n \n Ubuntu-Version:\n Ubuntu 20.04 (Focal Fossa)\n \n
\n \n \n Antworten |\n \n \n\n
\n
\n\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
\n

\n Liane\n \n

\n

Anmeldungsdatum:
11. April 2009

\n

Beitr\u00e4ge: 457

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 10:23\n \n (zuletzt bearbeitet: 14. Juni 2020 10:41)\n \n
\n \n
\n

Hallo zusammen,

ich habe ein Programm, dass als Appimage zu starten ist.

Ich w\u00fcrde gerne dieses Programm wie die anderen Anwendungen auch auf dem \nDashboard (hoffe ich habe das Men\u00fc richtig beschrieben) mittels "alle Programme"\nsehen k\u00f6nnen.

Der Umweg \u00fcber den Ordner suchen und das Programm darin zu starten ist umst\u00e4ndlich.

Danke!

bye\nLiane

Moderiert von Taomon:

Dieses Thema ist verschoben worden. Bitte beachte die als wichtig markierten Themen (\u201eWelche Themen geh\u00f6ren hier her und welche nicht?\u201c)!

\n

\n
\n
\n

\n fleet_street\n \n

\n

Anmeldungsdatum:
30. August 2016

\n

Beitr\u00e4ge: 1046

\n

Wohnort: Hunsr\u00fcck

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 10:42\n \n
\n \n
\n

Was du ben\u00f6tigst, ist ein Starter \u2192 .desktop-Dateien

\n
\n
\n

\n Liane\n \n

\n
(Themenstarter)
\n

Anmeldungsdatum:
11. April 2009

\n

Beitr\u00e4ge: 457

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 17:56\n \n
\n \n
\n

danke f\u00fcr die Antwort. Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann?

DANKE

\n
\n
\n

\n Tut-tut\n \n

\n \n \"Avatar\n\n

Anmeldungsdatum:
24. August 2012

\n

Beitr\u00e4ge: 990

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 20:24\n \n
\n \n
\n

Da brauchst du bestimmt keine Software!

Mein Appimage ist Moneyplex:\nBei mir habe ich die Startdatei im Ordner mit der rechten Maustaste angeklickt und diese "ausf\u00fchrbar" in den Eigenschaften gemacht. Danach war das Problem wie du es beschreibst: L\u00f6sung siehe https://forum.ubuntuusers.de/topic/starter-auf-dem-desktop-zu-favoriten-hinzufueg/

\n
\n
\n

\n fleet_street\n \n

\n

Anmeldungsdatum:
30. August 2016

\n

Beitr\u00e4ge: 1046

\n

Wohnort: Hunsr\u00fcck

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 15. Juni 2020 21:57\n \n
\n \n
\n

Liane schrieb:

\u2026 Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann?

\nDa erinnerst du dich vielleicht an dein Linux Mint Cinnamon.

\n
\n
\n

\n Bleys\n \n

\n \n \"Avatar\n\n

Anmeldungsdatum:
13. August 2006

\n

Beitr\u00e4ge: 5175

\n

Wohnort: Essen, NRW

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 16. Juni 2020 01:23\n \n
\n \n
\n
1
sudo apt install menulibre\n
\n

In der \u00dcbersicht hei\u00dft die Anwendung "Men\u00fcbearbeitung". Starten, die passende Kategorie ausw\u00e4hlen, oben links auf das Plus klicken.

\n
\n
\n\n
\n
\n
\u00ab Vorherige1N\u00e4chste \u00bb\n
\n \n \n Antworten |\n \n \n\n
\n
\n \n\n \n\n \n\n
\n
    \n
  • \n Powered by Inyoka\n \n
    \n \n Inyoka v0.22.1\n \n \n
  • \n
  • \n \ud83c\udd2f 2004 \u2013 2020 ubuntuusers.de \u2022 Einige Rechte vorbehalten
    \n Lizenz \u2022\n Kontakt \u2022\n Datenschutz \u2022\n Impressum \u2022\n Serverstatus\n
  • \n
  • \n Serverhousing gespendet von
    \n \"noris\n \"anexia\"\n
  • \n
\n
\n\n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n", 5 | "text": "[Zum Hauptinhalt springen](#main) [Zur Seitenleiste springen](#sidebar) * Bitte aktiviere JavaScript! Anmelden Registrieren ubuntuusers.de PortalForumWikiIkhayaPlanetMitmachen via DuckDuckGo * Filter + Neue Beitr\u00e4ge nur hier + Unbeantwortete Themen nur hier + Ungel\u00f6ste Themen nur hier + 24 Stunden nur hier + 12 Stunden nur hier + 6 Stunden nur hier 1. Forum 2. Grafische Oberfl\u00e4che 3. GNOME (Ubuntu ab 17.10) 4. Appimage Programm in \"alle Programme\" als Icon anzeigen? Appimage Programm in \"alle Programme\" als Icon anzeigen? \u00ab Vorherige 1 N\u00e4chste \u00bb Status: Ungel\u00f6st | Ubuntu-Version: Ubuntu 20.04 (Focal Fossa) Antworten | Zitieren [Liane](https://ubuntuusers.de/user/Liane/) [ ](https://forum.ubuntuusers.de/post/9165689/) 14. Juni 2020 10:23 (zuletzt bearbeitet: 14. Juni 2020 10:41) Hallo zusammen, ich habe ein Programm, dass als Appimage zu starten ist. Ich w\u00fcrde gerne dieses Programm wie die anderen Anwendungen auch auf dem Dashboard (hoffe ich habe das Men\u00fc richtig beschrieben) mittels \"alle Programme\" sehen k\u00f6nnen. Der Umweg \u00fcber den Ordner suchen und das Programm darin zu starten ist umst\u00e4ndlich. Danke! bye Liane Moderiert von Taomon: Dieses Thema ist verschoben worden. Bitte beachte die als wichtig markierten Themen (\u201eWelche Themen geh\u00f6ren hier her und welche nicht?\u201c)! Zitieren [fleet_street](https://ubuntuusers.de/user/fleet_street/) [ ](https://forum.ubuntuusers.de/post/9165696/) 14. Juni 2020 10:42 Was du ben\u00f6tigst, ist ein Starter \u2192 .desktop-Dateien Anmeldungsdatum: 30. August 2016 Beitr\u00e4ge: 1046 Wohnort: Hunsr\u00fcck Zitieren [Liane](https://ubuntuusers.de/user/Liane/) [ ](https://forum.ubuntuusers.de/post/9165847/) 14. Juni 2020 17:56 (Themenstarter) danke f\u00fcr die Antwort. Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? DANKE Beitr\u00e4ge: 457 Zitieren [Tut-tut](https://ubuntuusers.de/user/Tut-tut/) [ ](https://forum.ubuntuusers.de/post/9165916/) 14. Juni 2020 20:24 Da brauchst du bestimmt keine Software! Mein Appimage ist Moneyplex: Bei mir habe ich die Startdatei im Ordner mit der rechten Maustaste angeklickt und diese \"ausf\u00fchrbar\" in den Eigenschaften gemacht. Danach war das Problem wie du es beschreibst: L\u00f6sung siehe https://forum.ubuntuusers.de/topic/starter-auf-dem-desktop-zu-favoriten-hinzufueg/ Beitr\u00e4ge: 990 Zitieren [fleet_street](https://ubuntuusers.de/user/fleet_street/) [ ](https://forum.ubuntuusers.de/post/9166263/) 15. Juni 2020 21:57 Liane schrieb: \u2026 Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? Da erinnerst du dich vielleicht an dein Linux Mint Cinnamon. Wohnort: Hunsr\u00fcck Zitieren [Bleys](https://ubuntuusers.de/user/Bleys/) [ ](https://forum.ubuntuusers.de/post/9166282/) 16. Juni 2020 01:23 1 sudo apt install menulibre In der \u00dcbersicht hei\u00dft die Anwendung \"Men\u00fcbearbeitung\". Starten, die passende Kategorie ausw\u00e4hlen, oben links auf das Plus klicken. Beitr\u00e4ge: 5175 Wohnort: Essen, NRW \u00ab Vorherige 1 N\u00e4chste \u00bb Antworten | \u00ab Vorheriges Thema N\u00e4chstes Thema \u00bb 1. Forum 2. Grafische Oberfl\u00e4che 3. GNOME (Ubuntu ab 17.10) 4. Appimage Programm in \"alle Programme\" als Icon anzeigen? 
* Powered by Inyoka Inyoka v0.22.1 * \ud83c\udd2f 2004 \u2013 2020 ubuntuusers.de \u2022 Einige Rechte vorbehalten Lizenz \u2022 Kontakt \u2022 Datenschutz \u2022 Impressum \u2022 Serverstatus * Serverhousing gespendet von ", 6 | "gold_standard_annotation": [ 7 | { 8 | "post_text": { 9 | "surface_form": "Hallo zusammen, ich habe ein Programm, dass als Appimage zu starten ist. Ich w\u00fcrde gerne dieses Programm wie die anderen Anwendungen auch auf dem Dashboard (hoffe ich habe das Men\u00fc richtig beschrieben) mittels \"alle Programme\" sehen k\u00f6nnen. Der Umweg \u00fcber den Ordner suchen und das Programm darin zu starten ist umst\u00e4ndlich. Danke!" 10 | }, 11 | "datetime": { 12 | "surface_form": "14. Juni 2020 10:23" 13 | }, 14 | "user": { 15 | "surface_form": "https://ubuntuusers.de/user/Liane/" 16 | }, 17 | "post_link": { 18 | "surface_form": "https://forum.ubuntuusers.de/post/9165689/" 19 | } 20 | }, 21 | { 22 | "post_text": { 23 | "surface_form": "Was du ben\u00f6tigst, ist ein Starter \u2192 .desktop-Dateien" 24 | }, 25 | "datetime": { 26 | "surface_form": "14. Juni 2020 10:42" 27 | }, 28 | "user": { 29 | "surface_form": "https://ubuntuusers.de/user/fleet_street/" 30 | }, 31 | "post_link": { 32 | "surface_form": "https://forum.ubuntuusers.de/post/9165696/" 33 | } 34 | }, 35 | { 36 | "post_text": { 37 | "surface_form": "danke f\u00fcr die Antwort. Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? DANKE" 38 | }, 39 | "datetime": { 40 | "surface_form": "14. Juni 2020 17:56" 41 | }, 42 | "user": { 43 | "surface_form": "https://ubuntuusers.de/user/Liane/" 44 | }, 45 | "post_link": { 46 | "surface_form": "https://forum.ubuntuusers.de/post/9165847/" 47 | } 48 | }, 49 | { 50 | "post_text": { 51 | "surface_form": "Da brauchst du bestimmt keine Software! Mein Appimage ist Moneyplex: Bei mir habe ich die Startdatei im Ordner mit der rechten Maustaste angeklickt und diese \"ausf\u00fchrbar\" in den Eigenschaften gemacht. Danach war das Problem wie du es beschreibst: L\u00f6sung siehe https://forum.ubuntuusers.de/topic/starter-auf-dem-desktop-zu-favoriten-hinzufueg/" 52 | }, 53 | "datetime": { 54 | "surface_form": "14. Juni 2020 20:24" 55 | }, 56 | "user": { 57 | "surface_form": "https://ubuntuusers.de/user/Tut-tut/" 58 | }, 59 | "post_link": { 60 | "surface_form": "https://forum.ubuntuusers.de/post/9165916/" 61 | } 62 | }, 63 | { 64 | "post_text": { 65 | "surface_form": "Liane schrieb: \u2026 Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? Da erinnerst du dich vielleicht an dein Linux Mint Cinnamon." 66 | }, 67 | "datetime": { 68 | "surface_form": "15. Juni 2020 21:57" 69 | }, 70 | "user": { 71 | "surface_form": "https://ubuntuusers.de/user/fleet_street/" 72 | }, 73 | "post_link": { 74 | "surface_form": "https://forum.ubuntuusers.de/post/9166263/" 75 | } 76 | }, 77 | { 78 | "post_text": { 79 | "surface_form": "1 sudo apt install menulibre In der \u00dcbersicht hei\u00dft die Anwendung \"Men\u00fcbearbeitung\". Starten, die passende Kategorie ausw\u00e4hlen, oben links auf das Plus klicken." 80 | }, 81 | "datetime": { 82 | "surface_form": "16. Juni 2020 01:23" 83 | }, 84 | "user": { 85 | "surface_form": "https://ubuntuusers.de/user/Bleys/" 86 | }, 87 | "post_link": { 88 | "surface_form": "https://forum.ubuntuusers.de/post/9166282/" 89 | } 90 | } 91 | ] 92 | } --------------------------------------------------------------------------------