├── corpus
│   ├── __init__.py
│   ├── createGoldDocuments
│   │   ├── __init__.py
│   │   ├── script
│   │   │   ├── __init__.py
│   │   │   ├── generate-single_post.py
│   │   │   ├── final_processing.py
│   │   │   ├── remove_link.py
│   │   │   └── pre_processing.py
│   │   ├── file.py
│   │   ├── README.md
│   │   └── calculate_position.py
│   ├── README.md
│   ├── goldDocumentsPre
│   │   ├── myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json
│   │   └── forum.ubuntuusers.de.topic.appimage-programm-in-alle-programme-als-icon-a..json
│   └── goldDocuments
│       └── myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json
├── tests
│   ├── __init__.py
│   ├── unit
│   │   └── harvest
│   │       ├── metadata
│   │       │   └── test_username.py
│   │       ├── test_date_search.py
│   │       ├── test_utils.py
│   │       └── cleanup
│   │           └── test_forum_post.py
│   ├── test_webservice.py
│   └── integration
│       └── harvest
│           ├── test_posts_xpath.py
│           └── test_extract_data.py
├── src
│   ├── harvest
│   │   ├── cleanup
│   │   │   ├── __init__.py
│   │   │   └── forum_post.py
│   │   ├── metadata
│   │   │   ├── __init__.py
│   │   │   ├── .cache
│   │   │   │   └── v
│   │   │   │       └── cache
│   │   │   │           └── lastfailed
│   │   │   ├── usertext.py
│   │   │   ├── link.py
│   │   │   ├── date.py
│   │   │   └── username.py
│   │   ├── config.py
│   │   ├── date_search.py
│   │   ├── post_text.py
│   │   ├── __init__.py
│   │   ├── similarity_calculator.py
│   │   ├── utils.py
│   │   ├── extract.py
│   │   └── posts.py
│   └── test_dummy.py
├── requirements.txt
├── data
│   └── forum
│       ├── https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz
│       ├── http%3A%2F%2Fblog.angelman-asa.org%2Fread.php%3F2%2C736.json.gz
│       ├── https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz
│       ├── https%3A%2F%2Fforums.maladiesraresinfo.org%2Fpost11011.html%23p11011.json.gz
│       ├── https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz
│       ├── https%3A%2F%2Fmyparkinsons.org%2Fcgi-bin%2Fforum%2Ftopic_show.pl%3Fid%3D5231.json.gz
│       ├── https%3A%2F%2Fforum.statcounter.com%2Fthreads%2Fcustom-tags-examples.44340%2F.json.gz
│       ├── https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-Sort-of-new-here.json.gz
│       ├── https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz
│       ├── https%3A%2F%2Fhealthunlocked.com%2Fparkinsonsmovement%2Fposts%2F142058845%2Fartane-anyone.json.gz
│       ├── https%3A%2F%2Fwww.medhelp.org%2Fposts%2FMultiple-Sclerosis%2FPositive-ANA-Test%2Fshow%2F1123552.json.gz
│       ├── https%3A%2F%2Fwww.medhelp.org%2Fposts%2FInfectious-Diseases%2FNoro-or-other-virus%2Fshow%2F1881254.json.gz
│       ├── https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-White-Syndrome%2Fshow%2F250747.json.gz
│       ├── https%3A%2F%2Fwww.amsel.de%2Fmultiple-sklerose-forum%2F%3Ftnr%3D1%26mnr%3D217239%26archiv_flag%3D2%26fv%3D1.json.gz
│       ├── https%3A%2F%2Fwww.mumsnet.com%2FTalk%2Fpregnancy%2F3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json.gz
│       ├── https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-is-too-long-to-wait-for-an-initial-con.json.gz
│       ├── https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm%3Fsrcq%3Dcopd.json.gz
│       └── test-urls.lst
├── .gitignore
├── publish.sh
├── scripts
│   ├── serialize_test_data.py
│   ├── webservice.py
│   ├── extract_to_csv.py
│   └── test-urls.lst
├── .github
│   └── workflows
│       └── main.yml
├── setup.py
├── README.md
└── LICENSE

/corpus/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | 
-------------------------------------------------------------------------------- /src/harvest/cleanup/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/harvest/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/script/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/harvest/metadata/.cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /src/harvest/config.py: -------------------------------------------------------------------------------- 1 | LANGUAGES = ('en', 'de', 'es') -------------------------------------------------------------------------------- /corpus/README.md: -------------------------------------------------------------------------------- 1 | # Todo description of corpus 2 | ``` 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | requests 3 | dateparser 4 | numpy 5 | inscriptis 6 | flask 7 | fuzzywuzzy 8 | pytest -------------------------------------------------------------------------------- /src/test_dummy.py: -------------------------------------------------------------------------------- 1 | """ 2 | This dummy test is needed for pytest to detect the src directory 3 | """ 4 | 5 | 6 | def test_dummy(): 7 | pass 8 | -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | .*.swp 4 | __pycache__/ 5 | .cache/ 6 | debug/ 7 | venv 8 | dist 9 | build 10 | results.json 11 | *.egg-info 12 | .coverage 13 | coverage.xml 14 | -------------------------------------------------------------------------------- /data/forum/http%3A%2F%2Fblog.angelman-asa.org%2Fread.php%3F2%2C736.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/http%3A%2F%2Fblog.angelman-asa.org%2Fread.php%3F2%2C736.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz 
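The serialized snapshots above are gzip-compressed JSON files that store the crawled page under the keys `url`, `crawled` and `html` (see `scripts/serialize_test_data.py` below). A minimal sketch for loading one of them and running harvest over it; the file name is just one example from the list above:

```python
import gzip
from json import load

from harvest import extract_data

# each snapshot is a gzipped JSON document with 'url', 'crawled' and 'html' keys
with gzip.open('data/forum/https%3A%2F%2Fcommunity.scope.org.uk%2Fdiscussion%2F57774%2Fcopd.json.gz') as f:
    forum = load(f)

# extract posts and metadata from the serialized HTML
result = extract_data(forum['html'], forum['url'])
# result['posts'] is a list of dictionaries; 'post_text' is always present
print(result['posts'][0]['post_text'])
```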
-------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fforums.maladiesraresinfo.org%2Fpost11011.html%23p11011.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fforums.maladiesraresinfo.org%2Fpost11011.html%23p11011.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fmyparkinsons.org%2Fcgi-bin%2Fforum%2Ftopic_show.pl%3Fid%3D5231.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fmyparkinsons.org%2Fcgi-bin%2Fforum%2Ftopic_show.pl%3Fid%3D5231.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fforum.statcounter.com%2Fthreads%2Fcustom-tags-examples.44340%2F.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fforum.statcounter.com%2Fthreads%2Fcustom-tags-examples.44340%2F.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-Sort-of-new-here.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-Sort-of-new-here.json.gz -------------------------------------------------------------------------------- /tests/unit/harvest/metadata/test_username.py: -------------------------------------------------------------------------------- 1 | from harvest.metadata.username import get_user_name 2 | 3 | 4 | def test_get_user_name(): 5 | assert get_user_name('Therese Kurz', 'http://www.heise.de/security') == 'Therese.Kurz@www.heise.de' 6 | -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fhealthunlocked.com%2Fparkinsonsmovement%2Fposts%2F142058845%2Fartane-anyone.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fhealthunlocked.com%2Fparkinsonsmovement%2Fposts%2F142058845%2Fartane-anyone.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FMultiple-Sclerosis%2FPositive-ANA-Test%2Fshow%2F1123552.json.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FMultiple-Sclerosis%2FPositive-ANA-Test%2Fshow%2F1123552.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FInfectious-Diseases%2FNoro-or-other-virus%2Fshow%2F1881254.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FInfectious-Diseases%2FNoro-or-other-virus%2Fshow%2F1881254.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-White-Syndrome%2Fshow%2F250747.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-White-Syndrome%2Fshow%2F250747.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.amsel.de%2Fmultiple-sklerose-forum%2F%3Ftnr%3D1%26mnr%3D217239%26archiv_flag%3D2%26fv%3D1.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.amsel.de%2Fmultiple-sklerose-forum%2F%3Ftnr%3D1%26mnr%3D217239%26archiv_flag%3D2%26fv%3D1.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.mumsnet.com%2FTalk%2Fpregnancy%2F3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.mumsnet.com%2FTalk%2Fpregnancy%2F3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-is-too-long-to-wait-for-an-initial-con.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-is-too-long-to-wait-for-an-initial-con.json.gz -------------------------------------------------------------------------------- /data/forum/https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm%3Fsrcq%3Dcopd.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhgr/harvest/HEAD/data/forum/https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm%3Fsrcq%3Dcopd.json.gz -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO 4 | # - check release version number! 
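# (one possible check - a sketch, assuming the version string lives in
#  src/harvest/__init__.py, as it does for this package:
#    python3 -c "import sys; sys.path.insert(0, 'src'); from harvest import __version__; print(__version__)"
#  the printed version should match the release tag before uploading)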
 5 | 
 6 | # publish the latest package to pypi
 7 | # sources:
 8 | # - https://packaging.python.org/guides/distributing-packages-using-setuptools/#packaging-your-project
 9 | # - https://packaging.python.org/guides/making-a-pypi-friendly-readme/
10 | 
11 | # cleanup dist
12 | rm -rf ./dist
13 | 
14 | # build and verify packages
15 | python3 setup.py sdist bdist_wheel; twine check dist/*
16 | 
17 | # upload
18 | twine upload dist/*
--------------------------------------------------------------------------------
/src/harvest/date_search.py:
--------------------------------------------------------------------------------
 1 | import dateparser.search
 2 | import datetime
 3 | from harvest.config import LANGUAGES
 4 | 
 5 | 
 6 | def search_dates(text):
 7 |     results = dateparser.search.search_dates(text, languages=LANGUAGES, settings={'RETURN_AS_TIMEZONE_AWARE': False})
 8 |     valid_dates = []
 9 |     if results is not None:
10 |         for result in results:
11 |             if result[1] > datetime.datetime(1993, 4, 30):  # discard dates too old to be valid forum posts
12 |                 valid_dates.append(result)
13 | 
14 |     return valid_dates
15 | 
--------------------------------------------------------------------------------
/tests/unit/harvest/test_date_search.py:
--------------------------------------------------------------------------------
 1 | from harvest.date_search import search_dates
 2 | import datetime
 3 | 
 4 | 
 5 | def test_date_found_by_external_library():
 6 |     result = search_dates("asdfad 25-February-2012 21:46 afd adsf")
 7 |     assert len(result) == 1
 8 |     assert result[0][0] == "25-February-2012 21:46"
 9 |     assert result[0][1] == datetime.datetime(2012, 2, 25, 21, 46)
10 | 
11 | 
12 | def test_date_found_by_external_library_is_too_old():
13 |     result = search_dates("asdfad 29-April-1993 21:46 afd adsf")
14 |     assert len(result) == 0
15 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/file.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import logging
 4 | import os
 5 | 
 6 | from json import dump
 7 | from urllib.parse import urlparse
 8 | 
 9 | logging.getLogger().setLevel(logging.INFO)
10 | 
11 | 
12 | def get_file_path(url, result_directory):
13 |     url = urlparse(url).netloc + urlparse(url).path + urlparse(url).params
14 |     return os.path.join(result_directory, f'{url.replace("/", ".")}.json')
15 | 
16 | 
17 | def write_to_json(url, result_directory, document):
18 |     result_fname = get_file_path(url, result_directory)
19 |     if not os.path.exists(result_directory):
20 |         os.makedirs(result_directory)
21 |     if not os.path.isfile(result_fname):
22 |         with open(result_fname, "w") as f2:
23 |             dump(document, f2, indent=True)
24 | 
--------------------------------------------------------------------------------
/src/harvest/post_text.py:
--------------------------------------------------------------------------------
 1 | from inscriptis import get_text
 2 | 
 3 | WORDS_TO_IGNORE_DE = {'cookies', 'startseite', 'datenschutzerklärung', 'impressum', 'nutzungsbedingungen',
 4 |                       'registrieren'}
 5 | WORDS_TO_IGNORE_EN = {'forum home', 'sign in', 'sign up'}
 6 | WORDS_TO_IGNORE = WORDS_TO_IGNORE_DE.union(WORDS_TO_IGNORE_EN)
 7 | 
 8 | 
 9 | def get_cleaned_text(html):
10 |     text_sections = []
11 |     text = get_text(html)
12 |     for comment in (c for c in text.split("\n") if c.strip()):
13 |         if [word for word in WORDS_TO_IGNORE if word in comment.lower()]:
14 |             continue
15 |         elif 'copyright' not in comment.lower() and '©' not in comment.lower() and 'powered by' not in comment.lower():
16 |             text_sections.append(comment.strip())
17 |         else:
18 |             break
19 |     return text_sections
--------------------------------------------------------------------------------
/src/harvest/metadata/usertext.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Get the xpath to extract only the text of a post
 3 | '''
 4 | import re
 5 | import logging
 6 | 
 7 | 
 8 | def get_text_xpath_pattern(dom, post_xpath, posts):
 9 |     """
10 |     Get the xpath to extract only the text of a post
11 | 
12 |     Args:
13 |       - dom: the forum's DOM object
14 |       - post_xpath: the determined post xpath
15 |       - posts: the extracted posts
16 |     """
17 | 
18 |     text_xpath = re.sub(r"\/\.\.", "", post_xpath)
19 |     while True:
20 |         text_elements = dom.xpath(text_xpath)
21 |         if len(text_elements) == len(posts):
22 |             return text_xpath
23 |         if len(text_elements) < len(posts) or len(text_elements) <= 1:
24 |             logging.warning(f'text xPath not found for {post_xpath}')
25 |             return post_xpath
26 |         text_xpath = text_xpath + '/..'
27 | 
--------------------------------------------------------------------------------
/tests/test_webservice.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | 
 4 | 
 5 | def query():
 6 |     service_url = 'http://localhost:5000/dragnet_extract_from_html'
 7 | 
 8 |     with open('./corpus/goldDocuments/blog.angelman-asa.org.read.php.json') as gold_document:
 9 |         data = json.load(gold_document)
10 |         test_url = data['url']
11 |         test_html = data['html']
12 |         test_text = data['text']
13 |         test_annotations = data['gold_standard_annotation']
14 | 
15 |     data = {'url': test_url, 'html': test_html, 'text': test_text, 'annotations': test_annotations}
16 | 
17 |     try:
18 |         response = requests.post(service_url, json=data)
19 |     except Exception as exception:
20 |         print(f"Query failed: {exception}")
21 |         return None
22 | 
23 |     response_dict = json.loads(response.text)
24 |     print(f"Response: {response_dict['entities']}")
25 |     return response_dict
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     query()
30 | 
--------------------------------------------------------------------------------
/scripts/serialize_test_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | from json import dump
 4 | 
 5 | from os.path import exists
 6 | from urllib.request import urlopen, Request
 7 | from urllib.parse import quote_plus
 8 | 
 9 | import datetime
10 | import gzip
11 | import shutil
12 | 
13 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
14 | 
15 | with open("test-urls.lst") as f:
16 |     for url in (u.strip() for u in f):
17 |         dst = quote_plus(url)
18 |         if exists("../data/" + dst + ".json") or url.startswith('#') or not url.strip():  # skip comments, blanks and already serialized URLs
19 |             continue
20 | 
21 |         print("Retrieving", url)
22 |         try:
23 |             req = Request(url, data=None, headers={'User-Agent': USER_AGENT})
24 |             http = urlopen(req)
25 |             content_type = http.getheader('content-type')
26 |             if content_type and 'charset=' in content_type:
27 |                 encoding = content_type.split('charset=')[1]
28 |             else:
29 |                 encoding = 'utf8'
30 |             html = http.read().decode(encoding)
31 | 
32 |             with open("../data/" + dst + ".json", 'w') as f:
33 |                 dump({'url': url, 'crawled': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'html': html}, f)
34 |             with open("../data/" + dst + ".json", 'rb') as f, \
35 |                     gzip.open('../data/forum/' + dst + ".json.gz", 'wb') as fgzip:
36 |                 shutil.copyfileobj(f, fgzip)
37 |         except IOError:
38 | with open("failed.lst", "a") as f: 39 | f.write(url + "\n") 40 | -------------------------------------------------------------------------------- /tests/unit/harvest/test_utils.py: -------------------------------------------------------------------------------- 1 | from harvest.utils import get_merged_xpath 2 | 3 | 4 | def test_get_merge_xpath(): 5 | xpaths = [r'//div[@class="post post-even"]/a[not(*) and string-length(text()) = 0]', 6 | r'//div[@class="post-odd"]/a[not(*) and string-length(text()) = 0]', 7 | r'//a[@class="user-name"][not(*) and string-length(text()) > 0]'] 8 | merged_xpath = get_merged_xpath(xpaths) 9 | assert len(merged_xpath) == 1 10 | assert merged_xpath[0] == r"//div[(contains(@class, 'post') and contains(@class, 'post-even')) or " \ 11 | r"(contains(@class, 'post-odd'))]" \ 12 | r"/a[not(*) and string-length(text()) = 0]" 13 | 14 | 15 | def test_get_merge_xpath_same_classes(): 16 | xpaths = [r'//div[@class="post post-even"]/a[not(*) and string-length(text()) = 0]', 17 | r'//div[@class="post post-odd"]/a[not(*) and string-length(text()) = 0]', 18 | r'//a[@class="user-name"][not(*) and string-length(text()) > 0]'] 19 | merged_xpath = get_merged_xpath(xpaths) 20 | assert len(merged_xpath) == 1 21 | assert merged_xpath[0] == r"//div[(contains(@class, 'post'))]" \ 22 | r"/a[not(*) and string-length(text()) = 0]" 23 | 24 | 25 | def test_get_merge_xpath_with_no_merges(): 26 | xpaths = [r'//div[@class="post post-odd"]/a[not(*) and string-length(text()) = 0]', 27 | r'//a[@class="user-name"][not(*) and string-length(text()) > 0]'] 28 | merged_xpath = get_merged_xpath(xpaths) 29 | assert not merged_xpath 30 | -------------------------------------------------------------------------------- /tests/unit/harvest/cleanup/test_forum_post.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Test classes 5 | ''' 6 | 7 | from harvest.cleanup.forum_post import remove_suffix, remove_prefix, remove_boilerplate 8 | 9 | 10 | def test_remove_suffix(): 11 | post_list = ['Good day', 'Good Saturday', 'Good Wednesday'] 12 | assert remove_suffix(post_list) == post_list 13 | 14 | post_list2 = ['Good day [Reply - to]', 'Good Saturday [Reply - to]', 'Good Wednesday [Reply - to]'] 15 | assert remove_suffix(post_list2) == post_list 16 | 17 | assert remove_prefix(post_list) == ['day', 'Saturday', 'Wednesday'] 18 | 19 | 20 | # 21 | # tests based on reported errors 22 | # 23 | 24 | def test_missing_message(): 25 | ''' 26 | the following string got completely removed by cleaning. 27 | ''' 28 | s = [ 29 | "Add message | Report paperplant Thu 21-Nov-19 11:07:27 Following as non-white woman - sounds really interesting, thanks for posting. Can't say much as I've only experienced the booking/sickle cell test and in my hospital we're offered the BCG vaccine as routine. My area is about 50% South Asian ethnicity though.", 30 | "Add message | Report Lweji Thu 21-Nov-19 11:27:03 Is that actually true? Anatomy, etc? Have you found evidence other than being told about it by a midwife?"] 31 | 32 | assert remove_boilerplate(s) == [ 33 | "paperplant Thu 21-Nov-19 11:07:27 Following as non-white woman - sounds really interesting, thanks for posting. Can't say much as I've only experienced the booking/sickle cell test and in my hospital we're offered the BCG vaccine as routine. My area is about 50% South Asian ethnicity though.", 34 | "Lweji Thu 21-Nov-19 11:27:03 Is that actually true? Anatomy, etc? 
Have you found evidence other than being told about it by a midwife?"]
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # This is a basic workflow to help you get started with Actions
 2 | 
 3 | name: build
 4 | 
 5 | # Controls when the action will run. Triggers the workflow on push or pull request
 6 | # events but only for the main branch
 7 | on:
 8 |   push:
 9 |     branches: [ main ]
10 |   pull_request:
11 |     branches: [ main ]
12 | 
13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
14 | jobs:
15 |   # This workflow contains a single job called "build"
16 |   build:
17 |     runs-on: ubuntu-latest
18 |     strategy:
19 |       matrix:
20 |         python-version: [3.6, 3.7, 3.8]
21 | 
22 |     steps:
23 |     - uses: actions/checkout@v2
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v2
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Install dependencies
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         pip install flake8
32 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
33 |     - name: Lint with flake8
34 |       run: |
35 |         # stop the build if there are Python syntax errors or undefined names
36 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pip install pytest-cov
42 |         python -m pytest --rootdir tests --cov=harvest --cov-report=xml
43 |     - name: Upload coverage to Codecov
44 |       uses: codecov/codecov-action@v1
45 |       with:
46 |         token: ${{ secrets.CODECOV_TOKEN }}
47 |         file: ./coverage.xml
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import sys
 5 | from setuptools import setup, find_packages
 6 | from os import path
 7 | 
 8 | here = path.abspath(path.dirname(__file__))
 9 | sys.path.insert(0, path.join(here, 'src'))
10 | 
11 | from harvest import (__version__, __author__, __author_email__, __license__)
12 | 
13 | this_directory = path.abspath(path.dirname(__file__))
14 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
15 |     long_description = f.read()
16 | 
17 | setup(
18 |     # Metadata
19 |     name="harvest-webforum",
20 |     version=__version__,
21 |     description='A toolkit for extracting posts and post metadata from web forums',
22 |     long_description=long_description,
23 |     long_description_content_type='text/markdown',
24 |     author=__author__,
25 |     author_email=__author_email__,
26 |     python_requires='>=3.6',
27 |     classifiers=[
28 |         'Intended Audience :: Developers',
29 |         'License :: OSI Approved :: Apache Software License',
30 |         'Topic :: Text Processing',
31 |         'Topic :: Text Processing :: Markup :: HTML',
32 |         'Topic :: Utilities',
33 |         'Programming Language :: Python :: 3',
34 |         'Programming Language :: Python :: 3.6',
35 |         'Programming Language :: Python :: 3.7',
36 |         'Programming Language :: Python :: 3.8',
37 |     ],
38 |     license=__license__,
39 |     package_dir={'': 'src'},
40 | 
41 |     # Package List
42 |     packages=find_packages('src'),
43 | 
44 |     # Scripts
45 |     scripts=[
46 |         './scripts/extract_to_csv.py',
47 |         
'./scripts/serialize_test_data.py' 48 | ], 49 | 50 | # Requirements 51 | install_requires=[ 52 | 'lxml', 53 | 'requests', 54 | 'dateparser', 55 | 'numpy', 56 | 'inscriptis', 57 | 'flask', 58 | 'fuzzywuzzy' 59 | ] 60 | ) 61 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/script/generate-single_post.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import logging 5 | import os 6 | from collections import defaultdict 7 | from glob import glob 8 | from json import load 9 | 10 | from corpus.createGoldDocuments.file import write_to_json, get_file_path 11 | 12 | logging.getLogger().setLevel(logging.INFO) 13 | 14 | parser = argparse.ArgumentParser(description='Forum harvester - generate gold standard documents with only one post') 15 | parser.add_argument('gold_document_path', metavar='gold_document_path', help='Path to the gold documents') 16 | parser.add_argument('--result-directory', dest='result_directory', help='Optional directory for storing final results.') 17 | parser.add_argument('--corpus-include-string', dest='corpus_include_string', 18 | help='Optionally restrict the input corpus to URLs that match the corpus include string.') 19 | 20 | args = parser.parse_args() 21 | 22 | result = defaultdict(list) 23 | for no, fname in enumerate(glob(args.gold_document_path + "*.json")): 24 | with open(fname) as f: 25 | forum = load(f) 26 | if (args.corpus_include_string and args.corpus_include_string not in forum['url']) \ 27 | or os.path.isfile(get_file_path(forum['url'], args.result_directory)): 28 | continue 29 | 30 | logging.info("Start creating final gold standard document with only one post for " + forum['url']) 31 | 32 | single_post = " ".join([a['post_text']['surface_form'] for a in forum['gold_standard_annotation']]) 33 | start_index = forum['gold_standard_annotation'][0]['post_text']['start'] 34 | end_index = forum['gold_standard_annotation'][-1]['post_text']['end'] 35 | 36 | forum['gold_standard_annotation'] = [{ 37 | "post_text": { 38 | "surface_form": single_post, 39 | "start": start_index, 40 | "end": end_index 41 | } 42 | }] 43 | 44 | write_to_json(os.path.basename(fname), args.result_directory, forum) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Harvest - A toolkit for extracting posts and post metadata from web forums 2 | 3 | [![Actions Status](https://github.com/fhgr/harvest/workflows/build/badge.svg)](https://github.com/fhgr/harvest/actions) 4 | [![codecov](https://codecov.io/gh/fhgr/harvest/branch/main/graph/badge.svg)]( 5 | https://codecov.io/gh/fhgr/harvest) 6 | [![PyPI version](https://badge.fury.io/py/harvest-webforum.svg)](https://badge.fury.io/py/harvest-webforum) 7 | 8 | Automatic extraction of forum posts and metadata is a challenging task since forums do not expose their content in a standardized structure. Harvest performs this task reliably for many web forums and offers an easy way to extract data from web forums. 
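The result of an extraction is a plain dictionary. A sketch of its shape, as built by `extract_data` (see the example in the Python library section below); the metadata keys are only present when the corresponding pattern could be located:

```python
# shape of the dictionary returned by harvest.extract_data (values shortened)
{'posts': [
    {'post_text': '...',   # the text of the extracted post
     'datetime': '...',    # the post's date, if found
     'user': '...',        # the post's author, if found
     'post_link': '...'}   # the URL of the individual post, if found
]}
```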
 9 | 
10 | ## Installation
11 | 
12 | At the command line:
13 | ```bash
14 | $ pip install harvest-webforum
15 | ```
16 | 
17 | If you want to install from the latest sources, you can do:
18 | ```bash
19 | $ git clone https://github.com/fhgr/harvest.git
20 | $ cd harvest
21 | $ python3 setup.py install
22 | ```
23 | 
24 | ## Python library
25 | Embedding harvest into your code is easy, as outlined below:
26 | ```python
27 | from urllib.request import urlopen, Request
28 | from harvest import extract_data
29 | 
30 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
31 | 
32 | url = "https://forum.videolan.org/viewtopic.php?f=14&t=145604"
33 | req = Request(url, headers={'User-Agent': USER_AGENT})
34 | html = urlopen(req).read().decode('utf-8')
35 | 
36 | result = extract_data(html, url)
37 | print(result)
38 | ```
39 | 
40 | ## WEB-FORUM-52 gold standard
41 | The [corpus](corpus/goldDocuments) currently contains gold standard documents from 52 different web forums. These documents are also used by harvest's integration tests.
42 | 
43 | ## Publication
44 | 
45 | * Weichselbraun, Albert, Brasoveanu, Adrian M. P., Waldvogel, Roger and Odoni, Fabian. (2020). ["Harvest - An Open Source Toolkit for Extracting Posts and Post Metadata from Web Forums"](https://arxiv.org/abs/2102.02240). IEEE/WIC/ACM International Joint Conference on Web Intelligence and Intelligent Agent Technology (WI-IAT 2020), Melbourne, Australia, Accepted 27 October 2020.
46 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/script/final_processing.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import logging
 5 | import os
 6 | 
 7 | from glob import glob
 8 | from json import load
 9 | from collections import defaultdict
10 | from corpus.createGoldDocuments.file import write_to_json, get_file_path
11 | from corpus.createGoldDocuments.calculate_position import get_start_end_for_post
12 | 
13 | logging.getLogger().setLevel(logging.INFO)
14 | 
15 | parser = argparse.ArgumentParser(description='Forum harvester - generate final gold standard documents')
16 | parser.add_argument('pre_gold_document_path', metavar='pre_gold_document_path',
17 |                     help='Path to the pre processed gold documents')
18 | parser.add_argument('--result-directory', dest='result_directory', help='Optional directory for storing final results.')
19 | parser.add_argument('--corpus-include-string', dest='corpus_include_string',
20 |                     help='Optionally restrict the input corpus to URLs that match the corpus include string.')
21 | 
22 | args = parser.parse_args()
23 | 
24 | result = defaultdict(list)
25 | for no, fname in enumerate(glob(args.pre_gold_document_path + "*.json")):
26 |     with open(fname) as f:
27 |         forum = load(f)
28 |         if (args.corpus_include_string and args.corpus_include_string not in forum['url']) \
29 |                 or os.path.isfile(get_file_path(forum['url'], args.result_directory)):
30 |             continue
31 | 
32 |         logging.info("Start creating final gold standard document for " + forum['url'])
33 |         search_start_index = 0
34 |         all_indexes_found = True
35 |         for post in forum['gold_standard_annotation']:
36 |             max_index = get_start_end_for_post(post, forum['text'], search_start_index)
37 |             if max_index > -1:
38 |                 search_start_index = max_index
39 |             else:
40 |                 all_indexes_found = False
41 |         if all_indexes_found:
42 |             write_to_json(forum['url'], args.result_directory, forum)
43 |             logging.info('Gold standard document successfully created')
44 |         else:
45 |             logging.warning('Not all indexes found. Check the pre-processed file again.')
46 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/README.md:
--------------------------------------------------------------------------------
 1 | # Instructions for creating gold standard documents
 2 | ## Gold standard document format
 3 | ```
 4 | {
 5 |    "id": "",
 6 |    "url": "",
 7 |    "html": "...",
 8 |    "text": "Only the text of the HTML. Referenced below as the full text.",
 9 |    "gold_standard_annotation": [{
10 |        "post_text": {"surface_form": "...", "start": 200, "end": 555},
11 |        "datetime": {"surface_form": "02-March-2012 00:58", "start": 10, "end": 29},
12 |        "user": {"surface_form": "http://blog.angelman-asa.org/profile.php?2,1606", "start": 30, "end": 77},
13 |        "post_link": {"surface_form": "msg-772", "start": 100, "end": 107}
14 |    }]
15 | }
16 | ```
17 | ## Instructions
18 | 1. First, download the forum pages for which you want to create a gold document by running `scripts/serialize_test_data.py`. Before doing so, add the URL of the forum page to the file `data/forum/test-urls.lst` or uncomment it (remove the `#` at the beginning of the line).
19 | 2. Next, create a first version of the gold document with `script/pre_processing.py`. Example command:
20 | `python3 pre_processing.py ./data/forum/ --result-directory ./goldDocumentsPre/`
21 | 3. The next step is to clean up the following elements of the document:
22 | `datetime.surface_form, user.surface_form, post_link.surface_form`
23 | Correct or add any elements that were not recognized correctly. For the user, the profile link is used if available, otherwise the displayed name.
24 | 4. Now run `python3 remove_link.py ./goldDocumentsPre/`. This removes all links from the full text of the gold document except those of `user` and `post_link`.
25 | 5. Now clean up the `post_text.surface_form` elements in the gold document. These must be found in the full text and must match the correct post text.
26 | 6. Now run `python3 final_processing.py ./goldDocumentsPre/ --result-directory ./goldDocuments/`. If all elements are prepared correctly, a start and end position is found for each element. If this is not the case, the log shows the message "Not found in text"; correct the pre-processed document accordingly and generate the final document again.
27 | 7. Finally, commit the documents with git and push them to the repository.
28 | 
29 | **Note**: the `final_processing.py` and `pre_processing.py` scripts do not overwrite existing documents.
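A finished gold standard document can be sanity-checked with a few lines of Python; a sketch, using one of the documents shipped in `goldDocuments`:

```python
from json import load

with open('corpus/goldDocuments/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json') as f:
    gold = load(f)

for annotation in gold['gold_standard_annotation']:
    for name, entity in annotation.items():
        # every surface form must match the span it annotates in the full text
        assert gold['text'][entity['start']:entity['end']] == entity['surface_form'], name
```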
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/script/remove_link.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import logging
 5 | import re
 6 | 
 7 | from glob import glob
 8 | from json import load, dump
 9 | from collections import defaultdict
10 | 
11 | logging.getLogger().setLevel(logging.INFO)
12 | 
13 | 
14 | def remove_unused_links(text, links_to_keep):
15 |     pattern = re.compile(r'( \* )?\[[^\]]*\]\((http(s)?:\/)?\/[^\)]*\)')
16 |     start_index = 0
17 |     while start_index > -1:
18 |         link_match = pattern.search(text, start_index)
19 |         if link_match:
20 |             link_extracted = re.search(r'(http(s)?:\/)?\/[^\)]*', link_match.group(0))
21 |             if link_extracted and link_extracted.group(0) not in links_to_keep:
22 |                 logging.info(f'Removed {link_match.group(0)}')
23 |                 only_text = re.search(r'\[.*\]', link_match.group(0))
24 |                 text = text[:link_match.start()] + only_text.group(0)[1:-1] + text[link_match.end():]
25 |             else:
26 |                 start_index = link_match.end()
27 |         else:
28 |             start_index = -1
29 |     return text
30 | 
31 | 
32 | parser = argparse.ArgumentParser(description='Forum harvester - remove unused links from the text')
33 | parser.add_argument('pre_gold_document_path', metavar='pre_gold_document_path',
34 |                     help='Path to the pre processed gold documents')
35 | parser.add_argument('--corpus-include-string', dest='corpus_include_string',
36 |                     help='Optionally restrict the input corpus to URLs that match the corpus include string.')
37 | 
38 | args = parser.parse_args()
39 | 
40 | result = defaultdict(list)
41 | for no, fname in enumerate(glob(args.pre_gold_document_path + "*.json")):
42 |     with open(fname, "r") as f:
43 |         forum = load(f)
44 |         if args.corpus_include_string and args.corpus_include_string not in forum['url']:
45 |             continue
46 | 
47 |         logging.info("Remove unused links for " + forum['url'])
48 |         link_user = set(x['user']['surface_form'] for x in forum['gold_standard_annotation'] if
49 |                         'user' in x)
50 |         link_post = set(x['post_link']['surface_form'] for x in forum['gold_standard_annotation'] if
51 |                         'post_link' in x)
52 | 
53 |         forum['text'] = remove_unused_links(forum['text'], link_user.union(link_post))
54 |         with open(fname, "w") as f2:
55 |             dump(forum, f2, indent=True)
56 | 
--------------------------------------------------------------------------------
/src/harvest/cleanup/forum_post.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | '''
 4 | Removes common prefixes and suffixes from forum posts.
 5 | '''
 6 | 
 7 | import logging
 8 | 
 9 | def compute_common_suffix_count(post_list):
10 |     '''
11 |     Returns:
12 |         int: The number of common suffix terms.
13 |     '''
14 |     confirmed_suffix_terms = []
15 |     for suffix_term in reversed(post_list[0].split(' ')):
16 |         new_suffix = ' ' + ' '.join([suffix_term] + confirmed_suffix_terms)
17 |         for post in post_list:
18 |             if not post.endswith(new_suffix):
19 |                 return len(confirmed_suffix_terms)
20 |         confirmed_suffix_terms.insert(0, suffix_term)
21 | 
22 |     return len(confirmed_suffix_terms)
23 | 
24 | 
25 | def remove_suffix(post_list):
26 |     '''
27 |     Removes common suffixes from the posts in the list.
28 |     '''
29 |     suffix_count = compute_common_suffix_count(post_list)
30 |     if suffix_count == 0:
31 |         return post_list
32 |     return [' '.join(posts.split(' ')[:-suffix_count]) for posts in post_list]
33 | 
34 | 
35 | def compute_common_prefix_count(post_list):
36 |     '''
37 |     Returns:
38 |         int: The number of common prefix terms.
39 |     '''
40 |     confirmed_prefix_terms = []
41 |     for prefix_term in post_list[0].split(' '):
42 |         new_prefix = ' '.join(confirmed_prefix_terms + [prefix_term]) + ' '
43 |         for post in post_list:
44 |             if not post.startswith(new_prefix):
45 |                 return len(confirmed_prefix_terms)
46 |         confirmed_prefix_terms.append(prefix_term)
47 | 
48 |     return len(confirmed_prefix_terms)
49 | 
50 | 
51 | def remove_prefix(post_list):
52 |     '''
53 |     Removes common prefixes from the posts in the list.
54 |     '''
55 |     prefix_count = compute_common_prefix_count(post_list)
56 |     if prefix_count == 0:
57 |         return post_list
58 |     return [' '.join(posts.split(' ')[prefix_count:]) for posts in post_list]
59 | 
60 | 
61 | def remove_boilerplate(post_list):
62 |     '''
63 |     Removes common prefixes and suffixes from the posts in the list.
64 |     '''
65 |     prefix_count = compute_common_prefix_count(post_list)
66 |     suffix_count = compute_common_suffix_count(post_list)
67 |     logging.info(f'Removing {prefix_count} prefix and {suffix_count} suffix terms.')
68 |     if prefix_count == 0 and suffix_count == 0:
69 |         return post_list
70 |     suffix_count = -suffix_count if suffix_count != 0 else None
71 |     return [' '.join(posts.split(' ')[prefix_count:suffix_count]) for posts in post_list]
72 | 
--------------------------------------------------------------------------------
/data/forum/test-urls.lst:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## approach: mirror all - extract content from forum posts
 3 | ##           provide flags to obtain views that are accessible to the crawler
 4 | ##           - problems: recrawls
 5 | ##           - disambiguation and URL assignment (just number posts and add a #n)?
 6 | 
 7 | 
 8 | ##
 9 | ## multiple sclerosis
10 | ##
11 | 
12 | # overview pages and comments on forum posts
13 | https://www.medhelp.org/posts/Multiple-Sclerosis/Positive-ANA-Test/show/1123552
14 | # overview pages
15 | https://www.msconnection.org/Discussions/f33/t77364/tp1/How-long-is-too-long-to-wait-for-an-initial-con
16 | https://shift.ms/topic/cbd-oil-11
17 | # not a real forum, but rather a blog with single posts (!) - very hard, since the layout is absolutely non-standard
18 | https://www.uninterrupted.org.au/blog-category/my-ms-journey
19 | # nested overview pages
20 | https://www.msworld.org/forum/showthread.php?145403-Sort-of-new-here
21 | # easily parseable view; the default is a tree structure that is hardly parseable
22 | https://www.amsel.de/multiple-sklerose-forum/?tnr=1&mnr=217239&archiv_flag=2&fv=1
23 | 
24 | ##
25 | ## emerging viruses
26 | ##
27 | https://www.medhelp.org/posts/Infectious-Diseases/Noro-or-other-virus/show/1881254
28 | 
29 | ##
30 | ## parkinson
31 | ##
32 | 
33 | # provides json-ld
34 | https://healthunlocked.com/parkinsonsmovement/posts/142058845/artane-anyone
35 | # post overview page
36 | https://www.healingwell.com/community/default.aspx?f=34&m=4099304
37 | # distinguishes between questions and posts >>> problem: questions are not part of the forum structure
38 | https://www.medhelp.org/posts/Heart-Disease/Wolfe-Parkinson-White-Syndrome/show/250747
39 | # mailing list
40 | https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5231
41 | 
42 | 
43 | ##
44 | ## angelman
45 | ##
46 | 
47 | # implementation via google search not feasible; but direct mirroring of the forum's content // first post missing
48 | https://www.mumsnet.com/Talk/pregnancy/3749275-Pregnant-with-a-black-mixed-race-with-black-baby
49 | # DONE & works - overview pages
50 | http://blog.angelman-asa.org/read.php?2,736
51 | # beautiful standard forum :)))
52 | https://forums.maladiesraresinfo.org/post11011.html#p11011
53 | 
54 | ##
55 | ## COPD - not yet classified; choosing some samples only
56 | ##
57 | 
58 | https://www.medschat.com/Discuss/how-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm?srcq=copd
59 | # search interface has completely changed
60 | https://community.scope.org.uk/discussion/57774/copd
61 | 
--------------------------------------------------------------------------------
/src/harvest/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Automatic extraction of forum posts and metadata is a challenging task since forums do not expose their content in a
 3 | standardized structure. Harvest performs this task reliably for many web forums and offers an easy way to extract data
 4 | from web forums.
 5 | Example::
 6 |     from urllib.request import urlopen
 7 |     from harvest import extract_data
 8 |     url = 'https://forum.videolan.org/viewtopic.php?f=14&t=145604'
 9 |     html = urlopen(url).read().decode('utf-8')
10 |     result = extract_data(html, url)
11 |     print(result)
12 | """
13 | 
14 | __author__ = 'Albert Weichselbraun, Roger Waldvogel'
15 | __author_email__ = 'albert.weichselbraun@fhgr.ch, roger.waldvogel@fhgr.ch'
16 | __copyright__ = '2019-2020 Albert Weichselbraun, Roger Waldvogel'
17 | __license__ = 'Apache-2.0'
18 | __version__ = '1.1.0'
19 | __status__ = 'Prototype'
20 | 
21 | try:
22 |     import re
23 |     from lxml.html import fromstring
24 | 
25 |     from harvest import posts
26 |     from harvest.extract import extract_posts
27 | 
28 | except ImportError:
29 |     import warnings
30 | 
31 |     warnings.warn(
32 |         "Missing dependencies - harvest has not been properly installed")
33 | 
34 | RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')
35 | 
36 | 
37 | def extract_data(html, url):
38 |     """
39 |     Extracts posts and their metadata from a forum page.
40 |     Args:
41 |         html (string): the HTML of the web forum page
42 |         url (string): the URL of the page
43 |     Returns:
44 |         Dictionary: the extracted posts with their metadata
45 |     """
46 |     extract_post_result = posts.extract_posts(html, url)
47 |     extraction_results = extract_posts(html, url, extract_post_result['text_xpath_pattern'],
48 |                                        extract_post_result['url_xpath_pattern'],
49 |                                        extract_post_result['date_xpath_pattern'],
50 |                                        extract_post_result['user_xpath_pattern'],
51 |                                        result_as_datetime=False)
52 | 
53 |     final_results = []
54 |     for extraction_result in extraction_results:
55 |         entity = {'post_text': extraction_result.post}
56 |         if hasattr(extraction_result, 'date'):
57 |             entity['datetime'] = extraction_result.date
58 |         if hasattr(extraction_result, 'url'):
59 |             entity['post_link'] = extraction_result.url
60 |         if hasattr(extraction_result, 'user'):
61 |             entity['user'] = extraction_result.user
62 |         final_results.append(entity)
63 | 
64 |     return {"posts": final_results}
65 | 
--------------------------------------------------------------------------------
/corpus/createGoldDocuments/calculate_position.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from fuzzywuzzy import fuzz
 3 | 
 4 | logging.getLogger().setLevel(logging.INFO)
 5 | 
 6 | 
 7 | def _add_start_end_fuzzy_search(element_to_add, text, sub_text, start_index, move_text_left=0):
 8 |     # anchor the fuzzy match on the first 60 characters of the post text
 9 |     # (shifted right by move_text_left) and on its last 60 characters
10 |     sub_text_start = sub_text[move_text_left:move_text_left + 60]
11 |     sub_text_end = sub_text[-60:]
12 |     sub_text_start_index = text.find(sub_text_start, start_index)
13 |     if sub_text_start_index > -1:
14 |         sub_text_end_index = text.find(sub_text_end, sub_text_start_index)
15 |         if sub_text_end_index > -1:
16 |             sub_text_end_index = sub_text_end_index + len(sub_text_end)
17 |             matched_text = text[sub_text_start_index:sub_text_end_index]
18 |             if fuzz.ratio(matched_text, sub_text) > 70:
19 |                 element_to_add['start'] = sub_text_start_index
20 |                 element_to_add['end'] = sub_text_end_index
21 |                 return element_to_add['end']
22 |     elif move_text_left < len(sub_text) - 60:
23 |         # the start anchor was not found; shift it to the right and retry
24 |         return _add_start_end_fuzzy_search(element_to_add, text, sub_text, start_index, move_text_left + 20)
25 |     logging.warning(f'Not found in text:\n{sub_text}')
26 |     return -1
27 | 
28 | 
29 | def _add_start_end(element_to_add, text, sub_text, start_index, fuzzy_search=False):
30 |     if isinstance(sub_text, str):
31 |         found_start_index = text.find(sub_text, start_index)
32 |         if found_start_index > -1:
33 |             element_to_add['start'] = found_start_index
34 |             element_to_add['end'] = found_start_index + len(sub_text)
35 |             return found_start_index + len(sub_text)
36 |         elif fuzzy_search and len(sub_text) > 150:
37 |             return _add_start_end_fuzzy_search(element_to_add, text, sub_text, start_index)
38 |         else:
39 |             logging.warning(f'Not found in text:\n{sub_text}')
40 |             return -1
41 |     return -1
42 | 
43 | 
44 | def get_start_end_for_post(post, full_text, search_start_index, fuzzy_search=False):
45 |     index_post_text = _add_start_end(post['post_text'], full_text, post['post_text']['surface_form'],
46 |                                      search_start_index, fuzzy_search)
47 | 
48 |     if 'datetime' in post:
49 |         _add_start_end(post['datetime'], full_text,
50 |                        post['datetime']['surface_form'], search_start_index)
51 |     if 'user' in post:
52 |         _add_start_end(post['user'], full_text,
53 |                        post['user']['surface_form'], search_start_index)
54 |     if 'post_link' in post:
55 |         _add_start_end(post['post_link'], full_text,
56 |                        post['post_link']['surface_form'], search_start_index)
57 |     # _add_start_end already returns the end offset of the post text (or -1)
58 |     return index_post_text
59 | 
--------------------------------------------------------------------------------
/scripts/webservice.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module provides a web interface for orbis-eval [https://github.com/orbis-eval],
 3 | which computes the recall, precision and F1 scores of the extraction.
4 | """ 5 | 6 | from flask import Flask 7 | from flask import request 8 | from flask import jsonify 9 | 10 | import harvest.posts as posts 11 | import harvest.extract as extract 12 | from corpus.createGoldDocuments.calculate_position import get_start_end_for_post 13 | 14 | app = Flask('harvest') 15 | 16 | 17 | @app.route('/extract_from_html', methods=['POST']) 18 | def events(): 19 | forum = request.json 20 | post_0 = posts.extract_posts(forum['html'], forum['url']) 21 | 22 | if 'gold_standard_format' in forum and forum['gold_standard_format']: 23 | results = [] 24 | else: 25 | results = {'entities': {}} 26 | if post_0['text_xpath_pattern']: 27 | search_start_index = 0 28 | for post_1 in extract.extract_posts( 29 | forum['html'], 30 | forum['url'], 31 | post_0['text_xpath_pattern'], 32 | post_0['url_xpath_pattern'], 33 | post_0['date_xpath_pattern'], 34 | post_0['user_xpath_pattern'], result_as_datetime=False): 35 | 36 | post_dict = { 37 | 'user': {'surface_form': post_1.user}, 38 | 'datetime': {'surface_form': post_1.date}, 39 | 'post_link': {'surface_form': post_1.url}, 40 | 'post_text': {'surface_form': post_1.post} 41 | } 42 | 43 | doc_id = forum['url'] 44 | 45 | if 'gold_standard_format' in forum and forum['gold_standard_format']: 46 | results.append(post_dict) 47 | else: 48 | if 'text' in forum: 49 | new_search_start_index = get_start_end_for_post(post_dict, forum['text'], search_start_index, 50 | fuzzy_search=True) 51 | if new_search_start_index > 0: 52 | search_start_index = new_search_start_index 53 | 54 | results['entities'][doc_id] = results['entities'].get(doc_id, []) 55 | for item in ['user', 'datetime', 'post_link', 'post_text']: 56 | result = { 57 | 'doc_id': doc_id, 58 | 'type': item, 59 | 'surface_form': post_dict[item]['surface_form'] 60 | } 61 | if 'start' in post_dict[item] and 'end' in post_dict[item]: 62 | result['start'] = post_dict[item]['start'] 63 | result['end'] = post_dict[item]['end'] 64 | 65 | results['entities'][doc_id].append(result) 66 | 67 | return jsonify(results) 68 | 69 | 70 | def get_flask_app(): 71 | return app 72 | 73 | 74 | if __name__ == '__main__': 75 | app.run(port=5000, debug=True) 76 | -------------------------------------------------------------------------------- /corpus/createGoldDocuments/script/pre_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import gzip 5 | import logging 6 | import hashlib 7 | 8 | from glob import glob 9 | from json import load 10 | from inscriptis import get_text 11 | from inscriptis.model.config import ParserConfig 12 | from collections import defaultdict 13 | from harvest import posts 14 | from harvest.extract import extract_posts 15 | from urllib.parse import urlparse 16 | 17 | from corpus.createGoldDocuments.file import write_to_json 18 | 19 | logging.getLogger().setLevel(logging.INFO) 20 | 21 | parser = argparse.ArgumentParser(description='Forum harvester - generate gold standard document for further processing') 22 | parser.add_argument('corpus_path', metavar='corpus_path', help='Path to the input corpus') 23 | parser.add_argument('--result-directory', dest='result_directory', help='Optional directory for storing json results.') 24 | parser.add_argument('--corpus-include-string', dest='corpus_include_string', 25 | help='Optionally restrict the input corpus to URLs that match the corpus include string.') 26 | 27 | args = parser.parse_args() 28 | 29 | result = defaultdict(list) 30 | for no, fname in 
enumerate(glob(args.corpus_path + "*.json.gz")):
31 |     opener = gzip.open if fname.endswith(".gz") else open
32 |     with opener(fname) as f:
33 |         forum = load(f)
34 |         domain = urlparse(forum['url']).netloc
35 |         if args.corpus_include_string and args.corpus_include_string not in forum['url']:
36 |             continue
37 | 
38 |         logging.info("Processing " + forum['url'])
39 |         postXPath = posts.extract_posts(forum['html'], forum['url'])
40 |         if postXPath['text_xpath_pattern']:
41 |             config = ParserConfig(display_links=True, display_anchors=True)
42 |             text = get_text(forum['html'], config)
43 |             text = " ".join([c.strip() for c in text.split("\n") if c.strip()])
44 |             document = {"id": f"i{int(hashlib.md5(forum['url'].encode('utf-8')).hexdigest(), 16)}",
45 |                         "url": forum['url'], "html": forum['html'], "text": text, "gold_standard_annotation": []}
46 | 
47 |             if args.result_directory:
48 |                 for post in extract_posts(forum['html'], forum['url'],
49 |                                           postXPath['text_xpath_pattern'],
50 |                                           postXPath['url_xpath_pattern'],
51 |                                           postXPath['date_xpath_pattern'],
52 |                                           postXPath['user_xpath_pattern'], result_as_datetime=False):
53 |                     post_element = {"post_text": {"surface_form": post.post},
54 |                                     "datetime": {"surface_form": post.date},
55 |                                     "user": {"surface_form": post.user}}
56 |                     if postXPath['url_xpath_pattern']:
57 |                         post_element["post_link"] = {"surface_form": post.url}
58 |                     document["gold_standard_annotation"].append(post_element)
59 | 
60 |                 write_to_json(forum['url'], args.result_directory, document)
61 |         else:
62 |             logging.error(f'Could not process {forum["url"]}')
63 | 
--------------------------------------------------------------------------------
/scripts/extract_to_csv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import gzip
 5 | import logging
 6 | import os
 7 | 
 8 | from glob import glob
 9 | from json import load, dump
10 | from csv import writer
11 | from collections import defaultdict
12 | from harvest import posts
13 | from harvest.extract import extract_posts
14 | from urllib.parse import urlparse
15 | 
16 | logging.getLogger().setLevel(logging.INFO)
17 | 
18 | 
19 | def extract_to_csv():
20 |     parser = argparse.ArgumentParser(
21 |         description='Forum harvester - extracts and harvests posts + metadata from Web forums')
22 |     parser.add_argument('corpus_path', metavar='corpus_path', help='Path to the input corpus')
23 |     parser.add_argument('output_file', metavar='output_file', help='Output file for the parser\'s results.')
24 | 
25 |     parser.add_argument('--result-directory', dest='result_directory',
26 |                         help='Optional directory for storing CSV results.')
27 |     parser.add_argument('--debug-directory', dest='debug_directory', help='Optional directory for debug information.')
28 |     parser.add_argument('--corpus-include-string', dest='corpus_include_string',
29 |                         help='Optionally restrict the input corpus to URLs that match the corpus include string.')
30 | 
31 |     args = parser.parse_args()
32 | 
33 |     result = defaultdict(list)
34 | 
35 |     for no, fname in enumerate(glob(args.corpus_path + "*.json.gz")):
36 |         logging.info(fname)
37 |         opener = gzip.open if fname.endswith(".gz") else open
38 |         with opener(fname) as f:
39 |             forum = load(f)
40 |             domain = urlparse(forum['url']).netloc
41 |             if args.corpus_include_string and args.corpus_include_string not in forum['url']:
42 |                 continue
43 | 
44 |             if args.debug_directory:
45 |                 debug_fname = os.path.join(args.debug_directory, "{}-{}.html".format(no, domain))
46 |                 with open(debug_fname, "w") as g:
47 |                     
g.write(forum['html']) 48 | 49 | logging.info("Processing " + forum['url']) 50 | extract_post_result = posts.extract_posts(forum['html'], forum['url']) 51 | result[domain].append(extract_post_result) 52 | 53 | if args.result_directory and extract_post_result['text_xpath_pattern']: 54 | result_fname = os.path.join(args.result_directory, f'{domain}.csv') 55 | with open(result_fname, 'a+') as g: 56 | csvwriter = writer(g) 57 | if os.stat(result_fname).st_size == 0: 58 | csvwriter.writerow(['forum_link', 'post_link', 'user', 'date', 'post']) 59 | for post in extract_posts(forum['html'], forum['url'], 60 | extract_post_result['text_xpath_pattern'], 61 | extract_post_result['url_xpath_pattern'], 62 | extract_post_result['date_xpath_pattern'], 63 | extract_post_result['user_xpath_pattern']): 64 | csvwriter.writerow([forum['url'], post.url, post.user, post.date, post.post]) 65 | 66 | with open(args.output_file, "w") as f: 67 | dump(result, f, indent=True) 68 | 69 | 70 | if __name__ == '__main__': 71 | extract_to_csv() 72 | -------------------------------------------------------------------------------- /src/harvest/similarity_calculator.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | from harvest.utils import get_xpath_tree_text 4 | import logging 5 | import re 6 | import numpy as np 7 | 8 | VSM_MODEL_SIZE = 5000 9 | 10 | # tags that are not allowed to be part of a forum xpath (lowercase) 11 | BLACKLIST_TAGS = ('option', 'footer', 'form', 'head', 'tfoot') 12 | REWARDED_CLASSES = ('content', 'message', 'post', 'wrapper') 13 | 14 | 15 | def _text_to_vsm(text): 16 | ''' 17 | translates a text into the vector space model 18 | using the hashing trick. 19 | 20 | VSM_MODEL_SIZE determines the size of the vsm. 21 | ''' 22 | vms = np.full(VSM_MODEL_SIZE, 0) 23 | for word in text.split(): 24 | index = word.__hash__() % VSM_MODEL_SIZE 25 | vms[index] += 1 26 | return vms 27 | 28 | 29 | def _descendants_contain_blacklisted_tag(xpath, dom, blacklisted_tags): 30 | descendants = set([t.tag for t in chain(*[e.iterdescendants() for e in dom.xpath(xpath)])]) 31 | for tag in blacklisted_tags: 32 | if tag in descendants: 33 | return True 34 | return False 35 | 36 | 37 | def _ancestors_contains_blacklisted_tag(xpath_string, blacklisted_tags): 38 | """ 39 | returns 40 | ------- 41 | True, if the xpath_string (i.e. the ancestors) contains any blacklisted_tag 42 | """ 43 | xpath = xpath_string.split("/") 44 | for tag in blacklisted_tags: 45 | if tag in xpath: 46 | return True 47 | return False 48 | 49 | 50 | def _ancestors_contains_class(xpath, rewarded_classes): 51 | classes_x_path = re.findall(r"(?!.*\[)@class=\".*\"", xpath) 52 | if classes_x_path: 53 | classes = [x.lower() for x in list(filter(None, re.sub(r"@class=|\"", "", classes_x_path[-1]).split(" ")))] 54 | for html_class in classes: 55 | for rewarded_class in rewarded_classes: 56 | if rewarded_class in html_class: 57 | return True 58 | 59 | 60 | def assess_node(reference_content, dom, xpath, reward_classes=False): 61 | """ 62 | returns 63 | ------- 64 | a metric that is based on 65 | (i) the vector space model and 66 | (ii) the number of returned elements 67 | (iii) whether the descendants contain any blacklisted tags 68 | to assess whether the node is likely to be part of a forum post. 
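    The metric is returned as a tuple: (similarity score, number of elements matching the given xpath).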
69 | """ 70 | if xpath == "//" or _descendants_contain_blacklisted_tag(xpath, dom, BLACKLIST_TAGS): 71 | return 0., 1 72 | 73 | xpath_content_list = get_xpath_tree_text(dom, xpath) 74 | xpath_element_count = len(xpath_content_list) 75 | 76 | reference_vsm = _text_to_vsm(reference_content) 77 | xpath_vsm = _text_to_vsm(' '.join(xpath_content_list)) 78 | 79 | divisor = (np.linalg.norm(reference_vsm) * np.linalg.norm(xpath_vsm)) 80 | if not divisor: 81 | logging.warning("Cannot compute similarity - empty reference (%s) or xpath (%ss) text.", reference_content, 82 | ' '.join(xpath_content_list)) 83 | return 0., 1 84 | similarity = np.dot(reference_vsm, xpath_vsm) / divisor 85 | 86 | # discount any node that contains BLACKLIST_TAGS 87 | if _ancestors_contains_blacklisted_tag(xpath, BLACKLIST_TAGS): 88 | similarity /= 10 89 | elif reward_classes and _ancestors_contains_class(xpath, REWARDED_CLASSES): 90 | similarity += 0.1 91 | return similarity, xpath_element_count 92 | -------------------------------------------------------------------------------- /src/harvest/metadata/link.py: -------------------------------------------------------------------------------- 1 | """ 2 | link 3 | ---- 4 | 5 | Tries to obtain the URL of the given post 6 | determine post URL 7 | ------------------ 8 | * relevant tags: (href or name) 9 | * point to the same domain, or even better also to the same page (without parameters) 10 | * appear always in the same element 11 | """ 12 | import logging 13 | import re 14 | 15 | from collections import defaultdict 16 | from urllib.parse import urlparse, urljoin 17 | 18 | from harvest.utils import get_xpath_expression, get_xpath_expression_child_filter, get_merged_xpath, extract_text 19 | 20 | 21 | def _get_without_post_link(path): 22 | """ 23 | Used to handle case cases like post link /threads/deviantart-horrors.2366/post-145153 with forum link 24 | /threads/deviantart-horrors.2366 25 | Args: 26 | path: 27 | 28 | Returns: 29 | 30 | """ 31 | path_elements = path.split('/') 32 | if len([x for x in path_elements if x.strip() != '']) > 2: 33 | new_path = "/".join(path_elements[:-1]) 34 | return new_path 35 | 36 | return path 37 | 38 | 39 | def _get_link_representation(element): 40 | if extract_text(element): 41 | return extract_text(element) 42 | elif 'href' in element.attrib: 43 | return element.attrib['href'] 44 | return '' 45 | 46 | 47 | def _is_counting_up(candidates): 48 | for xpath, matches in candidates.items(): 49 | post_ids = [re.search(r'\d+', _get_link_representation(x)) for x in matches['elements']] 50 | if all(post_ids): 51 | post_ids = [int(x.group(0)) for x in post_ids] 52 | if all(x < y for x, y in zip(post_ids, post_ids[1:])): 53 | matches['score'] += 1 54 | 55 | 56 | def _get_link(dom, post_elements, base_url, forum_posts): 57 | ''' 58 | Obtains the URL to the given post. 
59 | ''' 60 | url_candidates = defaultdict(lambda: {'elements': [], 61 | 'has_anchor_tag': False, 'score': 0}) 62 | 63 | # collect candidate paths 64 | for element in post_elements: 65 | for tag in element.iterdescendants(): 66 | if tag.tag == 'a': 67 | xpath = get_xpath_expression(tag) 68 | xpath += get_xpath_expression_child_filter(tag) 69 | # anchor tags with the name attribute will 70 | # lead to the post 71 | attributes = list(attr.lower() for attr in tag.attrib) 72 | if 'name' in attributes: 73 | url_candidates[xpath]['has_anchor_tag'] = True 74 | if 'name' in attributes or 'href' in attributes: 75 | url_candidates[xpath]['elements'].append(tag) 76 | 77 | # merge xpath 78 | for merged_xpath in get_merged_xpath(url_candidates.keys()): 79 | merged_elements = dom.xpath(merged_xpath) 80 | if merged_elements: 81 | url_candidates[merged_xpath]['elements'] = merged_elements 82 | if 'name' in (attr.lower() for attr in merged_elements[0].attrib): 83 | url_candidates[merged_xpath]['has_anchor_tag'] = True 84 | 85 | # filter candidate paths 86 | for xpath, matches in list(url_candidates.items()): 87 | # consider the number of posts or the number of posts + 2 spare for possible header elements 88 | if len(forum_posts) - len(matches['elements']) not in range(0, 3): 89 | del url_candidates[xpath] 90 | 91 | # filter candidates that contain URLs to other domains and 92 | # record the urls' targets 93 | forum_url = urlparse(base_url) 94 | for xpath, matches in list(url_candidates.items()): 95 | for match in matches['elements']: 96 | parsed_url = urlparse(urljoin(forum_url.scheme + "://" + forum_url.netloc, match.attrib.get('href', ''))) 97 | if parsed_url.netloc != forum_url.netloc: 98 | del url_candidates[xpath] 99 | break 100 | 101 | if _get_without_post_link(parsed_url.path) not in forum_url.path: 102 | del url_candidates[xpath] 103 | break 104 | 105 | _is_counting_up(url_candidates) 106 | 107 | # obtain the most likely url path 108 | for xpath, _ in sorted(url_candidates.items(), 109 | key=lambda x: (x[1]['has_anchor_tag'], x[1]['score']), 110 | reverse=True): 111 | return xpath 112 | 113 | return None 114 | 115 | 116 | def get_link(dom, post_xpath, base_url, forum_posts): 117 | ''' 118 | Args: 119 | dom: The DOM tree to analyze. 120 | post_xpath (str): xpath of the post to search dates. 121 | base_url (str): URL of the forum. 122 | Returns: 123 | str: the xpath to the post date. 124 | ''' 125 | 126 | logging.info('Start finding post link') 127 | post_elements = dom.xpath(post_xpath) 128 | while True: 129 | result = _get_link(dom, post_elements, base_url, forum_posts) 130 | if result or len(post_elements) <= 1: 131 | logging.info(f'Post link xpath: {result}') 132 | return result 133 | post_xpath = post_xpath + "/.." 
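        # no link xpath found yet - widen the scope to the posts' parent elements and retry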
134 |         post_elements = dom.xpath(post_xpath) 135 | -------------------------------------------------------------------------------- /src/harvest/metadata/date.py: -------------------------------------------------------------------------------- 1 | ''' 2 | date 3 | ---- 4 | 5 | Tries to obtain the date of the given post 6 | ''' 7 | import logging 8 | 9 | from collections import defaultdict 10 | from datetime import datetime 11 | from harvest.date_search import search_dates 12 | from dateutil import parser 13 | from lxml import etree 14 | 15 | from harvest.utils import (get_xpath_expression, get_cleaned_element_text, get_xpath_expression_child_filter, 16 |                            get_merged_xpath) 17 | 18 | MAX_DATE_LEN = 120 19 | 20 | 21 | def _get_date(dom, post_elements, base_url, forum_posts): 22 |     date_candidates = defaultdict(lambda: {'elements': [], 23 |                                            'most_recent_date': datetime.fromtimestamp(0),  # 1970 24 |                                            'lowermost_date': datetime.fromtimestamp(1E11),  # >5000 25 |                                            'chronological_order': True, 26 |                                            'same_size_posts': False, 27 |                                            'multiple_dates': False}) 28 |     # collect candidate paths 29 |     for element in post_elements: 30 |         for tag in element.iterdescendants(): 31 |             text = get_cleaned_element_text(tag) 32 |             # do not consider text longer than MAX_DATE_LEN relevant for date extraction 33 | 34 |             if (len(text) > MAX_DATE_LEN or not search_dates(text) or 35 |                     tag.tag is etree.Comment) and not (tag.tag == 'time' and 'datetime' in tag.attrib): 36 |                 continue 37 | 38 |             xpath = get_xpath_expression(tag, parent_element=element, single_class_filter=True) 39 |             xpath += get_xpath_expression_child_filter(tag) 40 |             date_candidates[xpath]['elements'].append(tag) 41 | 42 |     # merge xpath 43 |     for merged_xpath in get_merged_xpath(date_candidates.keys()): 44 |         merged_elements = dom.xpath(merged_xpath) 45 |         if merged_elements: 46 |             date_candidates[merged_xpath]['elements'] = merged_elements 47 | 48 |     # filter candidate paths that do not yield a date for every post 49 |     for xpath, matches in list(date_candidates.items()): 50 |         # consider the number of posts or the number of posts + 2 spare for possible header elements 51 |         if len(forum_posts) - len(matches['elements']) not in range(0, 3): 52 |             del date_candidates[xpath] 53 | 54 |     # Set if same length as posts 55 |     for xpath, matches in list(date_candidates.items()): 56 |         if len(forum_posts) == len(matches['elements']): 57 |             matches['same_size_posts'] = True 58 | 59 |     # rank candidates based on the following criteria 60 |     # - they must yield a date for every post 61 |     # - we choose the candidate with the most recent date 62 |     #   (to distinguish between "post" and "member since" dates) 63 |     for xpath, matches in list(date_candidates.items()): 64 |         previous_date = datetime.min 65 |         for match in matches['elements']: 66 |             if match.tag == 'time': 67 |                 time = match.attrib.get('datetime', '') 68 |                 extracted_dates = [(time, parser.parse(time, ignoretz=True))] 69 |             else: 70 |                 extracted_dates = search_dates(get_cleaned_element_text(match)) 71 | 72 |             if not extracted_dates: 73 |                 del date_candidates[xpath] 74 |                 break 75 | 76 |             if len(extracted_dates) > 1: 77 |                 date_candidates[xpath]['multiple_dates'] = True 78 |                 date_candidates[xpath]['most_recent_date'] = max(date_candidates[xpath]['most_recent_date'], 79 |                                                                  max([date[1] for date in extracted_dates])) 80 |                 date_candidates[xpath]['lowermost_date'] = min(date_candidates[xpath]['lowermost_date'], 81 |                                                                min([date[1] for date in extracted_dates])) 82 | 83 |                 if previous_date > max([date[1] for
date in extracted_dates]): 84 | date_candidates[xpath]['chronological_order'] = False 85 | else: 86 | date_candidates[xpath]['most_recent_date'] = max(date_candidates[xpath]['most_recent_date'], 87 | extracted_dates[0][1]) 88 | date_candidates[xpath]['lowermost_date'] = min(date_candidates[xpath]['lowermost_date'], 89 | extracted_dates[0][1]) 90 | if previous_date > extracted_dates[0][1]: 91 | date_candidates[xpath]['chronological_order'] = False 92 | 93 | previous_date = date_candidates[xpath]['most_recent_date'] 94 | 95 | # obtain the most likely url path 96 | for xpath, _ in sorted(date_candidates.items(), 97 | key=lambda x: (x[1]['same_size_posts'], x[1]['chronological_order'], 98 | x[1]['most_recent_date']), 99 | reverse=True): 100 | return xpath 101 | 102 | return None 103 | 104 | 105 | # strategy 106 | # -------- 107 | # * obtain all xpaths that have date information 108 | # - extract the one which contains most likely the date (otherwise no date-xpath is returned) 109 | 110 | # * extract all dates from the date-xpath 111 | # * select the one that 112 | # - uses the same format and 113 | # - are newer (!= join date) 114 | 115 | def get_date(dom, post_xpath, base_url, forum_posts): 116 | ''' 117 | Args: 118 | dom: The DOM tree to analyze. 119 | post_xpath (str): xpath of the post to search dates. 120 | base_url (str): URL of the forum. 121 | Returns: 122 | str: the xpath to the post date. 123 | ''' 124 | logging.info('Start finding post date') 125 | post_elements = dom.xpath(post_xpath) 126 | while True: 127 | result = _get_date(dom, post_elements, base_url, forum_posts) 128 | if result or len(post_elements) <= 1: 129 | logging.info(f'Post date xpath: {result}') 130 | return result 131 | post_xpath = post_xpath + "/.." 132 | post_elements = dom.xpath(post_xpath) 133 | -------------------------------------------------------------------------------- /tests/integration/harvest/test_posts_xpath.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import gzip 4 | from json import load 5 | from harvest.posts import extract_posts 6 | 7 | 8 | @pytest.fixture 9 | def load_test_data(): 10 | def _load_test_data(file_name): 11 | file_path = os.path.join(os.path.dirname(__file__), '../../../data/forum', file_name) 12 | with gzip.open(file_path) as f: 13 | return load(f) 14 | 15 | return _load_test_data 16 | 17 | 18 | def test_extract_posts_forum_shift_ms(load_test_data): 19 | forum_test_data = load_test_data("https%3A%2F%2Fshift.ms%2Ftopic%2Fcbd-oil-11.json.gz") 20 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 21 | 22 | assert post['url'] == 'https://shift.ms/topic/cbd-oil-11' 23 | assert post['xpath_pattern'] == '//div[@class="bbp-reply-content"]/../..' 24 | assert post['url_xpath_pattern'] is None 25 | assert post['date_xpath_pattern'] == '//div/div/div[@class="bbp-reply-date"][not(*) and string-length(text()) > 0]' 26 | assert post['user_xpath_pattern'] == \ 27 | '//div/div/a[@class="bbp-author-name"][not(*) and string-length(text()) > 0]' 28 | 29 | 30 | def test_extract_posts_forum_healingwell(load_test_data): 31 | forum_test_data = load_test_data( 32 | "https%3A%2F%2Fwww.healingwell.com%2Fcommunity%2Fdefault.aspx%3Ff%3D34%26m%3D4099304.json.gz") 33 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 34 | 35 | assert post['url'] == 'https://www.healingwell.com/community/default.aspx?f=34&m=4099304' 36 | assert post['xpath_pattern'] == '//div/div[@class="post-body"]/../../..' 
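    # the expected url xpath merges the forum's alternating post-even / post-odd row classes into one expression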
37 | assert post['url_xpath_pattern'] == \ 38 | '//div[(contains(@class, \'post-even\')) or (contains(@class, \'post-odd\'))]/a[not(*) and string-length(text()) = 0]' 39 | assert post['date_xpath_pattern'] == '//div/div/div[@class="posted"][not(*) and string-length(text()) > 0]' 40 | assert post['user_xpath_pattern'] == \ 41 | '//div/div/div/div/div/div/a[@class="user-name"][not(*) and string-length(text()) > 0]' 42 | 43 | 44 | def test_extract_posts_forum_medhelp(load_test_data): 45 | forum_test_data = load_test_data("https%3A%2F%2Fwww.medhelp.org%2Fposts%2FHeart-Disease%2FWolfe-Parkinson-" 46 | "White-Syndrome%2Fshow%2F250747.json.gz") 47 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 48 | 49 | assert post['url'] == 'https://www.medhelp.org/posts/Heart-Disease/Wolfe-Parkinson-White-Syndrome/show/250747' 50 | assert post['xpath_pattern'] == '//div/div[@class="resp_body "]/..' 51 | assert post['url_xpath_pattern'] is None 52 | assert post[ 53 | 'date_xpath_pattern'] == '//div/div/div/time[@class="mh_timestamp"][not(*) and string-length(text()) = 0]' 54 | assert post['user_xpath_pattern'] == '//div/div/div[@class="username"]/a[span]' 55 | 56 | 57 | def test_extract_posts_forum_medschat(load_test_data): 58 | forum_test_data = load_test_data("https%3A%2F%2Fwww.medschat.com%2FDiscuss%2Fhow-important-is-this-medician-G-E-" 59 | "Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090" 60 | ".htm%3Fsrcq%3Dcopd.json.gz") 61 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 62 | 63 | assert post['url'] == 'https://www.medschat.com/Discuss/how-important-is-this-medician-G-E-Sulfamethoxazole-' \ 64 | 'TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm?srcq=copd' 65 | assert post['xpath_pattern'] == '//div/span[@class="search_results"]/../..' 66 | assert post['url_xpath_pattern'] == '//a[@class="action_bar_blue"][not(*) and string-length(text()) > 0]' 67 | assert post['date_xpath_pattern'] == '//div/span[@class="small soft"]/time[not(*) and string-length(text()) > 0]' 68 | assert post[ 69 | 'user_xpath_pattern'] == '//div[@class="list_item_b_content"]/strong[not(*) and string-length(text()) > 0]' 70 | 71 | 72 | def test_extract_posts_forum_msconnection(load_test_data): 73 | forum_test_data = load_test_data("https%3A%2F%2Fwww.msconnection.org%2FDiscussions%2Ff33%2Ft77364%2Ftp1%2FHow-long-" 74 | "is-too-long-to-wait-for-an-initial-con.json.gz") 75 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 76 | 77 | assert post['url'] == 'https://www.msconnection.org/Discussions/f33/t77364/tp1/How-long-is-too-long-to-wait-' \ 78 | 'for-an-initial-con' 79 | assert post['xpath_pattern'] == '//li/div[@class="discussion-post-body"]' 80 | assert post['url_xpath_pattern'] == None 81 | assert post['date_xpath_pattern'] == \ 82 | '//header/div/div[@class="discussion-post-meta-info"]/br[not(*) and string-length(text()) = 0]' 83 | assert post['user_xpath_pattern'] == '//header/div/div/a[@class="PostUser"][not(*) and string-length(text()) > 0]' 84 | 85 | 86 | def test_extract_posts_forum_msworld(load_test_data): 87 | forum_test_data = load_test_data("https%3A%2F%2Fwww.msworld.org%2Fforum%2Fshowthread.php%3F145403-" 88 | "Sort-of-new-here.json.gz") 89 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 90 | 91 | assert post['url'] == 'https://www.msworld.org/forum/showthread.php?145403-Sort-of-new-here' 92 | assert post['xpath_pattern'] == '//div/blockquote[@class="postcontent restore"]/../../../../..' 
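    # the /../../../../.. chain above climbs from the quoted post content up to the enclosing post container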
93 | assert post['url_xpath_pattern'] == '//a[@class="postcounter"][not(*) and string-length(text()) > 0]' 94 | assert post['date_xpath_pattern'] == '//div/div/span/span[@class="date"][span]' 95 | assert post['user_xpath_pattern'] == \ 96 | '//div/div/div/div/div/a[(contains(@class, \'popupctrl\') and contains(@class, \'username\'))][strong]' 97 | 98 | 99 | def test_extract_posts_forum_uninterrupted(load_test_data): 100 | forum_test_data = load_test_data("https%3A%2F%2Fwww.uninterrupted.org.au%2Fblog-category%2Fmy-ms-journey.json.gz") 101 | post = extract_posts(forum_test_data['html'], forum_test_data['url']) 102 | 103 | assert post['url'] == 'https://www.uninterrupted.org.au/blog-category/my-ms-journey' 104 | assert post['xpath_pattern'] == '//div[@class="field-content"]/../..' 105 | assert post['url_xpath_pattern'] is None 106 | assert post['date_xpath_pattern'] is None 107 | assert post['user_xpath_pattern'] == '//div/span/a[@class="username"][not(*) and string-length(text()) > 0]' 108 | -------------------------------------------------------------------------------- /src/harvest/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions that are shared across modules. 3 | ''' 4 | 5 | import re 6 | 7 | from lxml import etree 8 | 9 | VALID_NODE_TYPE_QUALIFIERS = ('class',) 10 | RE_FILTER_XML_HEADER = re.compile("<\\?xml version=\".*? encoding=.*?\\?>") 11 | 12 | 13 | def get_html_dom(html_content): 14 | ''' 15 | Params: 16 | html_content: the HTML page to retrieve the DOM from. 17 | 18 | Returns: 19 | The corresponding lxml document object model (DOM). 20 | ''' 21 | html = RE_FILTER_XML_HEADER.sub("", html_content) 22 | return etree.HTML(html) 23 | 24 | 25 | def extract_text(element): 26 | ''' 27 | Returns: 28 | str -- The text for the given element. 29 | ''' 30 | return ' '.join([t.strip() for t in element.itertext() if t.strip()]) 31 | 32 | 33 | def get_xpath_expression_child_filter(element): 34 | """ 35 | Returns: 36 | str -- The xpath expression to filter because of child element 37 | """ 38 | child_filter = "" 39 | children = element.getchildren() 40 | if len(children) == 1 and type(children[0].tag) == str: 41 | child_filter = "[" + children[0].tag + "]" 42 | elif element.text and element.text.strip() and not children: 43 | child_filter = "[not(*) and string-length(text()) > 0]" 44 | elif not element.text and not children: 45 | child_filter = "[not(*) and string-length(text()) = 0]" 46 | return child_filter 47 | 48 | 49 | def get_xpath_combinations_for_classes(x_path): 50 | """ 51 | Returns: 52 | array -- Possible xpath combinations of classes 53 | """ 54 | classes_x_path = re.findall(r"(?!.*\[)@class=\".*\"", x_path) 55 | xpath_combinations = [] 56 | if classes_x_path: 57 | classes = list(filter(None, re.sub(r"@class=|\"", "", classes_x_path[-1]).split(" "))) 58 | for html_class in classes: 59 | xpath_combinations.append( 60 | re.sub(r"(?!.*\[)@class=\".*\"\]", r"contains(concat(' ',@class,' '),' " + html_class + r" ')]", 61 | x_path)) 62 | if len(classes) > 1: 63 | new_classes = " and ".join(["contains(@class, \'" + x + "\')" for x in classes]) + "]" 64 | xpath_combinations.append(re.sub(r"(?!.*\[)@class=\".*\"\]", new_classes, x_path)) 65 | if not xpath_combinations: 66 | xpath_combinations = [x_path] 67 | return xpath_combinations 68 | 69 | 70 | def get_xpath_expression(element, parent_element=None, single_class_filter=False): 71 | ''' 72 | Returns: 73 | str -- The xpath expression for the given comment. 
74 | ''' 75 | xpath_list = [] 76 | has_class_filter = False 77 | 78 | while (not has_class_filter or parent_element is not None and element is not parent_element) \ 79 | and element is not None: 80 | without_class_filter = single_class_filter and has_class_filter 81 | xpath_expression = _get_xpath_element_expression(element, without_class_filter=without_class_filter) 82 | if not has_class_filter and "[" in xpath_expression: 83 | has_class_filter = True 84 | # Todo does this improve the detection overall? 85 | # if not has_class_filter: 86 | # xpath_expression = xpath_expression + "[not(@class)]" 87 | xpath_list.append(xpath_expression) 88 | 89 | element = element.getparent() 90 | 91 | xpath_list.reverse() 92 | return "//" + "/".join(xpath_list) 93 | 94 | 95 | def _get_xpath_element_expression(element, without_class_filter=False): 96 | ''' 97 | Returns: 98 | str -- The xpath expression for the given element. 99 | ''' 100 | attr_filter = None 101 | if not without_class_filter: 102 | attr_filter = " & ".join(['@%s="%s"' % (key, value) 103 | for key, value in element.attrib.items() 104 | if key in VALID_NODE_TYPE_QUALIFIERS]) 105 | return element.tag + "[%s]" % attr_filter if attr_filter else element.tag 106 | 107 | 108 | def get_xpath_tree_text(dom, xpath): 109 | ''' 110 | Args: 111 | xpath (str): The xpath to extract. 112 | Returns: 113 | list -- A list of text obtained by all elements matching the given 114 | xpath. 115 | ''' 116 | return [re.sub(r'\s\s+', ' ', extract_text(element)) for element in dom.xpath(xpath)] 117 | 118 | 119 | def get_cleaned_element_text(element): 120 | ''' 121 | Returns: 122 | str -- the text of the given element (without its children and 123 | punctuation). 124 | ''' 125 | return f'{element.text or ""} {element.tail or ""}'.replace(",", " ") \ 126 | .replace(";", " ").strip() 127 | 128 | 129 | def _get_classes_concat_with_and_condition(classes): 130 | return "(" + " and ".join(["contains(@class, \'" + x + "\')" for x in classes]) + ")" 131 | 132 | 133 | def _get_merged_classes_xpath_condition(classes, classes2): 134 | return "[" + _get_classes_concat_with_and_condition(classes) + " or " + \ 135 | _get_classes_concat_with_and_condition(classes2) + "]" 136 | 137 | 138 | def _get_classes(regex_class_detection, xpath): 139 | """ 140 | Args: 141 | regex_class_detection: regex to detect class 142 | xpath: xpath string to get classes 143 | 144 | Returns: list of classes 145 | """ 146 | classes = re.findall(regex_class_detection, xpath) 147 | return list(filter(None, re.sub(r"@class=|\"|\[|\]", "", classes[0]).split(" "))) 148 | 149 | 150 | def _get_merged_xpath(regex_class_detection, xpath, xpath_to_compare, merged_xpath): 151 | """ 152 | Args: 153 | regex_class_detection: Regex expression to look for class attributes 154 | xpath: xpath string 155 | xpath_to_compare: xpath string to compare with param xpath 156 | merged_xpath: dictionary with already merged xpath 157 | 158 | Returns: merged xpath if possible. 
If no match is found, None is returned 159 | 160 |     """ 161 |     xpath_without_class = re.sub(regex_class_detection, "", xpath) 162 |     xpath_to_compare_without_class = re.sub(regex_class_detection, "", xpath_to_compare) 163 |     if xpath_without_class == xpath_to_compare_without_class and xpath_to_compare not in merged_xpath: 164 |         classes = _get_classes(regex_class_detection, xpath) 165 |         classes_to_compare = _get_classes(regex_class_detection, xpath_to_compare) 166 |         same_classes = list(set(classes).intersection(classes_to_compare)) 167 |         if same_classes: 168 |             same_classes.sort() 169 |             return re.sub(regex_class_detection, "[" + _get_classes_concat_with_and_condition(same_classes) + "]", 170 |                           xpath) 171 | 172 |         if classes and classes_to_compare: 173 |             merged_xpath_classes = _get_merged_classes_xpath_condition(classes, classes_to_compare) 174 |             return re.sub(regex_class_detection, merged_xpath_classes, xpath) 175 | 176 | 177 | def get_merged_xpath(xpaths): 178 |     """ 179 |     Args: 180 |         xpaths: List of xpaths to look for xpaths which can be merged 181 | 182 |     Returns: A list with the merged xpath 183 |     """ 184 |     merged_xpaths = dict() 185 |     regex_class_detection = r"\[@class=\".*\"\]" 186 |     for xpath in xpaths: 187 |         if re.search(regex_class_detection, xpath): 188 |             for xpath_to_compare in [x for x in xpaths if x != xpath]: 189 |                 if re.search(regex_class_detection, xpath_to_compare): 190 |                     merged_xpath = _get_merged_xpath(regex_class_detection, xpath, xpath_to_compare, merged_xpaths) 191 |                     if merged_xpath: 192 |                         merged_xpaths[xpath] = merged_xpath 193 | 194 |     return list(merged_xpaths.values()) 195 | 196 | 197 | def get_grandparent(element): 198 |     if etree.iselement(element) and etree.iselement(element.getparent()) and \ 199 |             etree.iselement(element.getparent().getparent()): 200 |         return element.getparent().getparent() 201 | 202 | 203 | def elements_have_no_overlap(elements): 204 |     for element in elements: 205 |         for element_to_compare in [x for x in elements if x is not element]: 206 |             for child_element in element.iterdescendants(): 207 |                 if element_to_compare is child_element: 208 |                     return False 209 |     return True 210 | -------------------------------------------------------------------------------- /src/harvest/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Extracts posts and metadata from Web forums based on the xpaths provided 5 | by the machine learning component. 6 | 7 | - posts are extracted "as is" and sent through a boilerplate removal component 8 | - URL and user metadata is extracted 9 | - the post date is extracted based on the provided xpath + simple 10 |   pre-processing 11 | ''' 12 | 13 | from collections import namedtuple 14 | from datetime import datetime 15 | from operator import itemgetter 16 | from urllib.parse import urljoin, urlparse 17 | from dateparser.search import search_dates 18 | from dateutil import parser 19 | 20 | from harvest.utils import get_html_dom, get_xpath_tree_text, get_cleaned_element_text, extract_text 21 | 22 | from harvest.cleanup.forum_post import remove_boilerplate 23 | from harvest.config import LANGUAGES 24 | 25 | ExtractionResult = namedtuple('ExtractionResult', ('post', 'url', 'date', 26 |                                                    'user')) 27 | 28 | 29 | def _get_reference_url(element): 30 |     ''' 31 |     Returns either the URL to the given element (if the `name` attribute is 32 |     set) or the URL the element points to (if the `href` attribute is present).
33 | 34 |     Args: 35 |         element: The lxml element from which to extract the URL. 36 | 37 |     Returns: 38 |         str -- The URL that points to the element (name) or that the element 39 |         is pointing to (href) 40 |     ''' 41 |     if 'name' in element.attrib: 42 |         return f"#{element.attrib['name']}" 43 | 44 |     if 'href' in element.attrib: 45 |         return f"{element.attrib['href']}" 46 | 47 |     return None 48 | 49 | 50 | def _get_user_name(element): 51 |     ''' 52 |     Returns the user name for the given element: for anchor elements the 53 |     reference URL (`name` or `href` attribute), otherwise the element's cleaned text. 54 | 55 |     Args: 56 |         element: The lxml element from which to extract the user name. 57 | 58 |     Returns: 59 |         str -- the user name (for anchor elements, the user URL obtained 60 |         via _get_reference_url) 61 |     ''' 62 |     if element.tag == 'a': 63 |         return _get_reference_url(element) 64 |     else: 65 |         return extract_text(element) 66 | 67 | 68 | def _get_date_text(time_element, time_element_as_datetime=True): 69 |     is_tag_time = time_element.tag == 'time' 70 |     if is_tag_time and 'datetime' in time_element.attrib: 71 |         if time_element_as_datetime: 72 |             time = time_element.attrib.get('datetime', '') 73 |             parsed_time = parser.parse(time, ignoretz=True) 74 |             return is_tag_time, parsed_time 75 | 76 |     return is_tag_time, get_cleaned_element_text(time_element) 77 | 78 | 79 | def get_forum_date(dom, post_date_xpath, result_as_datetime=True): 80 |     ''' 81 |     Selects the date present in the given post_date_xpath. Future dates are 82 |     automatically filtered. If no date has been identified for a post, a None 83 |     value is inserted. 84 | 85 |     Args: 86 |         dom: the DOM representation of the forum page. 87 |         post_date_xpath (str): The xpath of the forum date. 88 |         result_as_datetime (bool): If True, the dates are returned as datetime objects; otherwise they are returned as strings. 89 | 90 |     Returns: 91 |         list -- A list of dates for every forum post. 92 |     ''' 93 |     result = [] 94 |     date_mentions = (_get_date_text(e, time_element_as_datetime=result_as_datetime) 95 |                      for e in dom.xpath(post_date_xpath) if 96 |                      e.tag == 'time' or search_dates(_get_date_text(e)[1], languages=LANGUAGES)) 97 |     for is_time_element, date_mention in date_mentions: 98 |         found = None 99 |         if is_time_element: 100 |             found = date_mention 101 |         else: 102 |             for data_as_string, date in sorted( 103 |                     search_dates(date_mention, settings={'RETURN_AS_TIMEZONE_AWARE': False}, languages=LANGUAGES), 104 |                     key=itemgetter(1), reverse=True): 105 |                 if date <= datetime.now(): 106 |                     if result_as_datetime: 107 |                         found = date 108 |                     else: 109 |                         found = data_as_string 110 |                     break 111 |         result.append(found) 112 | 113 |     return result 114 | 115 | 116 | def get_forum_url(dom, post_url_xpath): 117 |     ''' 118 |     Args: 119 |         dom: The DOM representation of the forum page. 120 |         post_url_xpath (str): The xpath to the post URL. 121 | 122 | 123 |     Returns: 124 |         list -- A list of all forum URLs. 125 |     ''' 126 |     return [_get_reference_url(element) 127 |             for element in dom.xpath(post_url_xpath)] 128 | 129 | 130 | def get_forum_user(dom, post_user_xpath): 131 |     ''' 132 |     Args: 133 |         dom: The DOM representation of the forum page. 134 |         post_user_xpath (str): The xpath to the post user name. 135 | 136 | 137 |     Returns: 138 |         list -- A list of all forum user names.
139 |     ''' 140 |     return [_get_user_name(element) 141 |             for element in dom.xpath(post_user_xpath)] 142 | 143 | 144 | def generate_forum_url(url, num_posts): 145 |     ''' 146 |     Generates forum URLs based on the forum base URL and the number of 147 |     posts. 148 | 149 |     Args: 150 |         url (str): the forum URL 151 |         num_posts (int): the number of posts for which to generate a URL 152 |     Returns: 153 |         list -- a list of URLs for the posts. 154 |     ''' 155 |     return [urljoin(url, f'#{no}') for no in range(1, num_posts + 1)] 156 | 157 | 158 | def _get_same_size_as_posts(length_forum_post, forum_element): 159 |     result = forum_element[-length_forum_post:] 160 |     if len(forum_element) != length_forum_post: 161 |         for _ in range(0, length_forum_post - len(forum_element)): 162 |             result.append(forum_element[0]) 163 |     return result 164 | 165 | 166 | def _get_container_elements(dom, xpath, number_of_posts): 167 |     post_elements = dom.xpath(xpath) 168 |     while True: 169 |         xpath = xpath + "/.." 170 |         new_post_elements = dom.xpath(xpath) 171 |         if new_post_elements is None or len(new_post_elements) < number_of_posts: 172 |             return post_elements 173 |         post_elements = new_post_elements 174 | 175 | 176 | def add_anonymous_user(dom, users, post_xpath, post_user_xpath): 177 |     posts = dom.xpath(post_xpath) 178 |     if len(posts) > len(users): 179 |         user_elements = dom.xpath(post_user_xpath) 180 |         posts = _get_container_elements(dom, post_xpath, len(posts)) 181 |         for index in range(len(posts)): 182 |             contains_user = False 183 |             for tag in posts[index].iterdescendants(): 184 |                 if tag in user_elements: 185 |                     contains_user = True 186 |                     break 187 |             if not contains_user: 188 |                 users.insert(index, "Anonymous") 189 |                 if len(posts) == len(users): 190 |                     break 191 | 192 | 193 | def extract_posts(html_content, url, post_xpath, post_url_xpath, 194 |                   post_date_xpath, post_user_xpath, result_as_datetime=True): 195 |     ''' 196 |     Returns: 197 |         list -- the extracted forum posts and their corresponding metadata (ExtractionResult tuples).
198 | ''' 199 | dom = get_html_dom(html_content) 200 | 201 | forum_posts = remove_boilerplate(get_xpath_tree_text(dom, post_xpath)) 202 | forum_urls = get_forum_url(dom, post_url_xpath) \ 203 | if post_url_xpath else generate_forum_url(url, len(forum_posts)) 204 | forum_dates = get_forum_date(dom, post_date_xpath, result_as_datetime=result_as_datetime) \ 205 | if post_date_xpath else len(forum_posts) * [''] 206 | forum_users = get_forum_user(dom, post_user_xpath) \ 207 | if post_user_xpath else len(forum_posts) * [''] 208 | 209 | add_anonymous_user(dom, forum_users, post_xpath, post_user_xpath) 210 | forum_urls = _get_same_size_as_posts(len(forum_posts), forum_urls) 211 | forum_dates = _get_same_size_as_posts(len(forum_posts), forum_dates) 212 | forum_users = _get_same_size_as_posts(len(forum_posts), forum_users) 213 | 214 | return [ExtractionResult(post, url, date, user) 215 | for post, url, date, user in zip(forum_posts, forum_urls, 216 | forum_dates, forum_users)] 217 | -------------------------------------------------------------------------------- /scripts/test-urls.lst: -------------------------------------------------------------------------------- 1 | #Various random forums 2 | 3 | #https://bbs.archlinux.org/viewtopic.php?id=249553 4 | #https://forum.ubuntuusers.de/topic/ubuntu-lst-18-04-newbie/ 5 | #https://forum.ubuntuusers.de/topic/appimage-programm-in-alle-programme-als-icon-a/ 6 | #https://forum.odroid.com/viewtopic.php?f=139&t=22170 7 | #https://forum.odroid.com/viewtopic.php?f=139&t=19897 8 | 9 | #Medical forums 10 | #http://blog.angelman-asa.org/read.php?2,736 11 | #http://blog.angelman-asa.org/read.php?2,132 12 | #https://community.scope.org.uk/discussion/57774/copd 13 | #https://community.scope.org.uk/discussion/68941/disabled-mum 14 | #https://forums.maladiesraresinfo.org/post11011.html#p11011 15 | #https://forums.maladiesraresinfo.org/credit-immobilier-maladie-rare-t2720.html 16 | #https://healthunlocked.com/parkinsonsmovement/posts/142058845/artane-anyone 17 | #https://healthunlocked.com/parkinsonsmovement/posts/143660160/the-radiograph-shows-calcium-deposits-in-the-joint-capsule.-the-other-radiograph-is-the-podiatrists-own-ankle-for-comparison. 
18 | #https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5231 19 | #https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5256 20 | #https://shift.ms/topic/cbd-oil-11 21 | #https://shift.ms/topic/news-on-myelin-repair 22 | #https://www.amsel.de/multiple-sklerose-forum/?tnr=1&mnr=217239&archiv_flag=2&fv=1 23 | #https://www.amsel.de/multiple-sklerose-forum/?tnr=1&mnr=221323&archiv_flag=2&fv=1 24 | #https://www.medhelp.org/posts/Heart-Disease/Wolfe-Parkinson-White-Syndrome/show/250747 25 | #https://www.medhelp.org/posts/Heart-Rhythm/Tikosyn-load-ablation/show/1640925 26 | #https://www.medschat.com/Discuss/how-important-is-this-medician-G-E-Sulfamethoxazole-TMP-DS-Tabitp-to-take-due-to-COPD-206090.htm?srcq=copd 27 | #https://www.medschat.com/Discuss/Nexium-drug-information-159060.htm 28 | #https://www.msconnection.org/Discussions/f33/t77364/tp1/How-long-is-too-long-to-wait-for-an-initial-con 29 | #https://www.msconnection.org/Discussions/f27/t79421/tp1/Does-this-sound-like-MS 30 | #https://www.msworld.org/forum/showthread.php?145403-Sort-of-new-here 31 | #https://www.msworld.org/forum/showthread.php?143493-FDA-Approes-Generic-20mg-AND-40MG 32 | #https://www.mumsnet.com/Talk/pregnancy/3749275-Pregnant-with-a-black-mixed-race-with-black-baby 33 | #https://www.mumsnet.com/Talk/adoptions/3940686-Siblings 34 | #http://www.paradisi.de/Health_und_Ernaehrung/Erkrankungen/Parkinson/Forum/120167.php 35 | #http://www.paradisi.de/Health_und_Ernaehrung/Erkrankungen/Parkinson/Forum/186517.php 36 | 37 | #Random Forums from https://en.wikipedia.org/wiki/List_of_Internet_forums 38 | 39 | #https://www.airliners.net/forum/viewtopic.php?f=3&t=1437935 40 | #https://www.airliners.net/forum/viewtopic.php?f=3&t=1428699 41 | #https://bpdfamily.com/message_board/index.php?topic=343886.0 42 | #https://bpdfamily.com/message_board/index.php?topic=344165.0 43 | #https://talk.collegeconfidential.com/student-here-ask-me-anything/2183693-got-into-nyu-pre-med-intention-ask-me-anything.html 44 | #https://talk.collegeconfidential.com/ivy-league/2184314-application-to-ivy-leagues.html 45 | #http://forum.ebaumsworld.com/viewtopic.php?f=14&t=42095&start=470 46 | #http://forum.ebaumsworld.com/viewtopic.php?f=14&t=78519 47 | #https://www.fanfiction.net/topic/146535/168685925/1/The-OC-Creation-and-Minor-Character-Information-Topic 48 | #https://www.fanfiction.net/topic/146535/108548484/1/The-About-the-World-Topic 49 | #https://www.gtplanet.net/forum/threads/f1-2018-general-discussion.378195/ 50 | #https://www.gtplanet.net/forum/threads/historic-cars-in-f1-2018-feel-slugish.387294/ 51 | #https://kiwifarms.net/threads/deviantart-horrors.2366/ 52 | #https://kiwifarms.net/threads/the-twitter-pedo-hunter-loli-crusader-community.64404/ 53 | #https://forums.macrumors.com/threads/se-or-11.2231616/ 54 | #https://forums.macrumors.com/threads/x-vs-8.2183765/ 55 | #https://forums.moneysavingexpert.com/discussion/5567669/unsure-whether-to-consolidate-please-advise 56 | #https://forums.moneysavingexpert.com/discussion/6100693/how-do-0-credit-card-balances-work-when-you-have-borrowed-twice 57 | #https://www.nairaland.com/5813456/how-snakes-get-into-toilet 58 | #https://www.nairaland.com/5812914/akeredolu-rejects-plot-impeach-deputy 59 | #https://forum.nationstates.net/viewtopic.php?f=4&t=170098 60 | #https://forum.nationstates.net/viewtopic.php?f=12&t=419 61 | #https://www.neowin.net/forum/topic/1393830-hello-everyone/ 62 | #https://www.neowin.net/forum/topic/1391546-hello-im-dion/ 63 | 
#https://www.pistonheads.com/gassing/topic.asp?h=0&f=239&t=1858583 64 | #https://www.pistonheads.com/gassing/topic.asp?h=0&f=156&t=1866139 65 | #http://skyscraperpage.com/forum/showthread.php?t=242327 66 | #http://skyscraperpage.com/forum/showthread.php?t=242165 67 | #https://forums.sherdog.com/threads/all-time-goat-poll.3916359/ 68 | #https://forums.sherdog.com/threads/free-fight-nick-diaz-debut.4102395/ 69 | 70 | 71 | #http://www.beliebte-foren.de/ 72 | 73 | #https://www.computerbase.de/forum/threads/ram-upgrade-auf-32-gb-fuer-3700x.1940201/ 74 | #https://www.computerbase.de/forum/threads/ram-empfehlung-fuer-ryzen.1940441/ 75 | #https://proxer.me/forum/142-anime/386798-kann-keine-anime-mehr-abspielen 76 | #https://proxer.me/forum/213-allgemein/386665-grammatikfehler-auf-der-seite 77 | #http://www.hifi-forum.de/viewthread-84-87.html 78 | #http://www.hifi-forum.de/viewthread-84-29928.html 79 | #https://www.android-hilfe.de/forum/samsung-allgemein.423/faq-diskussion-zum-kauf-samsung-galaxy-s10-s10e-s10-snapdragon-variante.904645.html 80 | #https://www.android-hilfe.de/forum/samsung-galaxy-s10-s10-s10e-s10-5g.3478/samsung-galaxy-s10e-s10-s10-zeigt-her-eure-homescreens.905512.html 81 | #https://www.drwindows.de/windows-7-allgemein/16340-zufall-entdeckte-problemlsungen.html 82 | #https://www.drwindows.de/windows-7-allgemein/167371-windows-7-dvd-iso-datei-umwandel.html 83 | #https://www.dslr-forum.de/showthread.php?t=1847412 84 | #https://www.dslr-forum.de/showthread.php?t=2016951 85 | #https://forum.mein-schoener-garten.de/viewtopic.php?f=1&t=4825193&sid=0e26c5b6c7cd9b067a6a5dc32896eebb 86 | #https://forum.mein-schoener-garten.de/viewtopic.php?f=1&t=4829305&sid=0e26c5b6c7cd9b067a6a5dc32896eebb 87 | #https://www.med1.de/forum/beruf-alltag-und-umwelt/corona-eine-gehypde-apokalypse-972190/ 88 | #https://www.med1.de/forum/blut-gefaesse-herz-lunge/sauerstoffsaettigung-nachts-969551/ 89 | #https://forum.digitalfernsehen.de/threads/df-hilferuf.416785/ 90 | #https://forum.digitalfernsehen.de/threads/erneuerbare-energie.413489/ 91 | #https://www.juraforum.de/forum/t/bettlaegerige-person-ohne-pflege-aus-krankenhaus-entlassen.678903/ 92 | #https://www.juraforum.de/forum/t/fahrtkostenerstattung-bei-falschen-rezepten.675629/ 93 | #https://www.musiker-board.de/threads/baubericht-0-14-ital-fichte-palisander.689167/ 94 | #https://www.musiker-board.de/threads/kopfplattenbruch-reparatur-lakewood-m48-custom.706841/ 95 | #https://forum.worldofplayers.de/forum/threads/1553036-Wie-aufwendig-ist-die-Arbeit-mit-vBulletin 96 | #https://forum.worldofplayers.de/forum/threads/1548322-Welchen-Blog-benutzt-man-in-2020 97 | #https://www.klamm.de/forum/f42/klamm-treff-jeder-lernt-die-stadt-von-jedem-kennen-327507.html 98 | #https://www.klamm.de/forum/f42/conventioncamp-in-hannover-341612.html 99 | #https://uhrforum.de/threads/der-yema-fotothread-und-nicht-nur-das.414009/ 100 | #https://uhrforum.de/threads/schachtel-fuer-mauthe-nr-50-322.432114/ 101 | #https://www.wohnmobilforum.de/w-t141583.html 102 | #https://www.wohnmobilforum.de/w-t141863.html 103 | #https://forum.glamour.de/t/nebenwirkungen-aknenormin/345148/2 104 | #https://forum.glamour.de/t/designertaschen-laber-laber/18136 105 | 106 | 107 | #Top forum examples from https://www.wpressblog.com/free-forum-posting-sites-list/ 108 | 109 | #https://www.cnet.com/forums/discussions/welcome-to-the-digital-camera-forum-315232/ 110 | #https://www.cnet.com/forums/discussions/select-camera/ 111 | #https://forum.wordreference.com/threads/attuned-to-the-reiki-symbols.3691417/ 112 | 
#https://forum.wordreference.com/threads/adding-accent-marks-accent-marks-are-mandatory-in-french.557434/ 113 | #https://forum.utorrent.com/topic/86747-help-us-build-the-next-great-bittorrent-product/ 114 | #https://forum.utorrent.com/topic/23012-check-on-startup/ 115 | #https://forum.xda-developers.com/showthread.php?t=2326393 116 | #https://forum.xda-developers.com/android/software/tool-tool-one-driversunlocktwrpfactory-t3358711 117 | #https://us.forums.blizzard.com/en/wow/t/layers-and-character-creation-adjustments-on-select-realms/499760 118 | #https://us.forums.blizzard.com/en/wow/t/can-i-transfer-back-to-locked-server-if-i-have-existing-character/505388 119 | #https://forum.videolan.org/viewtopic.php?f=14&t=92075 120 | #https://forum.videolan.org/viewtopic.php?f=14&t=145604 121 | #https://community.kaspersky.com/kaspersky-security-cloud-11/rootkit-scan-not-executed-6849 122 | #https://community.kaspersky.com/kaspersky-security-cloud-11/portuguese-in-free-version-8313 123 | #https://forum.statcounter.com/threads/custom-tags-examples.44340/ 124 | #https://forum.statcounter.com/threads/best-android-apps-in-uk-2019.79812/ 125 | #https://forums.futura-sciences.com/annonces-officielles/78761-moderateurs.html 126 | #https://forums.futura-sciences.com/annonces-officielles/12735-latex-debarque-fsg-explications-mode-demploi.html 127 | #https://forum.openoffice.org/en/forum/viewtopic.php?f=5&t=63160 128 | #https://forum.openoffice.org/en/forum/viewtopic.php?f=5&t=82202 129 | #https://community.bitdefender.com/en/discussion/82059/i-noticed-that-the-bitdefender-process-can-be-easily-killed 130 | #https://community.bitdefender.com/en/discussion/81455/how-to-disable-notification 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/harvest/posts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Forum Extraction AI and heuristic 4 | # --------------------------------- 5 | # (C)opyrights 2020 Albert Weichselbraun 6 | 7 | # simplifications: 8 | # ================ 9 | # - only consider tags with a class attribute 10 | # - vsm based on the hashing trick 11 | 12 | # algorithm 13 | # ========= 14 | # - match text to xpath nodes 15 | # - extract the text based on the xpath nodes and determine the best match 16 | # based on the node + its children 17 | # - from the best match that yields multiple results (i.e. forum posts) 18 | # select node parent elements as long as we still get the same number of 19 | # results. 20 | # - constraints 21 | # - blocked tags are not allowed to appear down- or upstream of the selected 22 | # path (e.g. it is not possible that a forum post contains a 'form' or 23 | # 'input' element :) 24 | # - there are forums that are contained in a form tag .... 25 | 26 | # cleanup posts 27 | # ------------- 28 | # * remove repeated elements 29 | # * appear at the beginning or end of a post 30 | # * may contain information on 31 | # - user 32 | # - date (subscription versus post date) => always compare dates within a page for computing the date extraction rule 33 | # - replies, likes, etc. 
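# usage sketch (illustrative only; `html` holds a fetched forum page and the
# URL below is an assumed placeholder):
#
#     >>> from harvest.posts import extract_posts
#     >>> result = extract_posts(html, 'https://example-forum.org/thread/1')
#     >>> result['xpath_pattern']   # xpath that matches one node per post
#     >>> result['forum_posts']     # boilerplate-free post texts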
34 | 35 | import logging 36 | import re 37 | 38 | from lxml import etree 39 | 40 | from harvest.cleanup.forum_post import remove_boilerplate 41 | from harvest.metadata.date import get_date 42 | from harvest.metadata.link import get_link 43 | from harvest.metadata.username import get_user 44 | from harvest.metadata.usertext import get_text_xpath_pattern 45 | from harvest.post_text import get_cleaned_text 46 | from harvest.similarity_calculator import assess_node 47 | from harvest.utils import (get_xpath_expression, get_html_dom, get_xpath_combinations_for_classes, 48 | get_xpath_tree_text, get_grandparent, elements_have_no_overlap) 49 | 50 | CORPUS = "./data/forum/" 51 | 52 | # number of characters required for a match 53 | MATCH_PREFIX_SIZE = 30 54 | 55 | BLACKLIST_POST_TEXT_TAG = ('h1', 'h2', 'h3', 'h4', 'h5', 'a') 56 | 57 | # minimum number of posts we suspect on the page 58 | MIN_POST_COUNT = 3 59 | 60 | 61 | def _get_matching_element(comment, dom): 62 | """ 63 | returns 64 | ------- 65 | the element that matches the given comment 66 | """ 67 | if not comment.strip(): 68 | return None 69 | 70 | for e in dom.iter(): 71 | text = (e.text or "").strip() 72 | min_length_of_text = len(comment[:MATCH_PREFIX_SIZE]) 73 | if text and comment.startswith(text[:MATCH_PREFIX_SIZE]) and len(text) >= min_length_of_text and \ 74 | e.tag is not etree.Comment: 75 | return e 76 | 77 | return None 78 | 79 | 80 | def _get_xpath_tree(comment, dom, tree): 81 | element = _get_matching_element(comment, dom) 82 | return (None, None) if element is None else (element, tree.getpath(element)) 83 | 84 | 85 | def _remove_trailing_p_element(xpath_score, xpath_element_count, xpath, reference_text, dom): 86 | """ 87 | The p elements at the end can be removed. Some posts have several p elements and some have none at all. 88 | Those without p element can then not be detected. As Example, leading post can not be detected: 89 | https://us.forums.blizzard.com/en/wow/t/layers-and-character-creation-adjustments-on-select-realms/499760 90 | 91 | Args: 92 | xpath: the xpath to remove the p element from 93 | 94 | Returns: 95 | 96 | """ 97 | cleaned_xpath = re.sub(r'(? 1: 119 | candidate_xpaths.append((xpath_score, xpath_element_count, xpath_pattern)) 120 | 121 | return candidate_xpaths 122 | 123 | 124 | def _get_post_frame(xpath_pattern, xpath_score, reference_text, dom): 125 | while True: 126 | new_xpath_pattern = xpath_pattern + "/.." 127 | new_xpath_score, new_xpath_element_count = assess_node(reference_content=reference_text, dom=dom, 128 | xpath=new_xpath_pattern) 129 | if new_xpath_element_count < MIN_POST_COUNT: 130 | return xpath_pattern, xpath_score 131 | 132 | xpath_pattern = new_xpath_pattern 133 | xpath_score = new_xpath_score 134 | 135 | 136 | def _get_combination_of_posts(xpath_pattern, xpath_score, xpath_element_count, reference_text, dom): 137 | """ 138 | Check if combinations of classes result in detecting leading post 139 | Args: 140 | xpath_pattern: 141 | xpath_score: 142 | xpath_element_count: 143 | reference_text: 144 | dom: 145 | 146 | Returns: 147 | Combination of classes if they resulting in a better score. Otherwise the parameters xpath_patter, xpath_score and 148 | xpath_element_count are returned. 
149 |     """ 150 |     candidate_xpaths = [] 151 |     for final_xpath in get_xpath_combinations_for_classes(xpath_pattern): 152 |         new_xpath_score, new_xpath_element_count = assess_node(reference_content=reference_text, dom=dom, 153 |                                                                xpath=final_xpath) 154 |         if (xpath_element_count < new_xpath_element_count <= xpath_element_count + 2 or 155 |                 xpath_element_count * 2 - new_xpath_element_count in range(-1, 2)) and new_xpath_score > xpath_score: 156 |             if elements_have_no_overlap(dom.xpath(final_xpath)): 157 |                 candidate_xpaths.append((new_xpath_score, new_xpath_element_count, final_xpath)) 158 | 159 |     if candidate_xpaths: 160 |         candidate_xpaths.sort() 161 |         return candidate_xpaths.pop() 162 |     return xpath_score, xpath_element_count, xpath_pattern 163 | 164 | 165 | def extract_posts(html, url): 166 |     dom = get_html_dom(html) 167 |     tree = etree.ElementTree(dom) 168 |     result = {'url': url, 'dragnet': None, 'url_xpath_pattern': None, 'xpath_pattern': None, 169 |               'xpath_score': None, 'forum_posts': None, 'date_xpath_pattern': None, 'user_xpath_pattern': None, 170 |               'text_xpath_pattern': None} 171 | 172 |     text_sections = get_cleaned_text(html) 173 |     logging.debug(f"Extracted {len(text_sections)} lines of comments.") 174 |     reference_text = " ".join(text_sections) 175 | 176 |     candidate_xpaths = _get_xpaths_candidates(text_sections, dom, tree, reference_text) 177 | 178 |     if not candidate_xpaths: 179 |         logging.warning("Couldn't identify any candidate posts for forum %s", url) 180 |         return result 181 | 182 |     # obtain anchor node 183 |     candidate_xpaths.sort() 184 |     xpath_score, xpath_element_count, xpath_pattern = candidate_xpaths.pop() 185 |     xpath_score, xpath_element_count, xpath_pattern = _remove_trailing_p_element(xpath_score, xpath_element_count, 186 |                                                                                  xpath_pattern, reference_text, dom) 187 | 188 |     xpath_pattern, xpath_score = _get_post_frame(xpath_pattern, xpath_score, reference_text, dom) 189 | 190 |     xpath_score, xpath_element_count, xpath_pattern = _get_combination_of_posts(xpath_pattern, xpath_score, 191 |                                                                                 xpath_element_count, reference_text, 192 |                                                                                 dom) 193 | 194 |     logging.info( 195 |         f"Obtained most likely forum xpath for forum {url}: {xpath_pattern} with a score of {xpath_score}.") 196 |     if xpath_pattern: 197 |         forum_posts = get_xpath_tree_text(dom, xpath_pattern) 198 |         forum_posts = remove_boilerplate(forum_posts) 199 | 200 |         result['xpath_pattern'] = xpath_pattern 201 |         result['xpath_score'] = xpath_score 202 |         result['forum_posts'] = forum_posts 203 | 204 |     if xpath_pattern: 205 |         result['text_xpath_pattern'] = get_text_xpath_pattern(dom, xpath_pattern, forum_posts) 206 | 207 |         # add the post URL 208 |         url_xpath_pattern = get_link(dom, xpath_pattern, url, forum_posts) 209 |         if url_xpath_pattern: 210 |             result['url_xpath_pattern'] = url_xpath_pattern 211 | 212 |         # add the post Date 213 |         date_xpath_pattern = get_date(dom, xpath_pattern, url, forum_posts) 214 |         if date_xpath_pattern: 215 |             result['date_xpath_pattern'] = date_xpath_pattern 216 | 217 |         # add the post user 218 |         user_xpath_pattern = get_user(dom, xpath_pattern, url, forum_posts) 219 |         if user_xpath_pattern: 220 |             result['user_xpath_pattern'] = user_xpath_pattern 221 |     return result 222 | -------------------------------------------------------------------------------- /src/harvest/metadata/username.py: -------------------------------------------------------------------------------- 1 | ''' 2 | username 3 | -------- 4 | 5 | Tries to obtain the name of the post's author 6 | ''' 7 | import logging 8 | import re 9 | import numpy as np 10 | 11 | from
harvest.config import LANGUAGES 12 | from itertools import combinations 13 | from collections import defaultdict 14 | from dateparser.search import search_dates 15 | from urllib.parse import urlparse, urljoin 16 | 17 | from harvest.utils import (get_xpath_expression, get_xpath_expression_child_filter, get_merged_xpath, 18 | get_cleaned_element_text) 19 | 20 | USER_PAGE_HINTS = ('user', 'member', 'person', 'profile') 21 | FORBIDDEN_TERMS = ('terms of use', 'privacy policy', 'add message', 'reply', 'answer', 'share', 'report', 'registered', 22 | 'setting') 23 | 24 | SCORE_INCREMENT = 1 25 | SCORE_TEXT_CHANCE_INCREMENT = 3 26 | 27 | 28 | def _set_user_hint_exits_for_attribute(matches, attribute_value): 29 | for user_hint in USER_PAGE_HINTS: 30 | if re.search(user_hint, attribute_value, re.IGNORECASE): 31 | matches['score'] += SCORE_INCREMENT 32 | return True 33 | 34 | 35 | def _set_user_hint_exits(url_candidates): 36 | for xpath, matches in [x for x in url_candidates.items()]: 37 | _set_user_hint_exits_for_attribute(matches, xpath) 38 | for match in [m.get('href') for m in matches['elements'] if m.get('href')]: 39 | if _set_user_hint_exits_for_attribute(matches, match.lower()): 40 | break 41 | 42 | 43 | def _set_text_changes(url_candidates): 44 | for xpath, matches in list(url_candidates.items()): 45 | if len(np.unique([e.text for e in matches['elements'] if e.text])) > 1: 46 | matches['score'] += SCORE_TEXT_CHANCE_INCREMENT 47 | else: 48 | text_in_sub_elements = [] 49 | for tag in [e for e in matches['elements']]: 50 | for subTag in tag.iterdescendants('span', 'div', 'b', 'strong'): 51 | if subTag.text and subTag.text not in text_in_sub_elements: 52 | text_in_sub_elements.append(subTag.text) 53 | if len(text_in_sub_elements) > 1: 54 | matches['score'] += SCORE_TEXT_CHANCE_INCREMENT 55 | 56 | 57 | def _filter_items_with_forbidden_words(url_candidates): 58 | for xpath, matches in list(url_candidates.items()): 59 | for tag in matches['elements']: 60 | if tag.text and tag.text.strip().lower() in FORBIDDEN_TERMS: 61 | del url_candidates[xpath] 62 | break 63 | 64 | 65 | def _filter_user_name_without_link_includes_date(url_candidates): 66 | for xpath, candidate in [x for x in url_candidates.items() if not x[1]['is_link']]: 67 | for element in candidate['elements']: 68 | text = element.text.strip() 69 | if search_dates(text, languages=LANGUAGES) or text in FORBIDDEN_TERMS: 70 | del url_candidates[xpath] 71 | break 72 | 73 | 74 | def _filter_user_name_without_link_and_no_text_changes(url_candidates): 75 | for xpath, candidate in [x for x in url_candidates.items() 76 | if not x[1]['is_link'] and x[1]['score'] == 0]: 77 | previous_element = None 78 | has_changed = False 79 | for element in candidate['elements']: 80 | text = element.text.strip() 81 | if previous_element is not None and previous_element.text.strip() != text: 82 | has_changed = True 83 | break 84 | previous_element = element 85 | 86 | if not has_changed and url_candidates[xpath]: 87 | del url_candidates[xpath] 88 | 89 | 90 | def _filter_more_than_one_element_per_post(url_candidates, post_elements): 91 | if len(post_elements) > 1: 92 | for xpath, candidate in [x for x in url_candidates.items()]: 93 | for post_element in post_elements: 94 | if len([x for x in post_element.iterdescendants() if x in candidate['elements']]) > 1 and \ 95 | url_candidates[xpath]: 96 | del url_candidates[xpath] 97 | break 98 | 99 | 100 | def _filter_post_links(url_candidates): 101 | for xpath, candidate in list(url_candidates.items()): 102 | sequence = 
re.findall(r"\d+", " ".join(get_cleaned_element_text(x) for x in candidate['elements'])) 103 | sequence = [int(s) for s in sequence] 104 | if len(sequence) > 2 and all(x + 1 == y for x, y in zip(sequence, sequence[1:])): 105 | del url_candidates[xpath] 106 | 107 | 108 | def _filter_post_to_candidate_length(url_candidates, posts): 109 | for xpath, matches in list(url_candidates.items()): 110 | if len(matches['elements']) > len(posts) or len(matches['elements']) < len(posts) - 2: 111 | del url_candidates[xpath] 112 | 113 | 114 | def _filter_url_other_domain(url_candidates, base_url): 115 | forum_url = urlparse(base_url) 116 | for xpath, matches in [x for x in url_candidates.items() if x[1]['is_link']]: 117 | for match in matches['elements']: 118 | parsed_url = urlparse(urljoin(base_url, match.attrib.get('href', ''))) 119 | if parsed_url.netloc and parsed_url.netloc != forum_url.netloc or parsed_url.path == forum_url.path: 120 | del url_candidates[xpath] 121 | break 122 | 123 | 124 | def _is_user_name_pattern(text): 125 | return text and text.strip() and 3 < len(text.strip()) < 100 and len( 126 | text.strip().split(" ")) <= 4 and not re.findall('http[s]?://', text) 127 | 128 | 129 | def _contains_user_name_pattern(tag): 130 | if tag.getchildren(): 131 | return _is_user_name_pattern(" ".join([get_cleaned_element_text(x) for x in tag.iterdescendants()])) 132 | return _is_user_name_pattern(tag.text) 133 | 134 | 135 | def _combine_xpath_candidates(url_candidates, number_of_posts): 136 | candidates_less_then_posts = [x for x in url_candidates.items() if len(x[1]['elements']) < number_of_posts] 137 | if number_of_posts > 1 and len(candidates_less_then_posts) > 1: 138 | valid_combinations = [] 139 | for comb in combinations(candidates_less_then_posts, 2): 140 | if len(comb[0][1]['elements']) + len(comb[1][1]['elements']) == number_of_posts: 141 | valid_combinations.append(comb) 142 | for elements1, elements2 in sorted(valid_combinations, 143 | key=lambda x: (x[0][1]['score'] + x[1][1]['score']), reverse=True): 144 | combined_xpath = elements1[0] + "|" + elements2[0] 145 | url_candidates[combined_xpath]['elements'] = elements1[1]['elements'] + elements2[1]['elements'] 146 | url_candidates[combined_xpath]['is_link'] = elements1[1]['is_link'] or elements2[1]['is_link'] 147 | url_candidates[combined_xpath]['score'] = min(elements1[1]['score'], elements2[1]['score']) 148 | break 149 | 150 | 151 | def _collect_candidates_paths(post_elements): 152 | url_candidates = defaultdict(lambda: {'elements': [], 'is_link': True, 'score': 0}) 153 | for element in post_elements: 154 | for tag in element.iterdescendants(): 155 | if ((tag.tag == 'a' and 'href' in tag.attrib and not [x for x in list(tag) if x.tag == 'time']) or 156 | (tag.tag in ['span', 'strong', 'div', 'b'] and not list(tag))) and \ 157 | _contains_user_name_pattern(tag): 158 | xpath = get_xpath_expression(tag, parent_element=element, single_class_filter=True) 159 | xpath += get_xpath_expression_child_filter(tag) 160 | url_candidates[xpath]['elements'].append(tag) 161 | if tag.tag != 'a': 162 | url_candidates[xpath]['is_link'] = False 163 | 164 | return url_candidates 165 | 166 | 167 | def get_user_name(name, base_url): 168 | ''' 169 | returns 170 | ------- 171 | A standardized representation of the user's URL. 
172 | ''' 173 | return ".".join(name.split()) + '@' + urlparse(base_url).netloc 174 | 175 | 176 | def _get_user(dom, post_elements, base_url, posts): 177 | url_candidates = _collect_candidates_paths(post_elements) 178 | 179 | for merged_xpath in get_merged_xpath(url_candidates.keys()): 180 | merged_elements = dom.xpath(merged_xpath) 181 | if merged_elements: 182 | url_candidates[merged_xpath]['elements'] = merged_elements 183 | 184 | _filter_url_other_domain(url_candidates, base_url) 185 | _filter_items_with_forbidden_words(url_candidates) 186 | _filter_user_name_without_link_includes_date(url_candidates) 187 | _filter_post_links(url_candidates) 188 | _filter_more_than_one_element_per_post(url_candidates, post_elements) 189 | 190 | _set_user_hint_exits(url_candidates) 191 | _set_text_changes(url_candidates) 192 | 193 | _combine_xpath_candidates(url_candidates, len(posts)) 194 | _filter_user_name_without_link_and_no_text_changes(url_candidates) 195 | 196 | _filter_post_to_candidate_length(url_candidates, posts) 197 | 198 | # obtain the most likely user name xpath 199 | 200 | for xpath, _ in sorted(url_candidates.items(), 201 | key=lambda x: (x[1]['is_link'], x[1]['score'], len(x[1]['elements'])), reverse=True): 202 | return xpath 203 | 204 | return None 205 | 206 | 207 | # strategy 208 | # -------- 209 | # * consider descendants as well as elements at the same level 210 | # * the number of user name candidates must be identical to the number of posts 211 | # or may have at most two elements fewer than the posts 212 | # * assign points for URLs that contain 'user', 'member', 'person', 'profile', 213 | # etc. 214 | 215 | 216 | def get_user(dom, post_xpath, base_url, posts): 217 | """ 218 | Obtains the xpath pattern that selects the user names of the given posts. 219 | 220 | Args: 221 | - dom: the forum's DOM object 222 | - post_xpath: the determined post xpath 223 | - base_url: URL of the given forum 224 | - posts: the extracted posts 225 | """ 226 | logging.info('Start finding user name') 227 | post_elements = dom.xpath(post_xpath) 228 | while True: 229 | result = _get_user(dom, post_elements, base_url, posts) 230 | if result or len(post_elements) <= 1: 231 | logging.info(f'User name xpath: {result}') 232 | return result 233 | post_xpath = post_xpath + "/.." 234 | post_elements = dom.xpath(post_xpath) 235 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /corpus/goldDocumentsPre/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "i18885220280728907651545511619171969200", 3 | "url": "https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5256", 4 | "html": "\n\n\n\nParkinsons disease Caregiver Help | Discuss Parkinson's disease: Diagnosis through Advanced Parkinson Care @MyParkinsons.org\n\n\n\n\n\n\n
\n
\n\n
\n
\n
\nFor those who care for someone with Parkinson's disease
\n
\n
\n
\n\n
\n\n\n
\n\n\n
[Home]\n[Forum]\n[Help]\n[Search]\n[Register]\n[Login]\n[Donate]\n
\nYou are not logged in\n

\n\n
\n\n\n
\n\n\n
\nTopic Coronavirus and PD?\n\nGo to previous topic\nGo to next topic\nGo to higher level\n
\n
\n\n
\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 13:17\n\n
\n
\nWe don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy junipersage\nOn 2020.03.12 15:21\n\n
\n
\nWe haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: HREF='https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/'>https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 17:31\n\n
\n
\nBless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!\n
\n\n

\n\n\n\n

\n\n
\n\n\n
© MyParkinsons.org · Published by jAess Media\n · Privacy Policy & Terms of Use
\n Sponsorship Assistance for this website and Forum has been provided by by people\n like you

\n\n
\n
\n\n\n\n\n\n", 5 | "text": " For those who care for someone with Parkinson's disease [Home] [[Forum](forum_show.pl)] [[Help](forum_help.pl)] [[Search](forum_search.pl)] [Register] [[Login](user_login.pl)] [Donate] You are not logged in Topic Coronavirus and PD? [](topic_show.pl?id=5257) [](topic_show.pl?id=5247) [](32677) [ ](#0) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 13:17 We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they? [](32678) [ ](#32677) By [junipersage](user_info.pl?id=27725) On 2020.03.12 15:21 We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/ [](32679) [ ](#32678) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 17:31 Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you! \u00a9 MyParkinsons.org \u00b7 Published by jAess Media \u00b7 Privacy Policy & Terms of Use Sponsorship Assistance for this website and Forum has been provided by by people like you", 6 | "gold_standard_annotation": [ 7 | { 8 | "post_text": { 9 | "surface_form": "We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?" 10 | }, 11 | "datetime": { 12 | "surface_form": "2020.03.12 13:17" 13 | }, 14 | "user": { 15 | "surface_form": "user_info.pl?id=16417" 16 | }, 17 | "post_link": { 18 | "surface_form": "#0" 19 | } 20 | }, 21 | { 22 | "post_text": { 23 | "surface_form": "We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. 
Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/" 24 | }, 25 | "datetime": { 26 | "surface_form": "2020.03.12 15:21" 27 | }, 28 | "user": { 29 | "surface_form": "user_info.pl?id=27725" 30 | }, 31 | "post_link": { 32 | "surface_form": "#32677" 33 | } 34 | }, 35 | { 36 | "post_text": { 37 | "surface_form": "Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!" 38 | }, 39 | "datetime": { 40 | "surface_form": "2020.03.12 17:31" 41 | }, 42 | "user": { 43 | "surface_form": "user_info.pl?id=16417" 44 | }, 45 | "post_link": { 46 | "surface_form": "#32678" 47 | } 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /corpus/goldDocuments/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "i18885220280728907651545511619171969200", 3 | "url": "https://myparkinsons.org/cgi-bin/forum/topic_show.pl?id=5256", 4 | "html": "\n\n\n\nParkinsons disease Caregiver Help | Discuss Parkinson's disease: Diagnosis through Advanced Parkinson Care @MyParkinsons.org\n\n\n\n\n\n\n
\n
\n\n
\n
\n
\nFor those who care for someone with Parkinson's disease
\n
\n
\n
\n
\n
\n\n\n
\n\n\n
[Home]\n[Forum]\n[Help]\n[Search]\n[Register]\n[Login]\n[Donate]\n
\nYou are not logged in\n

\n\n
\n\n\n
\n\n\n
\nTopic Coronavirus and PD?\n\nGo to previous topic\nGo to next topic\nGo to higher level\n
\n
\n\n
\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 13:17\n\n
\n
\nWe don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy junipersage\nOn 2020.03.12 15:21\n\n
\n
\nWe haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: HREF='https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/'>https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/\n
\n\n

\n\n\n\n\n\n
\n\n\n
\n\nBy jcoff012\nOn 2020.03.12 17:31\n\n
\n
\nBless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!\n
\n\n

\n\n\n\n

\n\n
\n\n\n
© MyParkinsons.org · Published by jAess Media\n · Privacy Policy & Terms of Use
\n Sponsorship Assistance for this website and Forum has been provided by by people\n like you

\n\n
\n
\n\n\n\n\n\n", 5 | "text": " For those who care for someone with Parkinson's disease [Home] [[Forum](forum_show.pl)] [[Help](forum_help.pl)] [[Search](forum_search.pl)] [Register] [[Login](user_login.pl)] [Donate] You are not logged in Topic Coronavirus and PD? [](topic_show.pl?id=5257) [](topic_show.pl?id=5247) [](32677) [ ](#0) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 13:17 We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they? [](32678) [ ](#32677) By [junipersage](user_info.pl?id=27725) On 2020.03.12 15:21 We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/ [](32679) [ ](#32678) By [jcoff012](user_info.pl?id=16417) On 2020.03.12 17:31 Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you! \u00a9 MyParkinsons.org \u00b7 Published by jAess Media \u00b7 Privacy Policy & Terms of Use Sponsorship Assistance for this website and Forum has been provided by by people like you", 6 | "gold_standard_annotation": [ 7 | { 8 | "post_text": { 9 | "surface_form": "We don\u0092t see the neurologist til March 23, so I was wondering if anyone has any feedback from their\u0092s? As a diabetic with lymphedema, my primary said last Friday that I should not fly or be in large crowds for awhile. Seems that a PWP should have similar concerns, but do they?", 10 | "start": 362, 11 | "end": 639 12 | }, 13 | "datetime": { 14 | "surface_form": "2020.03.12 13:17", 15 | "start": 345, 16 | "end": 361 17 | }, 18 | "user": { 19 | "surface_form": "user_info.pl?id=16417", 20 | "start": 319, 21 | "end": 340 22 | }, 23 | "post_link": { 24 | "surface_form": "#0", 25 | "start": 301, 26 | "end": 303 27 | } 28 | }, 29 | { 30 | "post_text": { 31 | "surface_form": "We haven't spoken to the neurologist, but I work in a county public health department and have been actively working on COVID-19. What we are telling people is that Parkinson's is not necessarily as strong a risk factor as other conditions (particularly lung conditions like asthma and COPD, and or heart conditions.) But even so, the virus is most severe in people over 60, and the risk increases with age. Also, any underlying condition that can make people more frail could be a problem. So its probably a good idea for PWP to take extra caution. 
Here's a webpage with more info: https://www.apdaparkinson.org/article/covid-19-overview-for-pd-community/", 32 | "start": 722, 33 | "end": 1378 34 | }, 35 | "datetime": { 36 | "surface_form": "2020.03.12 15:21", 37 | "start": 705, 38 | "end": 721 39 | }, 40 | "user": { 41 | "surface_form": "user_info.pl?id=27725", 42 | "start": 679, 43 | "end": 700 44 | }, 45 | "post_link": { 46 | "surface_form": "#32677", 47 | "start": 654, 48 | "end": 660 49 | } 50 | }, 51 | { 52 | "post_text": { 53 | "surface_form": "Bless you! This is wonderfully helpful. I have a sore throat and upset stomach, so we decided my husband would go to the Taekwondo studio with our grandson today. I am not concerned that this is Coronavirus, but feel strongly about a lower resistance now. He is my PWP, but after talking this out, he said he\u0092ll stay away from the other parents and staff. He read the article, too. Thank you!", 54 | "start": 1458, 55 | "end": 1850 56 | }, 57 | "datetime": { 58 | "surface_form": "2020.03.12 17:31", 59 | "start": 1441, 60 | "end": 1457 61 | }, 62 | "user": { 63 | "surface_form": "user_info.pl?id=16417", 64 | "start": 1415, 65 | "end": 1436 66 | }, 67 | "post_link": { 68 | "surface_form": "#32678", 69 | "start": 1393, 70 | "end": 1399 71 | } 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /tests/integration/harvest/test_extract_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from json import load 3 | 4 | import pytest 5 | from fuzzywuzzy import fuzz 6 | 7 | from harvest import extract_data 8 | 9 | 10 | # @Todo lead post not detected-> test_forum_healthunlocked 11 | # @Todo not recognized because of inscriptis -> test_forum_proxer 12 | # @Todo lead post not detected -> test_forum_shift_ms 13 | # @Todo lead post not detected- -> test_forum_medhelp 14 | # @Todo text not recognized -> test_forum_medschat 15 | # @Todo text not recognized because to many other threads recommendations -> test_forum_paradisi 16 | 17 | @pytest.fixture 18 | def compare(): 19 | def _compare(gold_annotations, response, ignored_element=[], ratio=95): 20 | for index, gold_annotation in enumerate(gold_annotations, start=0): 21 | for element in gold_annotation: 22 | if element not in ignored_element: 23 | if element == 'post_text': 24 | assert fuzz.ratio(gold_annotation[element]['surface_form'], 25 | response[index][element]) > ratio 26 | else: 27 | assert gold_annotation[element]['surface_form'] == response[index][element] 28 | 29 | return _compare 30 | 31 | 32 | @pytest.fixture 33 | def remove_index(): 34 | def _remove_index(response, indexes_to_remove): 35 | final_response = [] 36 | for index, response_element in enumerate(response, start=0): 37 | if index not in indexes_to_remove: 38 | final_response.append(response_element) 39 | return final_response 40 | 41 | return _remove_index 42 | 43 | 44 | @pytest.fixture 45 | def load_test_data(): 46 | def _load_test_data(file_name): 47 | file_path = os.path.join(os.path.dirname(__file__), '../../../corpus/goldDocuments', file_name) 48 | with open(file_path) as f: 49 | return load(f) 50 | 51 | return _load_test_data 52 | 53 | 54 | def test_forum_angelman(load_test_data, compare): 55 | forum_test_data = load_test_data("blog.angelman-asa.org.read.php.json") 56 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 57 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 58 | 59 | 60 | def 
test_forum_bpdfamily(load_test_data, compare): 61 | forum_test_data = load_test_data("bpdfamily.com.message_board.index.php.json") 62 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 63 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link']) 64 | 65 | 66 | def test_forum_bitdefender(load_test_data, compare): 67 | forum_test_data = load_test_data( 68 | "community.bitdefender.com.en.discussion.82059.i-noticed-that-the-bitdefender-process-can-be-easily-killed.json") 69 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 70 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 71 | 72 | 73 | def test_forum_kaspersky(load_test_data, compare): 74 | forum_test_data = load_test_data( 75 | "community.kaspersky.com.kaspersky-security-cloud-11.rootkit-scan-not-executed-6849.json") 76 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 77 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 78 | 79 | 80 | def test_forum_community_scope(load_test_data, compare): 81 | forum_test_data = load_test_data("community.scope.org.uk.discussion.68941.disabled-mum.json") 82 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 83 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 84 | 85 | 86 | def test_forum_digitalfernsehen(load_test_data, compare): 87 | forum_test_data = load_test_data("forum.digitalfernsehen.de.threads.df-hilferuf.416785..json") 88 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 89 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 90 | 91 | 92 | def test_forum_ebaumsworld(load_test_data, compare): 93 | forum_test_data = load_test_data("forum.ebaumsworld.com.viewtopic.php.42095.json") 94 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 95 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 96 | 97 | 98 | def test_forum_glamour(load_test_data, compare): 99 | forum_test_data = load_test_data("forum.glamour.de.t.designertaschen-laber-laber.18136.json") 100 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 101 | compare(forum_test_data['gold_standard_annotation'], response, []) 102 | 103 | 104 | def test_forum_mein_schoener_garten(load_test_data, compare): 105 | forum_test_data = load_test_data("forum.mein-schoener-garten.de.viewtopic.php.4825193.json") 106 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 107 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 108 | 109 | 110 | def test_forum_nationstates(load_test_data, compare): 111 | forum_test_data = load_test_data("forum.nationstates.net.viewtopic.php.419.json") 112 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 113 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 114 | 115 | 116 | def test_forum_openoffice(load_test_data, compare): 117 | forum_test_data = load_test_data("forum.openoffice.org.en.forum.viewtopic.php.json") 118 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 119 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 120 | 121 | 122 | def test_forum_statcounter(load_test_data, compare): 123 | forum_test_data = 
load_test_data("forum.statcounter.com.threads.best-android-apps-in-uk-2019.79812..json") 124 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 125 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link']) 126 | 127 | 128 | def test_forum_ubuntuusers(load_test_data, compare): 129 | forum_test_data = load_test_data("forum.ubuntuusers.de.topic.appimage-programm-in-alle-programme-als-icon-a..json") 130 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 131 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link'], ratio=78) 132 | 133 | 134 | def test_forum_utorrent(load_test_data, compare): 135 | forum_test_data = load_test_data("forum.utorrent.com.topic.23012-check-on-startup..json") 136 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 137 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 138 | 139 | 140 | def test_forum_videolan(load_test_data, compare): 141 | forum_test_data = load_test_data("forum.videolan.org.viewtopic.php.json") 142 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 143 | compare(forum_test_data['gold_standard_annotation'], response, []) 144 | 145 | 146 | def test_forum_wordreference(load_test_data, compare): 147 | forum_test_data = load_test_data("forum.wordreference.com.threads.attuned-to-the-reiki-symbols.3691417..json") 148 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 149 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 150 | 151 | 152 | def test_forum_worldofplayers(load_test_data, compare): 153 | forum_test_data = load_test_data( 154 | "forum.worldofplayers.de.forum.threads.1548322-Welchen-Blog-benutzt-man-in-2020.json") 155 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 156 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 157 | 158 | 159 | def test_forum_futura_sciences(load_test_data, compare, remove_index): 160 | forum_test_data = load_test_data("forums.futura-sciences.com.annonces-officielles.78761-moderateurs.html.json") 161 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 162 | # Remove the advertisement slots 163 | response = remove_index(response, [1, 6, 8, 15, 22, 29]) 164 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 165 | 166 | 167 | def test_forum_macrumors(load_test_data, compare): 168 | forum_test_data = load_test_data("forums.macrumors.com.threads.se-or-11.2231616..json") 169 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 170 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 171 | 172 | 173 | def test_forum_maladiesraresinfo(load_test_data, compare): 174 | forum_test_data = load_test_data("forums.maladiesraresinfo.org.post11011.html.json") 175 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 176 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 177 | 178 | 179 | def test_forum_moneysavingexpert(load_test_data, compare): 180 | forum_test_data = load_test_data( 181 | "forums.moneysavingexpert.com.discussion.6100693.how-do-0-credit-card-balances-work-when-you-have-borrowed-twice.json") 182 | response = extract_data(forum_test_data['html'], 
forum_test_data['url'])['posts'] 183 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 184 | 185 | 186 | def test_forum_sherdog(load_test_data, compare): 187 | forum_test_data = load_test_data("forums.sherdog.com.threads.all-time-goat-poll.3916359..json") 188 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 189 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 190 | 191 | 192 | def test_forum_kiwifarms(load_test_data, compare): 193 | forum_test_data = load_test_data( 194 | "kiwifarms.net.threads.the-twitter-pedo-hunter-loli-crusader-community.64404..json") 195 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 196 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 197 | 198 | 199 | def test_forum_myparkinsons(load_test_data, compare, remove_index): 200 | forum_test_data = load_test_data("myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json") 201 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 202 | # Remove header that looks exactly like the posts 203 | response = remove_index(response, [0, 1]) 204 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link'], ratio=90) 205 | 206 | 207 | def test_forum_skyscraperpage(load_test_data, compare): 208 | forum_test_data = load_test_data("skyscraperpage.com.forum.showthread.php.json") 209 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 210 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 211 | 212 | 213 | def test_forum_collegeconfidential(load_test_data, compare): 214 | forum_test_data = load_test_data( 215 | "talk.collegeconfidential.com.student-here-ask-me-anything.2183693-got-into-nyu-pre-med-intention-ask-me-anything.html.json") 216 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 217 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 218 | 219 | 220 | def test_forum_uhrforum(load_test_data, compare): 221 | forum_test_data = load_test_data("uhrforum.de.threads.der-yema-fotothread-und-nicht-nur-das.414009..json") 222 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 223 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user']) 224 | 225 | 226 | def test_forum_blizzard(load_test_data, compare): 227 | forum_test_data = load_test_data( 228 | "us.forums.blizzard.com.en.wow.t.can-i-transfer-back-to-locked-server-if-i-have-existing-character.505388.json") 229 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 230 | compare(forum_test_data['gold_standard_annotation'], response) 231 | 232 | 233 | def test_forum_airliners(load_test_data, compare): 234 | forum_test_data = load_test_data("www.airliners.net.forum.viewtopic.php.1428699.json") 235 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 236 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime'], ratio=85) 237 | 238 | 239 | def test_forum_amsel(load_test_data, compare): 240 | forum_test_data = load_test_data("www.amsel.de.multiple-sklerose-forum..json") 241 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 242 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 243 | 244 | 245 | def 
test_forum_android_hilfe(load_test_data, compare): 246 | forum_test_data = load_test_data( 247 | "www.android-hilfe.de.forum.samsung-allgemein.423.faq-diskussion-zum-kauf-samsung-galaxy-s10-s10e-s10-snapdragon-variante.904645.html.json") 248 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 249 | compare(forum_test_data['gold_standard_annotation'], response, ['user']) 250 | 251 | 252 | def test_forum_computerbase(load_test_data, compare): 253 | forum_test_data = load_test_data( 254 | "www.computerbase.de.forum.threads.ram-empfehlung-fuer-ryzen.1940441..json") 255 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 256 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime'], ratio=85) 257 | 258 | 259 | def test_forum_drwindows(load_test_data, compare): 260 | forum_test_data = load_test_data( 261 | "www.drwindows.de.windows-7-allgemein.16340-zufall-entdeckte-problemlsungen.html.json") 262 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 263 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link'], ratio=89) 264 | 265 | 266 | def test_forum_fanfiction(load_test_data, compare): 267 | forum_test_data = load_test_data( 268 | "www.fanfiction.net.topic.146535.108548484.1.The-About-the-World-Topic.json") 269 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 270 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user'], ratio=75) 271 | 272 | 273 | def test_forum_gtplanet(load_test_data, compare): 274 | forum_test_data = load_test_data("www.gtplanet.net.forum.threads.f1-2018-general-discussion.378195..json") 275 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 276 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 277 | 278 | 279 | def test_forum_hifi(load_test_data, compare): 280 | forum_test_data = load_test_data("www.hifi-forum.de.viewthread-84-87.html.json") 281 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 282 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 283 | 284 | 285 | def test_forum_juraforum(load_test_data, compare): 286 | forum_test_data = load_test_data( 287 | "www.juraforum.de.forum.t.fahrtkostenerstattung-bei-falschen-rezepten.675629..json") 288 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 289 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'user', 'post_link']) 290 | 291 | 292 | def test_forum_med1(load_test_data, compare): 293 | forum_test_data = load_test_data( 294 | "www.med1.de.forum.beruf-alltag-und-umwelt.corona-eine-gehypde-apokalypse-972190..json") 295 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 296 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link']) 297 | 298 | 299 | def test_forum_msworld(load_test_data, compare): 300 | forum_test_data = load_test_data("www.msworld.org.forum.showthread.php.json") 301 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 302 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 303 | 304 | 305 | def test_forum_msconnection(load_test_data, compare): 306 | forum_test_data = load_test_data("www.msconnection.org.Discussions.f27.t79421.tp1.Does-this-sound-like-MS.json") 307 | response = 
extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 308 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime']) 309 | 310 | 311 | def test_forum_mumsnet(load_test_data, compare): 312 | forum_test_data = load_test_data( 313 | "www.mumsnet.com.Talk.pregnancy.3749275-Pregnant-with-a-black-mixed-race-with-black-baby.json") 314 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 315 | compare(forum_test_data['gold_standard_annotation'], response) 316 | 317 | 318 | def test_forum_musiker_board(load_test_data, compare): 319 | forum_test_data = load_test_data( 320 | "www.musiker-board.de.threads.baubericht-0-14-ital-fichte-palisander.689167..json") 321 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 322 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime'], ratio=93) 323 | 324 | 325 | def test_forum_nairaland(load_test_data, compare): 326 | forum_test_data = load_test_data("www.nairaland.com.5812914.akeredolu-rejects-plot-impeach-deputy.json") 327 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 328 | compare(forum_test_data['gold_standard_annotation'], response, ['datetime', 'post_link']) 329 | 330 | 331 | def test_forum_neowin(load_test_data, compare): 332 | forum_test_data = load_test_data("www.neowin.net.forum.topic.1391546-hello-im-dion..json") 333 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 334 | compare(forum_test_data['gold_standard_annotation'], response, ['user']) 335 | 336 | 337 | def test_forum_pistonheads(load_test_data, compare): 338 | forum_test_data = load_test_data("www.pistonheads.com.gassing.topic.asp.1858583.json") 339 | response = extract_data(forum_test_data['html'], forum_test_data['url'])['posts'] 340 | compare(forum_test_data['gold_standard_annotation'], response, ['post_link', 'datetime']) 341 | -------------------------------------------------------------------------------- /corpus/goldDocumentsPre/forum.ubuntuusers.de.topic.appimage-programm-in-alle-programme-als-icon-a..json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "i239851449887439968677754102219662807017", 3 | "url": "https://forum.ubuntuusers.de/topic/appimage-programm-in-alle-programme-als-icon-a/", 4 | "html": "\n\n\n\n\n\n\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n\n Appimage Programm in \"alle Programme\" als Icon anzeigen? \u203a GNOME (Ubuntu ab 17.10) \u203a Grafische Oberfl\u00e4che \u203a Forum \u203a ubuntuusers.de\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \n \n \n \n \n\n \n \n\n \n \n\n \n \n \n \n\n
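Usage sketch (illustrative, not part of the repository): a minimal example of how the extraction API exercised by the integration tests above can be applied to one of the gold documents. It assumes the harvest package from src/ is installed and that the script is run from the repository root.

from json import load

from harvest import extract_data

# Load one of the gold documents shipped with the corpus.
with open('corpus/goldDocuments/myparkinsons.org.cgi-bin.forum.topic_show.pl.5256.json') as f:
    gold = load(f)

# extract_data() returns the extracted posts under the 'posts' key; each post
# carries the 'post_text', 'datetime', 'user' and 'post_link' fields that the
# integration tests compare against the 'gold_standard_annotation' entries.
posts = extract_data(gold['html'], gold['url'])['posts']
for post in posts:
    print(post['user'], post['datetime'], post['post_text'][:60])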
\n

ubuntuusers.de

\n \n
\n\n \n\n \n \n
\n \n \n\n \n \n \n \n \n \n\n \n\n via DuckDuckGo\n
\n \n\n \n
\n \n
\n\n
\n \n \n
\n\n\n \n\n \n \n \n\n \n\n
\n \n\n \n\n \n
\n

Appimage Programm in "alle Programme" als Icon anzeigen?

\n
\n
\u00ab Vorherige1N\u00e4chste \u00bb\n
\n Status:\n \n Ungel\u00f6st\n \n |\n \n Ubuntu-Version:\n Ubuntu 20.04 (Focal Fossa)\n \n
\n \n \n Antworten |\n \n \n\n
\n
\n\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
\n

\n Liane\n \n

\n

Anmeldungsdatum:
11. April 2009

\n

Beitr\u00e4ge: 457

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 10:23\n \n (zuletzt bearbeitet: 14. Juni 2020 10:41)\n \n
\n \n
\n

Hallo zusammen,

ich habe ein Programm, dass als Appimage zu starten ist.

Ich w\u00fcrde gerne dieses Programm wie die anderen Anwendungen auch auf dem \nDashboard (hoffe ich habe das Men\u00fc richtig beschrieben) mittels "alle Programme"\nsehen k\u00f6nnen.

Der Umweg \u00fcber den Ordner suchen und das Programm darin zu starten ist umst\u00e4ndlich.

Danke!

bye\nLiane

Moderiert von Taomon:

Dieses Thema ist verschoben worden. Bitte beachte die als wichtig markierten Themen (\u201eWelche Themen geh\u00f6ren hier her und welche nicht?\u201c)!

\n

\n
\n
\n

\n fleet_street\n \n

\n

Anmeldungsdatum:
30. August 2016

\n

Beitr\u00e4ge: 1046

\n

Wohnort: Hunsr\u00fcck

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 10:42\n \n
\n \n
\n

Was du ben\u00f6tigst, ist ein Starter \u2192 .desktop-Dateien

\n
\n
\n

\n Liane\n \n

\n
(Themenstarter)
\n

Anmeldungsdatum:
11. April 2009

\n

Beitr\u00e4ge: 457

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 17:56\n \n
\n \n
\n

danke f\u00fcr die Antwort. Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann?

DANKE

\n
\n
\n

\n Tut-tut\n \n

\n \n \"Avatar\n\n

Anmeldungsdatum:
24. August 2012

\n

Beitr\u00e4ge: 990

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 14. Juni 2020 20:24\n \n
\n \n
\n

Da brauchst du bestimmt keine Software!

Mein Appimage ist Moneyplex:\nBei mir habe ich die Startdatei im Ordner mit der rechten Maustaste angeklickt und diese "ausf\u00fchrbar" in den Eigenschaften gemacht. Danach war das Problem wie du es beschreibst: L\u00f6sung siehe https://forum.ubuntuusers.de/topic/starter-auf-dem-desktop-zu-favoriten-hinzufueg/

\n
\n
\n

\n fleet_street\n \n

\n

Anmeldungsdatum:
30. August 2016

\n

Beitr\u00e4ge: 1046

\n

Wohnort: Hunsr\u00fcck

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 15. Juni 2020 21:57\n \n
\n \n
\n

Liane schrieb:

\u2026 Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann?

\nDa erinnerst du dich vielleicht an dein Linux Mint Cinnamon.

\n
\n
\n

\n Bleys\n \n

\n \n \"Avatar\n\n

Anmeldungsdatum:
13. August 2006

\n

Beitr\u00e4ge: 5175

\n

Wohnort: Essen, NRW

\n
\n
\n
\n Zitieren\n
\n \n \"Beitrag\"\n 16. Juni 2020 01:23\n \n
\n \n
\n
1
sudo apt install menulibre\n
\n

In der \u00dcbersicht hei\u00dft die Anwendung "Men\u00fcbearbeitung". Starten, die passende Kategorie ausw\u00e4hlen, oben links auf das Plus klicken.

\n
\n
\n\n
\n
\n
\u00ab Vorherige1N\u00e4chste \u00bb\n
\n \n \n Antworten |\n \n \n\n
\n
\n \n\n \n\n \n\n
\n
    \n
  • \n Powered by Inyoka\n \n
    \n \n Inyoka v0.22.1\n \n \n
  • \n
  • \n \ud83c\udd2f 2004 \u2013 2020 ubuntuusers.de \u2022 Einige Rechte vorbehalten
    \n Lizenz \u2022\n Kontakt \u2022\n Datenschutz \u2022\n Impressum \u2022\n Serverstatus\n
  • \n
  • \n Serverhousing gespendet von
    \n \"noris\n \"anexia\"\n
  • \n
\n
\n\n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n", 5 | "text": "[Zum Hauptinhalt springen](#main) [Zur Seitenleiste springen](#sidebar) * Bitte aktiviere JavaScript! Anmelden Registrieren ubuntuusers.de PortalForumWikiIkhayaPlanetMitmachen via DuckDuckGo * Filter + Neue Beitr\u00e4ge nur hier + Unbeantwortete Themen nur hier + Ungel\u00f6ste Themen nur hier + 24 Stunden nur hier + 12 Stunden nur hier + 6 Stunden nur hier 1. Forum 2. Grafische Oberfl\u00e4che 3. GNOME (Ubuntu ab 17.10) 4. Appimage Programm in \"alle Programme\" als Icon anzeigen? Appimage Programm in \"alle Programme\" als Icon anzeigen? \u00ab Vorherige 1 N\u00e4chste \u00bb Status: Ungel\u00f6st | Ubuntu-Version: Ubuntu 20.04 (Focal Fossa) Antworten | Zitieren [Liane](https://ubuntuusers.de/user/Liane/) [ ](https://forum.ubuntuusers.de/post/9165689/) 14. Juni 2020 10:23 (zuletzt bearbeitet: 14. Juni 2020 10:41) Hallo zusammen, ich habe ein Programm, dass als Appimage zu starten ist. Ich w\u00fcrde gerne dieses Programm wie die anderen Anwendungen auch auf dem Dashboard (hoffe ich habe das Men\u00fc richtig beschrieben) mittels \"alle Programme\" sehen k\u00f6nnen. Der Umweg \u00fcber den Ordner suchen und das Programm darin zu starten ist umst\u00e4ndlich. Danke! bye Liane Moderiert von Taomon: Dieses Thema ist verschoben worden. Bitte beachte die als wichtig markierten Themen (\u201eWelche Themen geh\u00f6ren hier her und welche nicht?\u201c)! Zitieren [fleet_street](https://ubuntuusers.de/user/fleet_street/) [ ](https://forum.ubuntuusers.de/post/9165696/) 14. Juni 2020 10:42 Was du ben\u00f6tigst, ist ein Starter \u2192 .desktop-Dateien Anmeldungsdatum: 30. August 2016 Beitr\u00e4ge: 1046 Wohnort: Hunsr\u00fcck Zitieren [Liane](https://ubuntuusers.de/user/Liane/) [ ](https://forum.ubuntuusers.de/post/9165847/) 14. Juni 2020 17:56 (Themenstarter) danke f\u00fcr die Antwort. Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? DANKE Beitr\u00e4ge: 457 Zitieren [Tut-tut](https://ubuntuusers.de/user/Tut-tut/) [ ](https://forum.ubuntuusers.de/post/9165916/) 14. Juni 2020 20:24 Da brauchst du bestimmt keine Software! Mein Appimage ist Moneyplex: Bei mir habe ich die Startdatei im Ordner mit der rechten Maustaste angeklickt und diese \"ausf\u00fchrbar\" in den Eigenschaften gemacht. Danach war das Problem wie du es beschreibst: L\u00f6sung siehe https://forum.ubuntuusers.de/topic/starter-auf-dem-desktop-zu-favoriten-hinzufueg/ Beitr\u00e4ge: 990 Zitieren [fleet_street](https://ubuntuusers.de/user/fleet_street/) [ ](https://forum.ubuntuusers.de/post/9166263/) 15. Juni 2020 21:57 Liane schrieb: \u2026 Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? Da erinnerst du dich vielleicht an dein Linux Mint Cinnamon. Wohnort: Hunsr\u00fcck Zitieren [Bleys](https://ubuntuusers.de/user/Bleys/) [ ](https://forum.ubuntuusers.de/post/9166282/) 16. Juni 2020 01:23 1 sudo apt install menulibre In der \u00dcbersicht hei\u00dft die Anwendung \"Men\u00fcbearbeitung\". Starten, die passende Kategorie ausw\u00e4hlen, oben links auf das Plus klicken. Beitr\u00e4ge: 5175 Wohnort: Essen, NRW \u00ab Vorherige 1 N\u00e4chste \u00bb Antworten | \u00ab Vorheriges Thema N\u00e4chstes Thema \u00bb 1. Forum 2. Grafische Oberfl\u00e4che 3. GNOME (Ubuntu ab 17.10) 4. Appimage Programm in \"alle Programme\" als Icon anzeigen? 
* Powered by Inyoka Inyoka v0.22.1 * \ud83c\udd2f 2004 \u2013 2020 ubuntuusers.de \u2022 Einige Rechte vorbehalten Lizenz \u2022 Kontakt \u2022 Datenschutz \u2022 Impressum \u2022 Serverstatus * Serverhousing gespendet von ", 6 | "gold_standard_annotation": [ 7 | { 8 | "post_text": { 9 | "surface_form": "Hallo zusammen, ich habe ein Programm, dass als Appimage zu starten ist. Ich w\u00fcrde gerne dieses Programm wie die anderen Anwendungen auch auf dem Dashboard (hoffe ich habe das Men\u00fc richtig beschrieben) mittels \"alle Programme\" sehen k\u00f6nnen. Der Umweg \u00fcber den Ordner suchen und das Programm darin zu starten ist umst\u00e4ndlich. Danke!" 10 | }, 11 | "datetime": { 12 | "surface_form": "14. Juni 2020 10:23" 13 | }, 14 | "user": { 15 | "surface_form": "https://ubuntuusers.de/user/Liane/" 16 | }, 17 | "post_link": { 18 | "surface_form": "https://forum.ubuntuusers.de/post/9165689/" 19 | } 20 | }, 21 | { 22 | "post_text": { 23 | "surface_form": "Was du ben\u00f6tigst, ist ein Starter \u2192 .desktop-Dateien" 24 | }, 25 | "datetime": { 26 | "surface_form": "14. Juni 2020 10:42" 27 | }, 28 | "user": { 29 | "surface_form": "https://ubuntuusers.de/user/fleet_street/" 30 | }, 31 | "post_link": { 32 | "surface_form": "https://forum.ubuntuusers.de/post/9165696/" 33 | } 34 | }, 35 | { 36 | "post_text": { 37 | "surface_form": "danke f\u00fcr die Antwort. Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? DANKE" 38 | }, 39 | "datetime": { 40 | "surface_form": "14. Juni 2020 17:56" 41 | }, 42 | "user": { 43 | "surface_form": "https://ubuntuusers.de/user/Liane/" 44 | }, 45 | "post_link": { 46 | "surface_form": "https://forum.ubuntuusers.de/post/9165847/" 47 | } 48 | }, 49 | { 50 | "post_text": { 51 | "surface_form": "Da brauchst du bestimmt keine Software! Mein Appimage ist Moneyplex: Bei mir habe ich die Startdatei im Ordner mit der rechten Maustaste angeklickt und diese \"ausf\u00fchrbar\" in den Eigenschaften gemacht. Danach war das Problem wie du es beschreibst: L\u00f6sung siehe https://forum.ubuntuusers.de/topic/starter-auf-dem-desktop-zu-favoriten-hinzufueg/" 52 | }, 53 | "datetime": { 54 | "surface_form": "14. Juni 2020 20:24" 55 | }, 56 | "user": { 57 | "surface_form": "https://ubuntuusers.de/user/Tut-tut/" 58 | }, 59 | "post_link": { 60 | "surface_form": "https://forum.ubuntuusers.de/post/9165916/" 61 | } 62 | }, 63 | { 64 | "post_text": { 65 | "surface_form": "Liane schrieb: \u2026 Erinnerlich, gibt es eine Software, mit der man mittels GUI diesen Starter erstellen kann? Da erinnerst du dich vielleicht an dein Linux Mint Cinnamon." 66 | }, 67 | "datetime": { 68 | "surface_form": "15. Juni 2020 21:57" 69 | }, 70 | "user": { 71 | "surface_form": "https://ubuntuusers.de/user/fleet_street/" 72 | }, 73 | "post_link": { 74 | "surface_form": "https://forum.ubuntuusers.de/post/9166263/" 75 | } 76 | }, 77 | { 78 | "post_text": { 79 | "surface_form": "1 sudo apt install menulibre In der \u00dcbersicht hei\u00dft die Anwendung \"Men\u00fcbearbeitung\". Starten, die passende Kategorie ausw\u00e4hlen, oben links auf das Plus klicken." 80 | }, 81 | "datetime": { 82 | "surface_form": "16. Juni 2020 01:23" 83 | }, 84 | "user": { 85 | "surface_form": "https://ubuntuusers.de/user/Bleys/" 86 | }, 87 | "post_link": { 88 | "surface_form": "https://forum.ubuntuusers.de/post/9166282/" 89 | } 90 | } 91 | ] 92 | } --------------------------------------------------------------------------------