├── tests ├── py.test.ini ├── __init__.py ├── test_install.py ├── fixtures │ ├── activity_endpoints │ │ ├── https%3A%2F%2Fwww.linkedin.com%2Fcountserv%2Fcount%2Fshare%3Furl%3Dhttps%3A%2F%2Ftheintercept.com%2F2016%2F11%2F26%2Flaura-ingraham-lifezette%2F%2F%26format%3Djson.json │ │ ├── https%3A%2F%2Fgraph.facebook.com%2F%3Fid%3Dhttps%3A%2F%2Ftheintercept.com%2F2016%2F11%2F26%2Flaura-ingraham-lifezette%2F.json │ │ └── https%3A%2F%2Fbuttons.reddit.com%2Fbutton_info.json%3Furl%3Dhttps%3A%2F%2Ftheintercept.com%2F2016%2F11%2F26%2Flaura-ingraham-lifezette%2F.json │ ├── invalid │ │ └── invalid.html │ ├── netzpolitik.org │ │ └── index.html │ └── businessinsider.com │ │ └── dropbox-vp-todd-jackson-leaves-for-first-round-capital-2018-4.html ├── test_extract.py ├── test_pos.py ├── test_domain.py ├── test_social.py ├── test_module.py └── test_html.py ├── metadoc ├── extract │ ├── __init__.py │ ├── ner.py │ ├── extractor.py │ ├── html.py │ └── pos.py ├── social │ ├── __init__.py │ ├── providers.py │ └── activity.py ├── domain │ ├── __init__.py │ ├── check.py │ ├── lookup.py │ ├── domaintools.py │ └── blacklists.py ├── install.py └── __init__.py ├── requirements-dev.txt ├── MANIFEST.in ├── .gitignore ├── requirements.txt ├── .travis.yml ├── LICENSE.md ├── serve.py ├── setup.py └── README.md /tests/py.test.ini: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /metadoc/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractor import Extractor -------------------------------------------------------------------------------- /metadoc/social/__init__.py: -------------------------------------------------------------------------------- 1 | from .activity import ActivityCount -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | asynctest==0.9.0 3 | pytest==3.0.5 4 | pytest-cov==2.4.0 5 | 6 | -------------------------------------------------------------------------------- /metadoc/domain/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from .domaintools import Domaintools 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include requirements-dev.txt 3 | include README.md 4 | recursive-include metadoc/extract/data * 5 | #global-exclude *.zip 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .cache 3 | .coverage 4 | **/__pycache__ 5 | **.pickle 6 | *.egg 7 | *.egg-info 8 | *-sdist 9 | dist 10 | venv 11 | htmlcov 12 | *.swp 13 | venv36 14 | metadoc/extract/data/* 15 | .pytest_cache -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 
aiohttp==1.1.5 2 | bottle==0.12.10 3 | python-dateutil==2.6.1 4 | jmespath==0.9.0 5 | langdetect==1.0.7 6 | goose3==3.0.9 7 | nltk==3.2.1 8 | numpy==1.13.3 9 | requests==2.18.4 10 | tldextract==2.0.2 11 | whois==0.7 12 | -------------------------------------------------------------------------------- /tests/test_install.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import asynctest 3 | from metadoc.install import install_nltk_sets 4 | 5 | class MetadocInstallTest(asynctest.TestCase): 6 | def setUp(self): 7 | return 8 | 9 | @asynctest.ignore_loop 10 | def test_install(self): 11 | install_nltk_sets() 12 | -------------------------------------------------------------------------------- /tests/fixtures/activity_endpoints/https%3A%2F%2Fwww.linkedin.com%2Fcountserv%2Fcount%2Fshare%3Furl%3Dhttps%3A%2F%2Ftheintercept.com%2F2016%2F11%2F26%2Flaura-ingraham-lifezette%2F%2F%26format%3Djson.json: -------------------------------------------------------------------------------- 1 | {"count":76,"fCnt":"76","fCntPlusOne":"77","url":"https:\/\/theintercept.com\/2016\/11\/26\/laura-ingraham-lifezette\/\/"} -------------------------------------------------------------------------------- /metadoc/domain/check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import itertools 5 | from .blacklists import blacklists 6 | 7 | def check_credibility(url): 8 | plain_lists = [l for l in list(blacklists.values())] 9 | consolidated_list = list(itertools.chain.from_iterable(plain_lists)) 10 | confidence = consolidated_list.count(url) / len(blacklists) 11 | unique_set = set(consolidated_list) 12 | 13 | return { 14 | "is_blacklisted": url in consolidated_list, 15 | "fake_confidence": "{0:.2f}".format(confidence) 16 | } -------------------------------------------------------------------------------- /metadoc/domain/lookup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import urllib 5 | import whois 6 | import datetime 7 | 8 | def whois_date_registered(domain): 9 | try: 10 | query = whois.query(domain) # silently fails in corporate env, vocally fails behind proxy 11 | except Exception as e: 12 | query = None 13 | pass 14 | 15 | # if query.creation_date == "before aug-1996": query.creation_date = datetime.datetime(1996) # .co.uk edge case 16 | # elif type(query.creation_date) is not "date": query = None 17 | return query.creation_date if query else None -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | before_install: 5 | - sudo apt-get -qq update 6 | - sudo apt-get install -y python3 libxml2-dev libxslt1-dev libtiff-dev libjpeg-dev webp whois 7 | # command to install dependencies 8 | install: 9 | - pip3 install -r requirements-dev.txt 10 | - pip3 install codecov coveralls 11 | before_script: 12 | - python -m nltk.downloader brown punkt maxent_treebank_pos_tagger wordnet stopwords averaged_perceptron_tagger words maxent_ne_chunker 13 | # command to run tests 14 | script: 15 | - py.test --cov=metadoc -v tests 16 | after_success: 17 | - coveralls 18 | notifications: 19 | email: false 20 | -------------------------------------------------------------------------------- 
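The two domain helpers above, `check_credibility` (check.py) and `whois_date_registered` (lookup.py), are combined further down by the `Domaintools` class. Below is a minimal standalone sketch of calling them directly; the domain value is a purely hypothetical placeholder:

```python
# Minimal usage sketch for the domain helpers above; "example.com" is a
# hypothetical placeholder, not a claim about any real site.
from metadoc.domain.check import check_credibility
from metadoc.domain.lookup import whois_date_registered

domain = "example.com"

credibility = check_credibility(domain)     # {"is_blacklisted": bool, "fake_confidence": "0.00"}
registered = whois_date_registered(domain)  # datetime of registration, or None if the whois query fails

print(credibility, registered)
```

`Domaintools.get_all()`, shown later in metadoc/domain/domaintools.py, wires both helpers together and bumps `fake_confidence` by 0.2 for domains registered within the last year.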
/tests/fixtures/invalid/invalid.html: -------------------------------------------------------------------------------- 1 | CTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 3 | 4 | 5 | 6 | 7 | 8 | 9 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /tests/fixtures/activity_endpoints/https%3A%2F%2Fgraph.facebook.com%2F%3Fid%3Dhttps%3A%2F%2Ftheintercept.com%2F2016%2F11%2F26%2Flaura-ingraham-lifezette%2F.json: -------------------------------------------------------------------------------- 1 | { 2 | "og_object": { 3 | "id": "1537693146247498", 4 | "description": "Macedonian teens and Russian propagandists have been blamed for the scourge of \"fake news,\" but much originated from shady sites tied to Donald Trump allies.", 5 | "title": "Some Fake News Publishers Just Happen to Be Donald Trump\u2019s Cronies", 6 | "type": "article", 7 | "updated_time": "2016-12-03T15:02:34+0000" 8 | }, 9 | "share": { 10 | "comment_count": 3, 11 | "share_count": 13768 12 | }, 13 | "id": "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/" 14 | } -------------------------------------------------------------------------------- /tests/test_extract.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | from unittest.mock import patch 4 | from metadoc.extract import Extractor 5 | from metadoc.extract.pos import do_train 6 | 7 | class MetadocExtractorTest(unittest.TestCase): 8 | def setUp(self): 9 | article_path = "tests/fixtures/theintercept.com/laura-ingraham-lifezette.html" 10 | with open(article_path, 'r') as article: 11 | self.article_html=article.read() 12 | 13 | self.extractor = Extractor(self.article_html) 14 | 15 | def test_init(self): 16 | assert self.extractor.html == self.article_html 17 | 18 | def test_without_ft(self): 19 | self.extractor.fulltext = "" 20 | self.extractor.detect_language() 21 | assert self.extractor 22 | 23 | def test_get_all_local(self): 24 | do_train() 25 | self.extractor.get_all() 26 | assert self.extractor.contenthash == "2b374ca41d42bd582e500e6cdbc936ef" 27 | assert self.extractor.title == "Some Fake News Publishers Just Happen to Be Donald Trump’s Cronies" 28 | -------------------------------------------------------------------------------- /metadoc/social/providers.py: -------------------------------------------------------------------------------- 1 | """While we're only interested in share, like, up counts for now, 2 | there's a lot of interesting metadata in e.g. reddit responses like 3 | user_reports, report_reasons, num_reports, that might be useful 4 | in building certain heuristics. 
5 | """ 6 | 7 | providers = [ 8 | { 9 | "provider": "facebook", 10 | "endpoint": "https://graph.facebook.com/?id={0}", 11 | "metrics": [{ 12 | "label": "sharecount", 13 | "path": "share.share_count" 14 | }] 15 | }, 16 | { 17 | "provider": "linkedin", 18 | "endpoint": "https://www.linkedin.com/countserv/count/share?url={0}/&format=json", 19 | "metrics": [{ 20 | "label": "sharecount", 21 | "path": "count" 22 | }] 23 | }, 24 | { 25 | "provider": "reddit", 26 | "endpoint": "https://buttons.reddit.com/button_info.json?url={0}", 27 | "metrics": [{ 28 | "label": "upvotes", 29 | "path": "data.children[0].data.ups" 30 | }, 31 | { 32 | "label": "num_reports", 33 | "path": "data.children[0].data.num_reports" 34 | }] 35 | } 36 | ] -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Paul Solbach 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included 12 | in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /metadoc/install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import glob 4 | import nltk 5 | import os 6 | import time 7 | 8 | def remove_zips(data_dir): 9 | glob_path = os.path.join(data_dir, '**/*.zip') 10 | for filename in glob.iglob(glob_path, recursive=True): 11 | print("Removing {}...".format(filename)) 12 | os.remove(filename) 13 | 14 | def install_nltk_sets(): 15 | DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data") 16 | REQUIRED_CORPORA = [ 17 | 'brown', # Required for FastNPExtractor 18 | 'punkt', # Required for WordTokenizer 19 | 'wordnet', # Required for lemmatization and Wordnet 20 | 'maxent_ne_chunker', 21 | 'stopwords', 22 | 'words' 23 | ] 24 | 25 | for each in REQUIRED_CORPORA: 26 | print(('[+] Downloading corpus: "{0}"'.format(each))) 27 | nltk.download(each, download_dir=DATA_DIR) 28 | 29 | from metadoc.extract.pos import do_train 30 | print('[+] Training tagger now.') 31 | do_train() 32 | remove_zips(DATA_DIR) 33 | return 34 | 35 | if __name__ == "__main__": 36 | install_nltk_sets() 37 | -------------------------------------------------------------------------------- /tests/test_pos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import asyncio 5 | import asynctest 6 | import pytest 7 | 8 | from asynctest.mock import patch 9 | from metadoc.extract.pos import do_train, AveragedPerceptronTagger 10 | 11 | class MetadocPerceptronTest(asynctest.TestCase): 12 | def setUp(self): 13 | return 14 | 15 | @asynctest.ignore_loop 16 | def test_init(self): 17 | self.perceptron_tagger = AveragedPerceptronTagger(autoload=True) 18 | tags = self.perceptron_tagger.tag("Rami Eid is studying at Stony Brook University in NY") 19 | assert len(tags) == 10 20 | 21 | @asynctest.ignore_loop 22 | def test_string_ends_with_nnp(self): 23 | self.perceptron_tagger = AveragedPerceptronTagger(autoload=True) 24 | test_sentence = "The extraordinary phenomenon of fake news spread by Facebook and other \ 25 | social media during the 2016 presidential election has been largely portrayed as a lucky break for Donald Trump" 26 | 27 | tags = self.perceptron_tagger.tag(test_sentence) 28 | entities = self.perceptron_tagger.named_entities(tags) 29 | 30 | assert tags[len(tags)-1][1] == "NNP" 31 | assert "Donald Trump" in entities 32 | 33 | @asynctest.ignore_loop 34 | @patch('metadoc.extract.pos.pickle.load') 35 | def test_no_pickle_found(self, _mocked_func): 36 | _mocked_func.side_effect = IOError('foo') 37 | with pytest.raises(IOError): 38 | AveragedPerceptronTagger(autoload=True) 39 | -------------------------------------------------------------------------------- /tests/fixtures/activity_endpoints/https%3A%2F%2Fbuttons.reddit.com%2Fbutton_info.json%3Furl%3Dhttps%3A%2F%2Ftheintercept.com%2F2016%2F11%2F26%2Flaura-ingraham-lifezette%2F.json: -------------------------------------------------------------------------------- 1 | {"kind": "Listing", "data": {"modhash": "nyjj03f66efdc11f1eac41664cff6d7bcff5571ad854a1b681", "children": [{"kind": "t3", "data": {"contest_mode": false, "banned_by": null, "domain": "theintercept.com", "subreddit": "hillaryclinton", "selftext_html": null, "selftext": "", "likes": null, "suggested_sort": null, "user_reports": [], "secure_media": null, "saved": false, "id": "5gjkag", "gilded": 0, 
"secure_media_embed": {}, "clicked": false, "report_reasons": null, "author": "NYLaw", "media": null, "name": "t3_5gjkag", "score": 1, "approved_by": null, "over_18": false, "removal_reason": null, "hidden": false, "thumbnail": "default", "subreddit_id": "t5_2u1c9", "edited": false, "link_flair_css_class": null, "author_flair_css_class": null, "downs": 0, "mod_reports": [], "archived": false, "media_embed": {}, "is_self": false, "hide_score": false, "spoiler": false, "permalink": "/r/hillaryclinton/comments/5gjkag/some_fake_news_publishers_just_happen_to_be/", "locked": false, "stickied": false, "created": 1480935974.0, "url": "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/", "author_flair_text": null, "quarantine": false, "title": "Some Fake News Publishers Just Happen to be Donald Trump's Cronies", "created_utc": 1480907174.0, "link_flair_text": null, "distinguished": null, "num_comments": 0, "visited": false, "num_reports": null, "ups": 1}}], "after": null, "before": null}} -------------------------------------------------------------------------------- /metadoc/domain/domaintools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | import time 4 | import logging 5 | import tldextract 6 | from datetime import datetime, timedelta 7 | from .lookup import whois_date_registered 8 | from .check import check_credibility 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class Domaintools(object): 13 | """Gather various metadata like whois informaion 14 | and blacklist status about any given hostname 15 | """ 16 | def __init__(self, url=None): 17 | self.url = url or None 18 | self.get_domain(url) 19 | 20 | def get_domain(self, url): 21 | no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None) 22 | tld = no_fetch_extract(url) 23 | self.domain = "{}.{}".format(tld.domain, tld.suffix) 24 | 25 | def get_date_registered(self): 26 | self.date_registered = whois_date_registered(self.domain) 27 | 28 | def check_credibility(self): 29 | self.credibility = check_credibility(self.domain) 30 | 31 | def get_all(self): 32 | start_time = time.time() 33 | if not self.domain: return 34 | self.get_date_registered() 35 | self.check_credibility() 36 | 37 | if self.date_registered: 38 | self.recalculate_fake_confidence() 39 | self.date_registered_iso = self.date_registered.isoformat() 40 | 41 | logger.debug("--- domain module %s seconds ---" % (time.time() - start_time)) 42 | 43 | def recalculate_fake_confidence(self): 44 | # Adds .2 to fake_confidence if website was registered delta 1y 45 | one_year_ago = datetime.now() - timedelta(days=1*365) 46 | if self.date_registered < one_year_ago: return 47 | 48 | confidence = self.credibility.get("fake_confidence", 0) 49 | self.credibility["fake_confidence"] = float(confidence) + .2 50 | -------------------------------------------------------------------------------- /serve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | __title__ = 'Metadoc - Postmodern news article metadata service' 5 | __copyright__ = 'Copyright 2016, Paul Solbach' 6 | __author__ = 'Paul Solbach' 7 | __license__ = 'MIT' 8 | 9 | import concurrent 10 | import json 11 | import bottle 12 | from bottle import response, request, get, route, run, abort, error 13 | from metadoc import Metadoc 14 | 15 | bottle.BaseRequest.MEMFILE_MAX = 1024 * 1024 # up max POST payload size to 1MB 16 | 17 | @error(404) 18 | 
def error404(error): 19 | return json.dumps({'code': 404,'message': 'url param is missing.'}) 20 | 21 | @get('/social') 22 | def social_article(): 23 | """GET data url required""" 24 | response.content_type = 'application/json' 25 | url = request.query.getone("url") 26 | if not url: 27 | abort(404) 28 | 29 | metadoc = Metadoc(url=url) 30 | payload = metadoc.query(mode="social", fmt="social") 31 | 32 | return json.dumps(payload) 33 | 34 | @get('/extract') 35 | def extract_article(): 36 | """GET data url required""" 37 | response.content_type = 'application/json' 38 | url = request.query.getone("url") 39 | if not url: 40 | abort(404) 41 | 42 | metadoc = Metadoc(url=url) 43 | metadoc._prepare() 44 | metadoc._query_domain() 45 | metadoc._query_extract() 46 | 47 | payload = metadoc._render() # Preserve order 48 | return json.dumps(payload) 49 | 50 | @get('/full') 51 | def full_article(): 52 | """GET data url required""" 53 | response.content_type = 'application/json' 54 | url = request.query.getone("url") 55 | if not url: 56 | abort(404) 57 | 58 | metadoc = Metadoc(url=url) 59 | payload = metadoc.query() 60 | 61 | return json.dumps(payload) 62 | 63 | 64 | run(host='localhost', reloader=True, port=6060) 65 | -------------------------------------------------------------------------------- /metadoc/social/activity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import asyncio 3 | import jmespath 4 | import json 5 | import logging 6 | import requests 7 | import signal 8 | import time 9 | 10 | from aiohttp import ClientSession 11 | from .providers import providers 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class ActivityCount(object): 16 | """Gather activity/share stats from social APIs""" 17 | 18 | def __init__(self, url=None): 19 | self.url = url or None 20 | self.responses = [] 21 | 22 | def get_all(self, loop): 23 | activity_tasks = [] 24 | for provider in providers: 25 | url = provider["endpoint"].format(self.url) 26 | task = asyncio.ensure_future(self.collect_sharecount(url, provider)) 27 | activity_tasks.append(task) 28 | 29 | return asyncio.gather(*activity_tasks) 30 | 31 | async def get_json(self, url): 32 | async with ClientSession() as session: 33 | async with session.get(url) as response: 34 | return await response.read() 35 | 36 | async def collect_sharecount(self, url, provider): 37 | try: 38 | response = await self.get_json(url) 39 | j = json.loads(response) 40 | 41 | data = { 42 | "provider": provider["provider"], 43 | "metrics": [] 44 | } 45 | 46 | for m in provider["metrics"]: 47 | data["metrics"].append({ 48 | "count": jmespath.search(m["path"], j), 49 | "label": m["label"] 50 | }) 51 | self.responses.append(data) 52 | except Exception as exc: 53 | logger.error("Collecting sharecount failed!") 54 | logger.exception(exc) 55 | 56 | -------------------------------------------------------------------------------- /tests/test_domain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import datetime 4 | 5 | import unittest 6 | from unittest.mock import patch 7 | from metadoc.domain import Domaintools 8 | 9 | class MetadocDomaintoolsTest(unittest.TestCase): 10 | 11 | def setUp(self): 12 | article_path = "tests/fixtures/theintercept.com/laura-ingraham-lifezette.html" 13 | self.title = "Some Fake News Publishers Just Happen to Be Donald Trump’s Cronies" 14 | self.url = 
"https://theintercept.com/2016/11/26/laura-ingraham-lifezette/" 15 | self.date_registered = datetime.datetime(2008, 10, 1, 0, 0) 16 | self.domaintools = Domaintools(url=self.url) 17 | 18 | with open(article_path, 'r') as article: 19 | self.article_html=article.read() 20 | 21 | def test_init(self): 22 | assert self.domaintools.url == self.url 23 | assert self.domaintools.domain == "theintercept.com" 24 | 25 | @patch('metadoc.domain.domaintools.whois_date_registered') 26 | def test_get_all_local(self, _mocked_func): 27 | _mocked_func.return_value = self.date_registered 28 | self.domaintools.get_all()#self.loop) 29 | assert self.domaintools.date_registered == self.date_registered 30 | 31 | credibility_resp = { 32 | "is_blacklisted": False, 33 | "fake_confidence": "0.00" 34 | } 35 | 36 | assert self.domaintools.credibility == credibility_resp 37 | assert self.domaintools.date_registered == self.date_registered 38 | 39 | def test_get_all_remote(self): 40 | self.domaintools.get_all() 41 | assert self.domaintools.date_registered is not self.date_registered 42 | 43 | def test_new_domain(self): 44 | today = datetime.datetime.now() 45 | self.domaintools.date_registered = today 46 | self.domaintools.check_credibility() 47 | self.domaintools.recalculate_fake_confidence() 48 | 49 | assert self.domaintools.credibility["fake_confidence"] == 0.2 50 | 51 | -------------------------------------------------------------------------------- /tests/test_social.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import asyncio 5 | import asynctest 6 | import datetime 7 | import json 8 | import jmespath 9 | import urllib.parse 10 | 11 | from asynctest.mock import patch 12 | from metadoc.social import ActivityCount 13 | from metadoc.social.providers import providers 14 | 15 | class MetadocActivityCountTest(asynctest.TestCase): 16 | def setUp(self): 17 | self.url = "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/" 18 | self.activity = ActivityCount(url=self.url) 19 | 20 | @asynctest.ignore_loop 21 | def test_init(self): 22 | assert self.activity.url == self.url 23 | 24 | def mocked_get_json(self, url): 25 | escaped_url = urllib.parse.quote(url, safe='') 26 | with open("tests/fixtures/activity_endpoints/{0}.json".format(escaped_url), 'r') as file: 27 | file_content=file.read() 28 | 29 | json_response = json.loads(file_content) 30 | provider = urllib.parse.urlparse(url).netloc.split(".")[1] 31 | setattr(self, provider, json_response) 32 | 33 | return file_content 34 | 35 | @patch.object(ActivityCount, 'get_json') 36 | async def test_get_all_local(self, _mocked_func): 37 | _mocked_func.side_effect = self.mocked_get_json 38 | 39 | for metrics in self.activity.responses: 40 | provider_data = [p for p in providers if p["provider"] == metrics["provider"]] 41 | test_data = getattr(self, metrics["provider"], None) 42 | test_metric_count = jmespath.search(provider_data[0]["metrics"][0]["path"], test_data) 43 | returned_metric_count = metrics["metrics"][0]["count"] 44 | assert test_metric_count == returned_metric_count 45 | 46 | async def test_get_all_remote(self): 47 | await self.activity.get_all(self.loop) 48 | assert len(self.activity.responses) > 0 49 | 50 | async def test_invalid_url(self): 51 | activity = ActivityCount(url="nourlatall") 52 | res = await activity.collect_sharecount(url="nourlatall", provider="foo") 53 | assert res == None 54 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os.path 4 | import sys 5 | import re 6 | from subprocess import call 7 | from setuptools import setup, find_packages 8 | from setuptools.command.install import install as _install 9 | from setuptools.command.sdist import sdist as _sdist 10 | from wheel.bdist_wheel import bdist_wheel as _bdist_wheel 11 | 12 | with open('./README.md') as f: 13 | long_description = f.read() 14 | 15 | requirements_txt = open("./requirements.txt").read() 16 | main_py = open('metadoc/__init__.py').read() 17 | metadata = dict(re.findall("__([a-z]+)__ = '([^']+)'", main_py)) 18 | 19 | 20 | def _post_install(): 21 | from metadoc.install import install_nltk_sets 22 | install_nltk_sets() 23 | 24 | class DevInstall(_install): 25 | def run(self): 26 | call(["pip install -r ./requirements-dev.txt --no-clean"], shell=True) 27 | self.execute(_post_install, (), msg="Installing nltk sets!") 28 | _install.run(self) 29 | 30 | class CustomInstall(_sdist): 31 | def run(self): 32 | call(["pip install -r ./requirements.txt --no-clean"], shell=True) 33 | self.execute(_post_install, (), msg="Installing nltk sets!") 34 | _sdist.run(self) 35 | 36 | class BdistEggInstall(_bdist_wheel): 37 | def run(self): 38 | call(["pip install -r ./requirements.txt --no-clean"], shell=True) 39 | self.execute(_post_install, (), msg="Installing nltk sets!") 40 | _bdist_wheel.run(self) 41 | 42 | setup( 43 | name='metadoc', 44 | version=metadata["version"], 45 | description="Post-truth era news article metadata service.", 46 | long_description=long_description, 47 | long_description_content_type='text/markdown', 48 | classifiers=[ # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers 49 | "Programming Language :: Python :: 3.5", 50 | "Topic :: Internet :: WWW/HTTP", 51 | "Development Status :: 3 - Alpha", 52 | "License :: OSI Approved :: MIT License", 53 | "Intended Audience :: Developers", 54 | "Operating System :: POSIX :: Linux", 55 | "Environment :: Web Environment", 56 | ], 57 | keywords=["scraping", "metadata", "news article"], 58 | author=metadata["author"], 59 | author_email='p@psolbach.com', 60 | url='https://github.com/praise-internet/metadoc', 61 | license=metadata["license"], 62 | cmdclass={'sdist': CustomInstall, 'develop': DevInstall}, 63 | packages=find_packages(exclude=['tests']), 64 | install_requires=requirements_txt.strip().split("\n"), 65 | include_package_data=True, 66 | zip_safe=False 67 | ) 68 | -------------------------------------------------------------------------------- /tests/test_module.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import asynctest 3 | import pytest 4 | from metadoc import Metadoc 5 | 6 | class MetadocModuleTest(asynctest.TestCase): 7 | def setUp(self): 8 | self.url = "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/" 9 | article_path = "tests/fixtures/theintercept.com/laura-ingraham-lifezette.html" 10 | with open(article_path, 'r') as article: 11 | self.article_html=article.read() 12 | 13 | self.metadoc = Metadoc(url=self.url, html=self.article_html) 14 | 15 | @asynctest.ignore_loop 16 | def test_init(self): 17 | assert self.metadoc.url == self.url 18 | assert self.metadoc.html == self.article_html 19 | 20 | @asynctest.ignore_loop 21 | def test_query_all(self): 22 | 
result = self.metadoc.query() 23 | assert result 24 | 25 | @asynctest.ignore_loop 26 | def test_extract(self): 27 | self.metadoc.query("extract") 28 | assert self.metadoc.extractor 29 | 30 | @asynctest.ignore_loop 31 | def test_social(self): 32 | self.metadoc.query("social") 33 | assert self.metadoc.activity 34 | 35 | @asynctest.ignore_loop 36 | def test_social_return(self): 37 | result = self.metadoc.query("social", "social") 38 | assert list(result.keys()) == ["url", "social", "__version__"] 39 | 40 | @asynctest.ignore_loop 41 | def test_domain(self): 42 | self.metadoc.query("domain") 43 | assert self.metadoc.domain 44 | 45 | @asynctest.ignore_loop 46 | def test_no_url_fail(self): 47 | with pytest.raises(AttributeError): 48 | Metadoc() 49 | 50 | @asynctest.ignore_loop 51 | def test_invalid_url_fail(self): 52 | metadoc = Metadoc(url="https://theintercept.com/404/", html=None) 53 | result = metadoc.query() 54 | assert result["errors"][0] == "Requesting article body failed with 404 status code." 55 | 56 | @asynctest.ignore_loop 57 | def test_no_html(self): 58 | metadoc = Metadoc(url=self.url) 59 | metadoc.query() 60 | 61 | @asynctest.ignore_loop 62 | def test_check_result(self): 63 | self.metadoc._check_result({}) 64 | 65 | @asynctest.ignore_loop 66 | def test_invalid_charset_check(self): 67 | s = "Von da an beginnt fär die meisten jedoch der hektische Teil." 68 | assert self.metadoc._check_invalid_encoding(s) == True 69 | s = "Von da an beginnt für die meisten jedoch der hektische Teil." 70 | assert self.metadoc._check_invalid_encoding(s) == True 71 | s = "Von da an beginnt för die meisten jedoch der hektische Teil." 72 | assert self.metadoc._check_invalid_encoding(s) == True 73 | s = "Von da an beginnt für die meisten jedoch der hektische Teil." 74 | assert self.metadoc._check_invalid_encoding(s) == True 75 | 76 | s = "DE PÊRA" 77 | assert self.metadoc._check_invalid_encoding(s) == False 78 | 79 | @asynctest.ignore_loop 80 | def test_invalid_t3n(self): 81 | metadoc = Metadoc(url="https://t3n.de/news/remote-work-home-office-heimarbeit-erfahrungsbericht-1018248/", html=None) 82 | result = metadoc.query() 83 | assert result["title"] == "Remote Workers Life: „Das Homeoffice löst viele Probleme, schafft aber auch neue“" 84 | -------------------------------------------------------------------------------- /metadoc/extract/ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import imp 5 | import sys 6 | import os 7 | 8 | if os.environ.get("LAMBDA_TASK_ROOT", False): 9 | # overwrite sqlite with dummy modules, for AWS Lambda 10 | sys.modules["sqlite"] = imp.new_module("sqlite") 11 | sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2") 12 | import nltk 13 | 14 | import difflib 15 | import operator 16 | import numpy 17 | import string 18 | import re 19 | 20 | from nltk.tokenize import RegexpTokenizer 21 | from .pos import AveragedPerceptronTagger 22 | 23 | tokenizer = RegexpTokenizer(r'\w+') 24 | 25 | # add path, for AWS Lambda 26 | LOCAL_DATA_PATH = os.path.join(os.path.dirname(__file__), "data") 27 | nltk.data.path.append(LOCAL_DATA_PATH) 28 | 29 | def isPunct(word): 30 | pattern = r"(`|\.|#|\$|%|&|\'|\(|\)|\*|\||\+|,|-|—|/|:|;|<|=|>|\?|@|\[|\]|\^|_|`|{|}|~|”|“|’)" 31 | return re.search(pattern, word) is not None 32 | 33 | class EntityExtractor(object): 34 | def __init__(self, text): 35 | self.perceptron_tagger = AveragedPerceptronTagger(autoload=True) 36 | self.stopwords = 
set(nltk.corpus.stopwords.words()) 37 | self.top_fraction = 70 # consider top candidate keywords only 38 | self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 39 | self.sentences = self.sent_detector.tokenize(text) 40 | 41 | def _calculate_word_scores(self, word_list): 42 | """Quick and dirty, inspired by Sujit Pal's RAKE implementation. 43 | """ 44 | word_freq = nltk.FreqDist() 45 | for word in word_list: 46 | word_freq[word] += 1 47 | 48 | word_scores = {k:v for k, v in word_freq.items() if v > 0} 49 | return word_scores 50 | 51 | # def _get_mt_median(self, word_scores): 52 | # median = numpy.median([v for k, v in word_scores.items()]) 53 | # return {k: v for k, v in word_scores.items() if v > median} 54 | 55 | def _filter_distance(self, words): 56 | close_matches = [] 57 | wordlist = set(words[:]) # deepcopy 58 | 59 | for word in words: 60 | if word in close_matches: continue 61 | matches = difflib.get_close_matches(word, wordlist, 2) 62 | if len(matches) > 1: 63 | close_matches += matches[1:] 64 | 65 | return wordlist.difference(close_matches) 66 | 67 | def _sort_and_filter(self, word_scores): 68 | n_words = len(word_scores) 69 | sorted_word_scores = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True) 70 | top_words = sorted_word_scores[0:int(n_words/100*self.top_fraction)] 71 | punct_filtered = [k[0] for k in top_words if not isPunct(k[0])] 72 | distance_filtered = self._filter_distance(punct_filtered) 73 | return list(distance_filtered) 74 | 75 | def _contains_stopword(self, ent): 76 | filtered = [word.lower() in self.stopwords for word in ent.split(" ")] 77 | return True in filtered 78 | 79 | def get_scored_entities(self): 80 | named_ents = [] 81 | 82 | for sent in self.sentences: 83 | pos_tags = self.perceptron_tagger.tag(" ".join(nltk.word_tokenize(sent))) 84 | entities = self.perceptron_tagger.named_entities(pos_tags) 85 | named_ents += [ent for ent in entities if not self._contains_stopword(ent)] 86 | 87 | ent_scores = self._calculate_word_scores(named_ents) 88 | self.ent_scores = ent_scores 89 | return ent_scores 90 | 91 | def get_names(self): 92 | filtered_names = {k: v for k, v in self.ent_scores.items() if len(k.split(" ")) > 1} 93 | top_names = self._sort_and_filter(filtered_names) 94 | return top_names[:8] 95 | 96 | def get_keywords(self): 97 | filtered_keywords = {k.lower(): v for k, v in self.ent_scores.items() if len(k.split(" ")) == 1} 98 | top_keywords = self._sort_and_filter(filtered_keywords) 99 | return top_keywords[:8] 100 | -------------------------------------------------------------------------------- /metadoc/extract/extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | import lxml 5 | import math 6 | import time 7 | import hashlib 8 | 9 | from langdetect import detect 10 | from goose3 import Goose, Configuration 11 | 12 | from .ner import EntityExtractor 13 | from .html import HtmlMeta 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | class Extractor(object): 18 | """Entity recognition, pullquote extraction etc. 
19 | """ 20 | def __init__(self, html=None, title=" ", **kwargs): 21 | self.html = html or None 22 | self.title = title or None 23 | self.entities = [] 24 | self.keywords = [] 25 | self.names = [] 26 | self.fulltext = None 27 | self.language = None 28 | self.description = None 29 | self.canonical_url = None 30 | self.image = None 31 | self.published_date = None 32 | self.modified_date = None 33 | self.scraped_date = None 34 | self.contenthash = None 35 | self.reading_time = None 36 | 37 | config = Configuration() 38 | config.enable_image_fetching = False 39 | self.goose = Goose(config=config) 40 | 41 | self.tree = None 42 | 43 | def detect_language(self): 44 | """Langdetect is non-deterministic, so to achieve a higher probability 45 | we attempt detection multiple times and only report success if we get identical results. 46 | """ 47 | if self.language: 48 | return 49 | 50 | try: 51 | nondet_attempts = [detect(self.fulltext) for i in range(0,2)] 52 | is_unique = len(set(nondet_attempts)) == 1 53 | self.language = nondet_attempts[0] if is_unique else False 54 | except: 55 | pass 56 | 57 | def sanitize_html(self): 58 | # Lxml bails out on html w/ emojis 59 | emoji_pattern = re.compile("[" 60 | u"\U0001F600-\U0001F64F" # emoticons 61 | u"\U0001F300-\U0001F5FF" # symbols & pictographs 62 | u"\U0001F680-\U0001F6FF" # transport & map symbols 63 | u"\U0001F1E0-\U0001F1FF" # flags (iOS) 64 | "]+", flags=re.UNICODE) 65 | 66 | self.html = emoji_pattern.sub(r'', self.html) 67 | 68 | # empty charset derails goose3 69 | self.html = self.html.replace('', '') 70 | 71 | def extract_text(self): 72 | """Parse fulltext, do keyword extraction using the newspaper lib 73 | => newspaper.readthedocs.io 74 | """ 75 | res = self.goose.extract(url=None, raw_html=self.html.encode("utf-8")) 76 | self.tree = res.raw_doc 77 | self.fulltext = res.cleaned_text 78 | self.language = res.meta_lang 79 | 80 | entities = EntityExtractor(self.fulltext) 81 | entities.get_scored_entities() # Averaged Perceptron Tagger 82 | self.keywords = entities.get_keywords() # Above median? 
83 | self.names = entities.get_names() # Filter top 84 | 85 | def extract_metadata(self): 86 | """Sniff for essential and additional metadata via 87 | either metatags and or json-ld""" 88 | html_meta = HtmlMeta(self.html, tree=self.tree) 89 | html_meta.extract() 90 | 91 | # data 92 | self.authors = html_meta.authors 93 | self.title = html_meta.title 94 | self.description = html_meta.description 95 | self.canonical_url = html_meta.canonical_url 96 | self.image = html_meta.image 97 | self.published_date = html_meta.published_date 98 | self.modified_date = html_meta.modified_date 99 | self.scraped_date = html_meta.scraped_date 100 | 101 | def get_contenthash(self): 102 | """Generate md5 hash over title and body copy in order to keep track 103 | of changes made to a text, do diffs if necessary 104 | """ 105 | contentstring = (self.title + self.fulltext).encode("utf-8") 106 | self.contenthash = hashlib.md5(contentstring).hexdigest() 107 | return self.contenthash 108 | 109 | def get_reading_time(self): 110 | """Calculate average reading time in seconds""" 111 | if not self.fulltext: return None 112 | wordcount = len(self.fulltext.split()) 113 | self.reading_time = math.floor(wordcount / 300 * 60) 114 | 115 | def get_all(self): 116 | start_time = time.time() 117 | self.sanitize_html() 118 | self.extract_text() 119 | self.extract_metadata() 120 | self.detect_language() 121 | self.get_contenthash() 122 | self.get_reading_time() 123 | logger.debug("--- extraction module %s seconds ---" % (time.time() - start_time)) 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🔹 metadoc 2 | [![Coverage Status](https://coveralls.io/repos/github/psolbach/metadoc/badge.svg?branch=master)](https://coveralls.io/github/psolbach/metadoc?branch=master) 3 | 4 | Metadoc is a lightning-fast news article metadata extraction library. It does social media activity lookup, source authenticity rating, checksum creation, json-ld and metatag parsing as well as information extraction for named entities, pullquotes, fulltext and other useful things based off of arbitrary article URLs. 5 | 6 | ## Example 7 | 8 | You just throw it any news article URL, and Metadoc will yield. 9 | ```python 10 | from metadoc import Metadoc 11 | url = "https://theintercept.com/2016/11/17/iphones-secretly-send-call-history-to-apple-security-firm-says" 12 | metadoc = Metadoc(url=url) 13 | res = metadoc.query() 14 | ``` 15 | => 16 | ```python 17 | { 18 | '__version__': '0.9.0', 19 | 'authors': ['Kim Zetter'], 20 | 'canonical_url': 'https://theintercept.com/2016/11/17/iphones-secretly-send-call-history-to-apple-security-firm-says/', 21 | 'domain': { 22 | 'credibility': { 23 | 'fake_confidence': '0.00', 24 | 'is_blacklisted': False 25 | }, 26 | 'date_registered': None, 27 | 'favicon': 'https://logo.clearbit.com/theintercept.com?size=200', 28 | 'name': 'theintercept.com'}, 29 | 'entities': { 30 | 'keywords': [ 31 | 'cellebrite', 32 | 'fbi', 33 | 'skype', 34 | 'intercept' 35 | ... 
36 | ] 37 | } 38 | }, 39 | 'image': 'https://theintercept.imgix.net/wp-uploads/sites/1/2016/11/GettyImages-578052668-s.jpg?auto=compress%2Cformat&q=90&fit=crop&w=1200&h=800', 40 | 'language': 'en', 41 | 'modified_date': None, 42 | 'published_date': '2016-11-17T11:00:36+00:00', 43 | 'scraped_date': '2018-07-10T12:13:46+00:00', 44 | 'social': [{ 45 | 'metrics': [{ 46 | 'count': 7340, 'label': 'sharecount' 47 | }], 48 | 'provider': 'facebook' 49 | }], 50 | 'text': { 51 | 'contenthash': '940a62c70db255b4aec378529ae7a2c8', 52 | 'fulltext': 'a guardian of user privacy this year after fighting FBI 53 | demands to help crack into San Bernardino shooter Syed ...', 54 | 'reading_time': 439, 55 | 'summary': 'Your call logs get sent to Apple’s servers whenever iCloud is on — something Apple does not disclose.' 56 | }, 57 | 'title': 'iPhones Secretly Send Call\xa0History to Apple, Security Firm Says', 58 | 'url': 'https://theintercept.com/2016/11/17/iphones-secretly-send-call-history-to-apple-security-firm-says' 59 | } 60 | ``` 61 | 62 | ## Trustworthiness Check 63 | Metadoc does a basic background check on article sources. This means a simple blacklist lookup on the domain name, combined with a `whois` check of its registration date. Blacklists taken into account include the controversial [PropOrNot](http://www.propornot.com/p/the-list.html). Thus, only if a domain is found on every blacklist do we spit out a `fake_confidence` of 1. The resulting metadata should be taken with a grain of salt. 64 | 65 | ## Part-of-speech tagging 66 | For speed and simplicity, we decided against `nltk` and instead rely on the Averaged Perceptron as imagined by Matthew Honnibal [@explosion](https://github.com/explosion). The pip install comes pre-trained with a [CoNLL 2000](http://www.cnts.ua.ac.be/conll2000/) training set which works reasonably well to detect proper nouns. Since training is non-deterministic, unwanted stopwords might slip through. If you want to try out other datasets, simply replace `metadoc/extract/data/training_set.txt` with your own and run `metadoc.extract.pos.do_train`. 67 | 68 | ## Install 69 | Requires python 3.5. 70 | 71 | #### Using pip 72 | ```shell 73 | pip install metadoc 74 | ``` 75 | 76 | ## Develop 77 | 78 | #### Mac OS 79 | ```shell 80 | brew install python3 libxml2 libxslt libtiff libjpeg webp little-cms2 81 | ``` 82 | #### Ubuntu 83 | ```shell 84 | apt-get install -y python3 libxml2-dev libxslt-dev libtiff-dev libjpeg-dev webp whois 85 | ``` 86 | #### Fedora/Redhat 87 | ```shell 88 | dnf install libxml2-devel libxslt-devel libtiff-devel libjpeg-devel libjpeg-turbo-devel libwebp whois 89 | ``` 90 | #### Then 91 | ```shell 92 | pip3 install -r requirements-dev.txt 93 | python serve.py => serving @ 6060 94 | ``` 95 | 96 | ## Test 97 | ```shell 98 | py.test -v tests 99 | ``` 100 | If you happen to run into an error with OSX 10.11 concerning a lazily bound library in PIL, 101 | just remove `/PIL/.dylibs/liblzma.5.dylib`. 102 | 103 | ## Todo 104 | * Page concatenation is needed in order to properly calculate wordcount and reading time. 105 | * Authenticity heuristic with sharecount deviance detection (requires state). 106 | * ~~Perf: Worst offender is nltk's pos tagger. Roll own w/ Average Perceptron.~~ 107 | * ~~Newspaper's summarize produces pullquotes, fulltext takes a while. Move to libextract?~~ 108 | 109 | ## Contributors 110 | [Martin Borho](https://github.com/mborho) 111 | [Paul Solbach](https://github.com/___paul) 112 | 113 | --- 114 | 115 | Metadoc is a software product of FanMatics, Hamburg.
116 | Metadoc stems from a pedigree of nice libraries like [goose3](https://github.com/goose3/goose3/tree/master/goose3), [langdetect](https://github.com/Mimino666/langdetect) and [nltk](https://github.com/nltk/nltk). 117 | Metadoc leans on [this](https://github.com/hankcs/AveragedPerceptronPython) perceptron implementation inspired by Matthew Honnibal. 118 | Metadoc is a work-in-progress. 119 | -------------------------------------------------------------------------------- /tests/test_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import asyncio 5 | import asynctest 6 | 7 | from asynctest.mock import patch 8 | from metadoc.extract.html import HtmlMeta 9 | 10 | def get_html_meta(article_path): 11 | with open(article_path, 'r') as article: 12 | html = article.read() 13 | meta = HtmlMeta(html) 14 | meta.extract() 15 | return meta 16 | return None 17 | 18 | class MetadocHtmMetaTest(asynctest.TestCase): 19 | 20 | @asynctest.ignore_loop 21 | def test_extract(self): 22 | paths = [ 23 | "guardian.com/florida-shooting-suspect-charged-questions-nikolas-cruz.html", 24 | "zeit.de/pressefreiheit-tuerkei-inhaftierte-journalisten-deniz-yuecel-freedeniz.html", 25 | "theintercept.com/iphones-secretly-send-call-history-to-apple-security-firm-says.html", 26 | "nytimes/skeleton-ghana-jamaica.html", 27 | "wired.com/inside-the-mind-of-amanda-feilding-countess-of-psychedelic-science.html", 28 | "theverge.com/spacex-falcon-9-launch-starlink-microsat-2a-2b-paz-watch-live.html", 29 | "faz.net/dass-wir-ueberwacht-werden-ist-klar-aber-von-wem-und-wie-eine-spurensuche-15445555.html", 30 | "time.com/jared-kushner-security-clearance-trump-kelly.html", 31 | "netzpolitik.org/index.html", 32 | "invalid/invalid.html", 33 | "bloomberg.com/brexit-talks-in-peril-as-may-rejects-eu-draft-as-unacceptable", 34 | "buzzfeed.com/so-viel-dreck", 35 | "bostonreview.net/thad-williamson-almost-inevitable-failure-justice", 36 | "washingtonpost.com/i-need-loyalty-james-comeys-riveting-prepared-testimony-about-what-trump-asked-him-annotated.html", 37 | "washingtonpost.com/trump-to-nominate-carson-to-lead-u-s-housing-urban-policy.html", 38 | "bellingcat.com/six-months-medical-facilities-still-fire.html", 39 | "slate.com/how_facebook_s_news_feed_algorithm_works.html", 40 | "mashable.com/australia-heat-records-bom.html", 41 | "telegraph.co.uk/When-Stephen-Fry-met-Jony-Ive-the-self-confessed-fanboi-meets-Apples-newly-promoted-chief-design-officer.html", 42 | "nautil.us/the-strange-persistence-of-first-languages.html", 43 | "businessinsider.com/dropbox-vp-todd-jackson-leaves-for-first-round-capital-2018-4.html", 44 | ] 45 | objs = [get_html_meta("tests/fixtures/"+path) for path in paths] 46 | 47 | # published_data 48 | assert objs[0].published_date == "2018-02-16T00:01:52+00:00" 49 | assert objs[1].published_date == "2018-02-16T10:59:47+00:00" 50 | assert objs[2].published_date == "2016-11-17T11:00:36+00:00" 51 | assert objs[3].published_date == "2018-02-15T18:44:34+00:00" 52 | assert objs[4].published_date == "2018-02-15T20:40:04+00:00" 53 | assert objs[5].published_date == "2018-02-15T18:54:21+00:00" 54 | assert objs[6].published_date == "2018-02-15T08:22:05+00:00" 55 | assert objs[7].published_date == "2018-02-28T03:11:27+00:00" 56 | assert objs[8].published_date == "2018-02-16T13:46:24+00:00" 57 | assert objs[9].published_date == None 58 | 59 | # modified_date 60 | assert objs[0].modified_date == "2018-02-16T09:51:54+00:00" 61 | assert 
objs[1].modified_date == "2018-02-16T10:59:47+00:00" 62 | assert objs[2].modified_date == None 63 | assert objs[3].modified_date == "2018-02-16T05:45:23+00:00" 64 | assert objs[4].modified_date == "2018-02-15T20:40:03+00:00" 65 | assert objs[5].modified_date == "2018-02-15T18:54:21+00:00" 66 | assert objs[6].modified_date == "2018-02-15T09:29:16+00:00" 67 | assert objs[7].modified_date == "2018-02-28T15:45:06+00:00" 68 | assert objs[8].modified_date == "2018-02-16T17:16:57+00:00" 69 | assert objs[9].modified_date == None 70 | 71 | # title 72 | assert objs[4].title == "Inside the Mind of Amanda Feilding, Countess of Psychedelic Science" 73 | assert objs[8].title == "Bundeswehr bereitet sich auf den Kampf gegen Killer-Roboter vor" 74 | assert objs[9].title == None 75 | 76 | # authors 77 | assert objs[2].authors == ["Kim Zetter"] 78 | assert objs[3].authors == ["Randal C. Archibold"] 79 | assert objs[5].authors == ["Loren Grush"] 80 | assert objs[8].authors == ["Alexander Fanta"] 81 | assert objs[9].authors == [] 82 | assert objs[10].authors == ["Tim Ross", "Ian Wishart"] 83 | assert objs[11].authors == ["Becky Barnicoat"] 84 | assert objs[12].authors == ["Thad Williamson"] 85 | assert objs[13].authors == ["Amber Phillips", "Peter W. Stevenson"] 86 | assert objs[14].authors == ["Elise Viebeck"] 87 | assert objs[15].authors == [] # link stripped 88 | assert objs[16].authors == ["Will Oremus"] 89 | assert objs[17].authors == ["Johnny Lieu"] 90 | assert objs[18].authors == ["Stephen Fry"] 91 | assert objs[19].authors == ["Julie Sedivy"] 92 | assert objs[20].authors == ["Becky Peterson, Business Insider"] 93 | 94 | # summary 95 | assert objs[8].description.startswith("Wissenschafter und Aktivisten warnen seit") == True 96 | assert objs[9].description == "" 97 | 98 | # canonical url 99 | assert objs[4].canonical_url == "https://www.wired.com/story/inside-the-mind-of-amanda-feilding-countess-of-psychedelic-science/" 100 | assert objs[9].canonical_url == None 101 | 102 | # images 103 | assert objs[6].image == "http://media2.faz.net/ppmedia/1912312546/1.5445566/article_multimedia_overview/scoring-teaser.png" 104 | assert objs[9].image== None 105 | 106 | """for x, obj in enumerate(objs): 107 | #print(x, obj.jsonld) 108 | print(x, obj.canonical_url) 109 | print(x, obj.image)""" 110 | 111 | 112 | -------------------------------------------------------------------------------- /metadoc/extract/html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import json 4 | import logging 5 | import lxml.etree, lxml.html 6 | from datetime import datetime 7 | from dateutil.parser import parse 8 | from dateutil.tz import tzoffset 9 | from collections import ChainMap 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class HtmlMeta(object): 14 | """Extract metadata from html. 15 | Needs work, e.g. handling multiple @property=author tags, 16 | detect if author content is a social media destination. 
17 | """ 18 | def __init__(self, html, encoding="UTF-8", tree=None): 19 | self.html = html or None 20 | if tree is not None: 21 | # reuse tree already parsed 22 | self.document = tree 23 | else: 24 | self.parser = lxml.html.HTMLParser(encoding=encoding) 25 | self.document = lxml.html.fromstring(html, parser=self.parser) 26 | self._jsonld_xpath = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]') 27 | self._metatag_xpath = lxml.etree.XPath("//meta") 28 | self._links_xpath = lxml.etree.XPath("//link") 29 | 30 | self.links = {} 31 | self.jsonld = {} 32 | self.metatags = {} 33 | 34 | @property 35 | def title(self): 36 | return self.jsonld.get("headline") \ 37 | or self.metatags.get("og:title") \ 38 | or self.extract_title() 39 | 40 | @property 41 | def description(self): 42 | return self.metatags.get("og:description") \ 43 | or self.metatags.get("description", "").strip() 44 | 45 | @property 46 | def canonical_url(self): 47 | return self.links.get("canonical") 48 | 49 | @property 50 | def image(self): 51 | return self.metatags.get("og:image") \ 52 | or self.jsonld.get("thumbnailUrl") 53 | 54 | def _extract_ld_authors(self): 55 | # extract from jsonld 56 | ld_authors = self.jsonld.get("author", {}) 57 | 58 | # Return if unparseable 59 | if not ld_authors: 60 | return None 61 | 62 | # sanitize ld structure 63 | if type(ld_authors) == str: 64 | ld_authors = {"name": ld_authors} 65 | 66 | ld_authors = [a["name"] for a in ld_authors] if type(ld_authors) == list else ld_authors.get("name", False) 67 | return ld_authors 68 | 69 | @property 70 | def authors(self): 71 | # get a value from trove 72 | authors = self._extract_ld_authors() \ 73 | or self.metatags.get("author") \ 74 | or self.metatags.get("article:author") \ 75 | or self.metatags.get("dcterms.creator") \ 76 | or self.metatags.get("article:authorName") \ 77 | or self.metatags.get("citation_author") \ 78 | or self.jsonld.get("authors") # intercept 79 | 80 | if authors: 81 | # ensure list 82 | if type(authors) != list: 83 | authors = [authors] 84 | # strip links 85 | authors = [a for a in authors if a.startswith("http") == False] 86 | 87 | if not authors: 88 | # washingtonpost 89 | xauthors = self.document.xpath("(//span[@itemprop='author'])[1]//span[@itemprop='name']/text()") 90 | if xauthors: 91 | authors = xauthors 92 | 93 | return authors if authors else [] 94 | 95 | @property 96 | def published_date(self): 97 | res = None 98 | xpaths = [ 99 | "//meta[@name='date']/@content", 100 | "//meta[@property='article:published_time']/@content", 101 | "//meta[@property='article:published']/@content", 102 | "//meta[@name='parsely-pub-date']/@content", 103 | "//meta[@name='DC.date.issued']/@content", 104 | "//time[@itemprop='datePublished']/@datetime", 105 | ] 106 | res = self._query_date(xpaths) 107 | if res is None: 108 | ld_date = self.jsonld.get("datePublished") or self.jsonld.get("dateCreated") 109 | if ld_date: 110 | res = self._format_date(ld_date) 111 | return res 112 | 113 | @property 114 | def modified_date(self): 115 | res = None 116 | xpaths = [ 117 | "//meta[@property='article:modified_time']/@content", 118 | "//meta[@property='article:modified']/@content", 119 | "//meta[@name='last-modified']/@content", 120 | ] 121 | res = self._query_date(xpaths) 122 | if res is None: 123 | ld_date = self.jsonld.get("dateModified") 124 | if ld_date: 125 | res = self._format_date(ld_date) 126 | return res 127 | 128 | @property 129 | def scraped_date(self): 130 | return self._format_date(datetime.now()) 131 | 132 | def 
extract(self): 133 | self.metatags = self._extract_items(self._get_metatag_item, self._metatag_xpath) 134 | self.jsonld = self._extract_items(self._get_jsonld_item, self._jsonld_xpath) 135 | self.links = self._extract_items(self._get_link_item, self._links_xpath) 136 | 137 | def _extract_items(self, get_item, xpath): 138 | items = [item for item in map(get_item, xpath(self.document)) if item] 139 | return dict(ChainMap(*items)) 140 | 141 | def _get_metatag_item(self, node): 142 | name = node.xpath('@property') or node.xpath('@itemprop') or node.xpath('@name') 143 | content = node.xpath('@content') 144 | 145 | return {name[0]: content[0]} \ 146 | if (name and content) else None 147 | 148 | def _get_link_item(self, node): 149 | name = node.xpath('@rel') 150 | content = node.xpath('@href') 151 | 152 | return {name[0]: content[0]} \ 153 | if (name and content) else None 154 | 155 | def _get_jsonld_item(self, node): 156 | ld = None 157 | try: 158 | ld_text = node.text.strip() 159 | # sanitize if neccessary 160 | if ld_text.find(" -1: 161 | ld_text = ld_text[ld_text.find("{"):ld_text.rfind("}")+1] 162 | 163 | ld = json.loads(ld_text) 164 | if type(ld) is list: 165 | for item in[i for i in ld if i.get("@type") == "NewsArticle"]: 166 | return item 167 | except Exception as exc: 168 | logger.error("JSON-LD parsing failed") 169 | logger.exception(exc) 170 | return ld if ld else {} 171 | 172 | def extract_title(self): 173 | title = self.document.xpath("(//title)[1]//text()") 174 | return title[0] if len(title) else None 175 | 176 | def _format_date(self, date_in): 177 | date = parse(date_in) if type(date_in) is str else date_in 178 | return date.astimezone().astimezone( 179 | tzoffset(None, 0)).replace(microsecond=0).isoformat() 180 | 181 | def _query_date(self, xpath_rules): 182 | for xpath_rule in xpath_rules: 183 | dates = self.document.xpath(xpath_rule) 184 | if len(dates) > 0: 185 | try: 186 | return self._format_date(str(dates[0]))#.get("content")) 187 | except: 188 | pass 189 | return None 190 | -------------------------------------------------------------------------------- /metadoc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __title__ = 'Metadoc - Postmodern news article metadata service' 4 | __copyright__ = 'Copyright 2016, Paul Solbach' 5 | __author__ = 'Paul Solbach' 6 | __license__ = 'MIT' 7 | __version__ = '0.10.5' 8 | 9 | import asyncio 10 | import time 11 | import concurrent 12 | import requests 13 | import urllib.parse 14 | import os 15 | import re 16 | import sys 17 | import logging 18 | 19 | from .domain import Domaintools 20 | from .extract import Extractor 21 | from .social import ActivityCount 22 | 23 | logger = logging.getLogger() 24 | logger.setLevel(os.environ.get("LOGLEVEL", "INFO")) 25 | formatter = logging.Formatter('%(asctime)s [%(name)s] %(levelname)s %(message)s') 26 | 27 | # set user agent 28 | USER_AGENT = os.environ.get("USER_AGENT", 29 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36") 30 | 31 | if not os.environ.get("LAMBDA_TASK_ROOT", False): 32 | # add stream handler, except for AWS Lambda 33 | ch = logging.StreamHandler(sys.stdout) 34 | ch.setFormatter(formatter) 35 | logger.addHandler(ch) 36 | 37 | class Metadoc(object): 38 | 39 | def __init__(self, url=None, html=None, **kwargs): 40 | """Metadoc API, initialize with 41 | :param url: The article url we shall investigate, required. 
42 | :param html: You can pass in the article html manually, optional. 43 | """ 44 | logger.info("Processing url: {}".format(url)) 45 | 46 | self.errors = [] 47 | self.html = html or None 48 | self.url = url or None 49 | 50 | if not self.url: 51 | raise AttributeError('Missing \"url\" attribute.') 52 | 53 | self.extractor = None 54 | self.activity = None 55 | self.domain = None 56 | 57 | def _prepare(self): 58 | if not self.html: 59 | self.html = self._request_url() 60 | self.extractor = Extractor(html=self.html) # Named entities, synthetic summaries 61 | self.activity = ActivityCount(url=self.url) # Social activity from various networks 62 | self.domain = Domaintools(url=self.url) # Domain whois date, blacklisting 63 | 64 | def query(self, mode=None, fmt=None): 65 | data = None 66 | try: 67 | self._prepare() 68 | calls = { 69 | "social": self._query_social, 70 | "domain": self._query_domain, 71 | "extract": self._query_extract, 72 | } 73 | calls.get(mode, self._query_all)() 74 | data = self._render_social() if fmt == "social" else self._render() 75 | if mode is None: 76 | self._check_result(data) 77 | except Exception as exc: 78 | logger.error("Error when processing {}".format(self.url)) 79 | logger.exception(exc) 80 | self.errors.append(str(exc)) 81 | 82 | # return data or error 83 | if data is None or self.errors: 84 | return self._render_errors() 85 | return data 86 | 87 | def _query_all(self): 88 | """Combine all available resources""" 89 | subtasks = [] 90 | loop = asyncio.new_event_loop() 91 | asyncio.set_event_loop(loop) 92 | 93 | executor = concurrent.futures.ThreadPoolExecutor(max_workers=3) 94 | subtasks.append(loop.run_in_executor(executor, self.extractor.get_all)) 95 | subtasks.append(loop.run_in_executor(executor, self.domain.get_all)) 96 | subtasks.append(self.activity.get_all(loop)) 97 | 98 | loop.run_until_complete(asyncio.wait(subtasks, loop=loop)) 99 | loop.close() 100 | 101 | def _query_domain(self): 102 | self.domain.get_all() 103 | 104 | def _query_social(self): 105 | loop = asyncio.new_event_loop() 106 | asyncio.set_event_loop(loop) 107 | 108 | loop.run_until_complete(self.activity.get_all(loop)) 109 | loop.close() 110 | 111 | def _query_extract(self): 112 | self.extractor.get_all() 113 | 114 | def _render_errors(self): 115 | return { 116 | "errors": self.errors 117 | } 118 | 119 | def _render_social(self): 120 | return { 121 | "url": self.url, 122 | "social": getattr(self.activity, "responses", None), 123 | "__version__": __version__ 124 | } 125 | 126 | def _render(self): 127 | """Construct response dict after partial or complete 128 | queries to various sources 129 | """ 130 | return { 131 | "url": self.url, 132 | "title": getattr(self.extractor, "title", None), 133 | "authors": getattr(self.extractor, "authors", None), 134 | "canonical_url": getattr(self.extractor, "canonical_url", None), 135 | "image": getattr(self.extractor, "image", None), 136 | "social": getattr(self.activity, "responses", None), 137 | "language": getattr(self.extractor, "language", None), 138 | "published_date": getattr(self.extractor, "published_date", None), 139 | "modified_date": getattr(self.extractor, "modified_date", None), 140 | "scraped_date": getattr(self.extractor, "scraped_date", None), 141 | "text": { 142 | "fulltext": getattr(self.extractor, "fulltext", None), 143 | "summary": getattr(self.extractor, "description", "No summary found."), 144 | "reading_time": getattr(self.extractor, "reading_time", None), 145 | "contenthash": getattr(self.extractor, "contenthash", None) 146 | }, 
147 | "entities": { 148 | "names": getattr(self.extractor, "names", None), 149 | "keywords": getattr(self.extractor, "keywords", None), 150 | }, 151 | "domain": { 152 | "name": getattr(self.domain, "domain", None), 153 | "credibility": getattr(self.domain, "credibility", None), 154 | "date_registered": getattr(self.domain, "date_registered_iso", None), 155 | "favicon": "https://logo.clearbit.com/{0}?size=200".format(getattr(self.domain, "domain", None)), 156 | }, 157 | "__version__": __version__ 158 | } 159 | 160 | def _check_result(self, res): 161 | if not res.get("title"): 162 | logger.warning("No title: {}".format(self.url)) 163 | if not res.get("canonical_url"): 164 | logger.warning("No canonical url: {}".format(self.url)) 165 | if len(res.get("text", {}).get("fulltext", [])) < 50: 166 | logger.warning("No or little text: {}".format(self.url)) 167 | if not res.get("entities", {}).get("names"): 168 | logger.warning("No names: {}".format(self.url)) 169 | if not res.get("entities", {}).get("keywords"): 170 | logger.warning("No keywords: {}".format(self.url)) 171 | if not res.get("domain", {}).get("name"): 172 | logger.warning("No domain name: {}".format(self.url)) 173 | 174 | def _request_url(self): 175 | """In case no html parameter was provided to the constructor""" 176 | 177 | p = urllib.parse.urlparse(self.url) 178 | netloc = p.netloc or p.path 179 | path = p.path if p.netloc else '' 180 | # if not netloc.startswith('www.'): 181 | # netloc = 'www.' + netloc 182 | 183 | p = urllib.parse.ParseResult(p.scheme, netloc, path, *p[3:]) 184 | url = p.geturl() 185 | 186 | req = requests.get(url, headers={ 187 | 'Accept-Encoding': 'identity, gzip, deflate, *', 188 | 'User-Agent': USER_AGENT 189 | }) 190 | 191 | if req.status_code != 200: 192 | raise Exception('Requesting article body failed with {} status code.'.format(req.status_code)) 193 | 194 | if self._check_invalid_encoding(req.text): 195 | # check for encoding conflicts (e.g. 
t3n.de) 196 | enc_apparent = req.apparent_encoding.lower() 197 | if req.encoding.lower() != enc_apparent and \ 198 | enc_apparent != "windows-1254": 199 | logger.info("Switching html encoding: {} -> {}".format(req.encoding, enc_apparent)) 200 | req.encoding = enc_apparent 201 | return req.text 202 | 203 | def _check_invalid_encoding(self, html): 204 | r=r'(ü|ä|ö|ü)' 205 | return True if re.search(r, html, re.I|re.M) else False 206 | -------------------------------------------------------------------------------- /metadoc/domain/blacklists.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | blacklists = { 5 | "propornot": [ 6 | "4thmedia.org", 7 | "nsnbc.me", 8 | "presstv.com", 9 | "theunhivedmind.com", 10 | "sana.sy", 11 | "activistpost.com", 12 | "americasfreedomfighters.com", 13 | "beforeitsnews.com", 14 | "corbettreport.com", 15 | "drudgereport.com", 16 | "endingthefed.com", 17 | "globalresearch.ca", 18 | "hangthebankers.com", 19 | "infowars.com", 20 | "naturalnews.com", 21 | "paulcraigroberts.org", 22 | "ronpaulinstitute.org", 23 | "southfront.org", 24 | "theantimedia.org", 25 | "trueactivist.com", 26 | "veteranstoday.com", 27 | "washingtonsblog.com", 28 | "yournewswire.com", 29 | "zerohedge.com", 30 | "4threvolutionarywar.wordpress.com", 31 | "abeldanger.net", 32 | "ahtribune.com", 33 | "allnewspipeline.com", 34 | "americanlookout.com", 35 | "amren.com", 36 | "amtvmedia.com", 37 | "ancient-code.com", 38 | "anonews.co", 39 | "anonhq.com", 40 | "antiwar.com", 41 | "asia-pacificresearch.com", 42 | "assassinationscience.com", 43 | "baltimoregazette.com", 44 | "barenakedislam.com", 45 | "bignuggetnews.com", 46 | "blackagendareport.com", 47 | "blacklistednews.com", 48 | "christianfightback.com", 49 | "collective-evolution.com", 50 | "conservativedailypost.com", 51 | "consortiumnews.com", 52 | "cosmicscientist.com", 53 | "countercurrents.org", 54 | "counterinformation.wordpress.com", 55 | "dailyoccupation.com", 56 | "dailystormer.com", 57 | "darkmoon.me", 58 | "darkpolitricks.com", 59 | "davidstockmanscontracorner.com", 60 | "dcclothesline.com", 61 | "dcleaks.com", 62 | "defenddemocracy.press", 63 | "dennismichaellynch.com", 64 | "disclose.tv", 65 | "disclosuremedia.net", 66 | "educate-yourself.org", 67 | "educateinspirechange.org", 68 | "endoftheamericandream.com", 69 | "endtime.com", 70 | "eutopia.buzz", 71 | "ewao.com", 72 | "eyeopening.info", 73 | "fellowshipoftheminds.com", 74 | "filmsforaction.org", 75 | "floridasunpost.com", 76 | "foreignpolicyjournal.com", 77 | "fourwinds10.net", 78 | "freedomoutpost.com", 79 | "gaia.com", 80 | "galacticconnection.com", 81 | "gatesofvienna.net", 82 | "geopolmonitor.com", 83 | "godlikeproductions.com", 84 | "govtslaves.info", 85 | "greanvillepost.com", 86 | "guccifer2.wordpress.com", 87 | "healthnutnews.com", 88 | "henrymakow.com", 89 | "heresyblog.net", 90 | "humansarefree.com", 91 | "ihavethetruth.com", 92 | "ihavethetruth.com", 93 | "in5d.com", 94 | "informationclearinghouse.info", 95 | "intellihub.com", 96 | "intrepidreport.com", 97 | "investmentresearchdynamics.com", 98 | "investmentwatchblog.com", 99 | "jackpineradicals.com", 100 | "jamesrgrangerjr.com", 101 | "jewsnews.co.il", 102 | "journal-neo.org", 103 | "katehon.com", 104 | "katehon.org", 105 | "kingworldnews.com", 106 | "lewrockwell.com", 107 | "libertyblitzkrieg.com", 108 | "libertywritersnews.com", 109 | "makeamericagreattoday.com", 110 | "mintpressnews.com", 111 | "moonofalabama.org", 112 | 
"nakedcapitalism.com", 113 | "naturalblaze.com", 114 | "newcoldwar.org", 115 | "newstarget.com", 116 | "newswithviews.com", 117 | "nowtheendbegins.com", 118 | "off-guardian.org", 119 | "oftwominds.com", 120 | "opednews.com", 121 | "orientalreview.org", 122 | "patriotrising.com", 123 | "platosguns.com", 124 | "pravda.ru", 125 | "pravdareport.com", 126 | "prepperwebsite.com", 127 | "prisonplanet.com", 128 | "rbth.com", 129 | "readynutrition.com", 130 | "redflagnews.com", 131 | "regated.com", 132 | "rense.com", 133 | "righton.com", 134 | "rinf.com", 135 | "rt.com", 136 | "rumormillnews.com", 137 | "ruptly.tv", 138 | "russia-insider.com", 139 | "sentinelblog.com", 140 | "sgtreport.com", 141 | "shiftfrequency.com", 142 | "shtfplan.com", 143 | "silentmajoritypatriots.com", 144 | "silverdoctors.com", 145 | "sott.net", 146 | "sputniknews.com", 147 | "stormcloudsgathering.com", 148 | "strategic-culture.org", 149 | "superstation95.com", 150 | "survivopedia.com", 151 | "the-newspapers.com", 152 | "thecommonsenseshow.com", 153 | "thedailybell.com", 154 | "thedailysheeple.com", 155 | "theduran.com", 156 | "theearthchild.co.za", 157 | "theeconomiccollapseblog.com", 158 | "theeventchronicle.com", 159 | "thefederalistpapers.org", 160 | "thefreethoughtproject.com", 161 | "themindunleashed.org", 162 | "thenewsdoctors.com", 163 | "therebel.media", 164 | "therussophile.org", 165 | "thesaker.is", 166 | "thesleuthjournal.com", 167 | "thetruenews.info", 168 | "thetruthseeker.co.uk", 169 | "thirdworldtraveler.com", 170 | "toprightnews.com", 171 | "trunews.com", 172 | "truth-out.org", 173 | "truthandaction.org", 174 | "truthdig.com", 175 | "truthfeed.com", 176 | "truthkings.com", 177 | "ufoholic.com", 178 | "undergroundworldnews.com", 179 | "unz.com", 180 | "usanewshome.com", 181 | "usapoliticsnow.com", 182 | "usasupreme.com", 183 | "usdcrisis.com", 184 | "usslibertyveterans.org", 185 | "vdare.com", 186 | "veteransnewsnow.com", 187 | "vigilantcitizen.com", 188 | "viralliberty.com", 189 | "voltairenet.org", 190 | "wakeupthesheep.com", 191 | "wakingtimes.com", 192 | "wearechange.org", 193 | "weshapelife.org", 194 | "whatdoesitmean.com", 195 | "whatreallyhappened.com", 196 | "wikileaks.com", 197 | "wikileaks.org", 198 | "wikispooks.com", 199 | "worldnewspolitics.com", 200 | "worldpoliticsus.com", 201 | "www.fort-russ.com", 202 | "oilgeopolitics.net", 203 | "gangstergovernment.com", 204 | "memoryholeblog.com", 205 | "eutimes.net", 206 | "intersectionproject.eu" 207 | ], 208 | "fortliberty.org": [ 209 | "21stcenturywire.com", 210 | "800whistleblower.com", 211 | "activistpost.com", 212 | "alternet.org", 213 | "americannews.com", 214 | "antiwar.com", 215 | "beforeitsnews.com", 216 | "bigpzone.com", 217 | "chronicle.su", 218 | "consciouslifenews.com", 219 | "conspiracywire.com", 220 | "countdowntozerotime.com", 221 | "counterpsyops.com", 222 | "dailybuzzlive.com", 223 | "dailycurrant.com", 224 | "dcclothesline.com", 225 | "disclose.tv", 226 | "duffelblog.com", 227 | "duhprogressive.com", 228 | "elitereaders.com", 229 | "empirenews.net", 230 | "english.ruvr.ru", 231 | "eutimes.net", 232 | "federalistpress.com", 233 | "freepatriot.org", 234 | "fromthetrenchesworldreport.com", 235 | "geoengineeringwatch.org", 236 | "globalresearch.ca", 237 | "gonzoglobe.com", 238 | "govtslaves.info", 239 | "guardianlv.com", 240 | "gulagbound.com", 241 | "hangthebankers.com", 242 | "healthimpactnews.com", 243 | "humansarefree.com", 244 | "huzlers.com", 245 | "infowars.com", 246 | "intellihub.com", 247 | "lewrockwell.com", 248 | 
"libertynews.com", 249 | "livefreelivenatural.com", 250 | "nationalreport.net", 251 | "naturalcuresnotmedicine.com", 252 | "naturalnews.com", 253 | "newswire-24.com", 254 | "nodisinfo.com", 255 | "notallowedto.com", 256 | "now8news.com", 257 | "nowtheendbegins.com", 258 | "pakalertpress.com", 259 | "politicalblindspot.com", 260 | "presstv.ir", 261 | "prisonplanet.com", 262 | "randpaulreview.com", 263 | "rawforbeauty.com", 264 | "realfarmacy.com", 265 | "redflagnews.com", 266 | "responsibletechnology.org", 267 | "rt.com", 268 | "secretsofthefed.com", 269 | "southweb.org", 270 | "thecommonsenseshow.com", 271 | "thecontroversialfiles.net", 272 | "thedailysheeple.com", 273 | "thefreethoughtproject", 274 | "thelastgreatstand.com", 275 | "thenewamerican.com", 276 | "theracketreport.com", 277 | "therightplanet.com", 278 | "therundownlive.com", 279 | "theuspatriot.com", 280 | "topinfopost.com", 281 | "truthandaction.org", 282 | "truthbroadcastnetwork.com", 283 | "turnerradionetwork.com", 284 | "undergroundhealth.com", 285 | "usahitman.com", 286 | "veteranstoday.com", 287 | "westernjournalism.com", 288 | "whydontyoutrythis.com", 289 | "worldnewsdailyreport.com", 290 | "worldtruth.tv", 291 | "yournewswire.com" 292 | ], 293 | "zimdar": [ 294 | "enduringvision.com", 295 | "70news.wordpress.com", 296 | "abcnews.com.co", 297 | "politicalo.com", 298 | "americannews.com", 299 | "indecisionforever.com", 300 | "realnewsrightnow.com", 301 | "infowars.com", 302 | "rilenews.com", 303 | "civictribune.com", 304 | "mediamass.net", 305 | "megynkelly.us", 306 | "msnbc.com.co", 307 | "msnbc.website", 308 | "nationalreport.net", 309 | "creambmp.com", 310 | "news-hound.com", 311 | "newsbiscuit.com", 312 | "dcgazette.com", 313 | "politicops.com", 314 | "newsmutiny.com", 315 | "drudgereport.com.co", 316 | "empirenews.net" 317 | ] 318 | } -------------------------------------------------------------------------------- /metadoc/extract/pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Averaged perceptron classifier. Implementation geared for simplicity rather than 3 | efficiency. Adapted from @hankcs, cf. https://github.com/hankcs/AveragedPerceptronPython/blob/master/LICENSE 4 | Based on http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ 5 | """ 6 | from collections import defaultdict 7 | import pickle 8 | import random 9 | import logging 10 | import os 11 | 12 | PICKLE = os.path.join(os.path.dirname(__file__), "data/tagger.pickle") 13 | TRAINING_SET = os.path.join(os.path.dirname(__file__), "data/training_set.txt") 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def do_train(): 18 | tagger = AveragedPerceptronTagger(autoload=False) 19 | logger.info('Reading corpus.') 20 | training_data = [] 21 | sentence = ([], []) 22 | 23 | for line in open(TRAINING_SET): 24 | params = line.split(' ') 25 | if len(params) != 2: continue 26 | 27 | sentence[0].append(params[0]) 28 | sentence[1].append(params[1]) 29 | 30 | if params[0] == '.': 31 | training_data.append(sentence) 32 | sentence = ([], []) 33 | 34 | logger.info('training corpus size : %d', len(training_data)) 35 | logger.info('Start training...') 36 | tagger.train(training_data, save_loc=PICKLE) 37 | 38 | class AveragedPerceptron(object): 39 | '''An averaged perceptron, as implemented by Matthew Honnibal. 
40 | ''' 41 | 42 | def __init__(self): 43 | # Each feature gets its own weight vector, so weights is a dict-of-dicts 44 | self.weights = {} 45 | self.classes = set() 46 | # The accumulated values, for the averaging. These will be keyed by 47 | # feature/clas tuples 48 | self._totals = defaultdict(int) 49 | # The last time the feature was changed, for the averaging. Also 50 | # keyed by feature/clas tuples 51 | # (tstamps is short for timestamps) 52 | self._tstamps = defaultdict(int) 53 | # Number of instances seen 54 | self.i = 0 55 | 56 | def predict(self, features): 57 | '''Dot-product the features and current weights and return the best label.''' 58 | scores = defaultdict(float) 59 | for feat, value in features.items(): 60 | if feat not in self.weights or value == 0: 61 | continue 62 | weights = self.weights[feat] 63 | for label, weight in weights.items(): 64 | scores[label] += value * weight 65 | # Do a secondary alphabetic sort, for stability 66 | return max(self.classes, key=lambda label: (scores[label], label)) 67 | 68 | def update(self, truth, guess, features): 69 | '''Update the feature weights.''' 70 | def upd_feat(c, f, w, v): 71 | param = (f, c) 72 | self._totals[param] += (self.i - self._tstamps[param]) * w 73 | self._tstamps[param] = self.i 74 | self.weights[f][c] = w + v 75 | 76 | self.i += 1 77 | if truth == guess: 78 | return None 79 | for f in features: 80 | weights = self.weights.setdefault(f, {}) 81 | upd_feat(truth, f, weights.get(truth, 0.0), 1.0) 82 | upd_feat(guess, f, weights.get(guess, 0.0), -1.0) 83 | return None 84 | 85 | def average_weights(self): 86 | '''Average weights from all iterations.''' 87 | for feat, weights in self.weights.items(): 88 | new_feat_weights = {} 89 | for clas, weight in weights.items(): 90 | param = (feat, clas) 91 | total = self._totals[param] 92 | total += (self.i - self._tstamps[param]) * weight 93 | averaged = round(total / float(self.i), 3) 94 | if averaged: 95 | new_feat_weights[clas] = averaged 96 | self.weights[feat] = new_feat_weights 97 | return None 98 | 99 | # def save(self, path): 100 | # '''Save the pickled model weights.''' 101 | # return pickle.dump(dict(self.weights), open(path, 'w')) 102 | 103 | # def load(self, path): 104 | # '''Load the pickled model weights.''' 105 | # self.weights = pickle.load(open(path)) 106 | # return None 107 | 108 | class AveragedPerceptronTagger(object): 109 | '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal. 110 | :param load: Load the pickled model upon instantiation. 
111 | ''' 112 | START = ['-START-', '-START2-'] 113 | END = ['-END-', '-END2-'] 114 | AP_MODEL_LOC = PICKLE 115 | 116 | def __init__(self, autoload=False): 117 | self.model = AveragedPerceptron() 118 | self.tagdict = {} 119 | self.classes = set() 120 | 121 | if autoload: 122 | self.load(self.AP_MODEL_LOC) 123 | 124 | def tag(self, corpus): 125 | '''Tags a string `corpus`.''' 126 | # Assume untokenized corpus has \n between sentences and ' ' between words 127 | s_split = lambda t: t.split('\n') 128 | w_split = lambda s: s.split() 129 | 130 | def split_sents(corpus): 131 | for s in s_split(corpus): 132 | yield w_split(s) 133 | 134 | prev, prev2 = self.START 135 | tokens = [] 136 | 137 | for words in split_sents(corpus): 138 | context = self.START + [self._normalize(w) for w in words] + self.END 139 | for i, word in enumerate(words): 140 | tag = self.tagdict.get(word) 141 | if not tag: 142 | features = self._get_features(i, word, context, prev, prev2) 143 | tag = self.model.predict(features) 144 | 145 | tokens.append((word, tag.strip())) 146 | prev2 = prev 147 | prev = tag 148 | 149 | return tokens 150 | 151 | def named_entities(self, tags): 152 | '''return sequential named entities, 153 | IO classification isn't as accurate here, since we're not differentiating between PERSON and ORGANIZATION. 154 | Still, this is fast and in many cases suited to the task. 155 | 156 | [('The', 'DT'), ('extraordinary', 'JJ'), ('phenomenon', 'NN'), ('of', 'IN'), ('fake', 'JJ'), 157 | ('news', 'NN'), ('spread', 'NN'), ('by', 'IN'), ('Facebook', 'NNP'), ('and', ''), ('other', 'JJ'), 158 | ('social', 'JJ'), ('media', 'NNS'), ('during', 'IN'), ('the', 'DT'), ('2016', 'CD'), ('presidential', 'JJ'), 159 | ('election', 'NN'), ('has', 'VBZ'), ('been', 'VBN'), ('largely', 'RB'), ('portrayed', 'VBN'), ('as', 'IN'), 160 | ('a', 'DT'), ('lucky', 'JJ'), ('break', 'NN'), ('for', 'IN'), ('Donald', 'NNP'), ('Trump', 'NNP')] 161 | ''' 162 | 163 | ent, entities = [], [] 164 | tags_len = len(tags)-1 165 | push_ent = lambda x: entities.append(" ".join(ent)) 166 | 167 | for i, tag in enumerate(tags): 168 | if tag[1] == "NNP": 169 | ent.append(tag[0]) 170 | if i == tags_len: 171 | push_ent(ent) 172 | 173 | elif len(ent): 174 | push_ent(ent) 175 | ent = [] 176 | 177 | return entities 178 | 179 | def train(self, sentences, save_loc=None, nr_iter=5): 180 | '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter`` 181 | controls the number of Perceptron training iterations. 182 | :param sentences: A list of (words, tags) tuples. 183 | :param save_loc: If not ``None``, saves a pickled model in this location. 184 | :param nr_iter: Number of training iterations. 
185 | ''' 186 | self._make_tagdict(sentences) 187 | self.model.classes = self.classes 188 | for iter_ in range(nr_iter): 189 | c = 0 190 | n = 0 191 | for words, tags in sentences: 192 | prev, prev2 = self.START 193 | context = self.START + [self._normalize(w) for w in words] \ 194 | + self.END 195 | for i, word in enumerate(words): 196 | guess = self.tagdict.get(word) 197 | if not guess: 198 | feats = self._get_features(i, word, context, prev, prev2) 199 | guess = self.model.predict(feats) 200 | self.model.update(tags[i], guess, feats) 201 | prev2 = prev 202 | prev = guess 203 | c += guess == tags[i] 204 | n += 1 205 | random.shuffle(sentences) 206 | logger.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n))) 207 | self.model.average_weights() 208 | 209 | # Pickle as a binary file 210 | if save_loc is not None: 211 | pickle.dump((self.model.weights, self.tagdict, self.classes), 212 | open(save_loc, 'wb'), -1) 213 | 214 | return None 215 | 216 | def load(self, loc=None): 217 | '''Load a pickled model.''' 218 | try: 219 | w_td_c = pickle.load(open(loc, 'rb')) 220 | except IOError: 221 | raise IOError("Invalid perceptrontagger.pickle file.") 222 | 223 | self.model.weights, self.tagdict, self.classes = w_td_c 224 | self.model.classes = self.classes 225 | return None 226 | 227 | def _normalize(self, word): 228 | '''Normalization used in pre-processing. 229 | - All words are lower cased 230 | - Digits in the range 1800-2100 are represented as !YEAR; 231 | - Other digits are represented as !DIGITS 232 | :rtype: str 233 | ''' 234 | if '-' in word and word[0] != '-': 235 | return '!HYPHEN' 236 | elif word.isdigit() and len(word) == 4: 237 | return '!YEAR' 238 | elif word[0].isdigit(): 239 | return '!DIGITS' 240 | else: 241 | return word.lower() 242 | 243 | def _get_features(self, i, word, context, prev, prev2): 244 | '''Map tokens into a feature representation, implemented as a 245 | {hashable: float} dict. If the features change, a new model must be 246 | trained. 
247 | ''' 248 | 249 | def add(name, *args): 250 | features[' '.join((name,) + tuple(args))] += 1 251 | 252 | i += len(self.START) 253 | features = defaultdict(int) 254 | # It's useful to have a constant feature, which acts sort of like a prior 255 | add('bias') 256 | add('i suffix', word[-3:]) 257 | add('i pref1', word[0]) 258 | add('i-1 tag', prev) 259 | add('i-2 tag', prev2) 260 | add('i tag+i-2 tag', prev, prev2) 261 | add('i word', context[i]) 262 | add('i-1 tag+i word', prev, context[i]) 263 | add('i-1 word', context[i - 1]) 264 | add('i-1 suffix', context[i - 1][-3:]) 265 | add('i-2 word', context[i - 2]) 266 | add('i+1 word', context[i + 1]) 267 | add('i+1 suffix', context[i + 1][-3:]) 268 | add('i+2 word', context[i + 2]) 269 | return features 270 | 271 | def _make_tagdict(self, sentences): 272 | '''Make a tag dictionary for single-tag words.''' 273 | counts = defaultdict(lambda: defaultdict(int)) 274 | 275 | for words, tags in sentences: 276 | for word, tag in zip(words, tags): 277 | counts[word][tag] += 1 278 | self.classes.add(tag) 279 | 280 | freq_thresh = 20 281 | ambiguity_thresh = 0.97 282 | 283 | for word, tag_freqs in counts.items(): 284 | tag, mode = max(tag_freqs.items(), key=lambda item: item[1]) 285 | n = sum(tag_freqs.values()) 286 | # Don't add rare words to the tag dictionary 287 | # Only add quite unambiguous words 288 | if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: 289 | self.tagdict[word] = tag 290 | 291 | 292 | def _pc(n, d): 293 | return (float(n) / d) * 100 294 | -------------------------------------------------------------------------------- /tests/fixtures/netzpolitik.org/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Bundeswehr bereitet sich auf den Kampf gegen Killer-Roboter vor – netzpolitik.org 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 35 | 36 | 37 | 38 | 39 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 |
89 | 91 | 92 | 151 | 152 |
153 | 154 |
155 |
156 | 157 | 158 |
159 |
160 | 161 | Linkschleuder 162 |

Bundeswehr bereitet sich auf den Kampf gegen Killer-Roboter vor

163 |
164 | 165 | 166 | 167 | 171 | 172 |
173 | 174 |
175 |
176 | Putzig, aber tödlich: Autonome Waffensysteme stehen nicht unter direkter menschlicher Kontrolle. CC public domain Pascal
177 |
178 | 179 |
180 | 181 |

Wissenschafter und Aktivisten warnen seit längerem vor dem Einsatz autonomer Waffensysteme. Die deutsche Bundeswehr hat nun offiziell ausgeschlossen, solche Systeme ankaufen zu wollen. „Wir haben eine sehr klare Position. Wir haben keine Absicht, autonome Systeme zu erwerben“, sagte Generalleutnant Ludwig Leinhos, dem das „Kommando Cyber- und Informationsraum“ untersteht, gestern im Vorfeld der Münchner Sicherheitskonferenz. Er betonte aber, dass Deutschland sich darauf vorbereiten müsse, sich gegen den Einsatz von Killer-Robotern durch andere Staaten zu verteidigen.

182 | 189 |
190 |

Wir finanzieren uns zu fast 100 % aus Spenden von Leserinnen und Lesern. Unterstütze unsere Arbeit mit einer Spende oder einem Dauerauftrag.

191 |
192 |
193 |

194 |

Einer internationale Kampagne zur Ächtung von Roboter-Kampfsystemen haben sich inzwischen 22 Staaten angeschlossen, berichtet die Nachrichtenseite Politico. Dennoch arbeiten einige Staaten, allen voran der Rüstungsweltmeister USA, an solchen Systemen. Politico schildert in seiner Geschichte die Möglichkeiten der Technologie:

195 |

Im Oktober 2016 lud das US-Verteidigungsministerium eine TV-Crew in die Wüste Kaliforniens ein und entließ dort aus einem Flugzeug einen Schwarm von 103 Drohnen in Vogelgröße in den blauen Himmel. Wie Schwalben flog der Schwarm in Formation, änderte akkordiert seine Richtung, positionierte sich laufend neu und reagierte auf seine Umgebung – die dafür nötigen Entscheidungen traf der Schwarm kollektiv, ohne menschliche Hilfe. Nichts illustriert die revolutionäre Natur von vollständig autonomen Waffensystemen besser als die Neuentwicklung solcher „Schwarm-Drohnen“ – kleiner, unbemannter Flugobjekte, die in Gruppen operieren und schon bald die existierende Militärtechnologie in der Leistung überholen könnten, zu einem Bruchteil der Kosten. [Eigene Übersetzung]

196 |

197 | 198 | 208 | 209 | 210 |
211 | 212 | 213 | Weitersagen und Unterstützen. Danke! 214 | 215 | 216 | 217 | 226 | 227 | 236 | 237 | 246 | 247 | 256 | 257 | 258 | 267 | 268 | 277 | 278 | 287 | 288 |
289 | 290 |
291 | 292 | 304 | 305 | 306 | 317 | 318 |
319 |
320 | 321 | 484 | 485 | 486 | 487 | 488 |
489 | 490 |
491 | 492 | 493 | 7 Kommentare 494 | 495 | 496 | 497 |
    498 |
  1. 499 |
    500 | 512 | 513 |
    514 |

    Witzig dabei ist das die Bundeswehr nicht einmal in der Lage ist den konventionellen Job ordentlich zu erledigen. Völlig unnormal ist das solche Systeme überhaupt entwickelt werden. Es mag abgedreht klingen, aber ich denke zuerst an T2 und frage mich ob solche Systeme jemals völlig kontrollierbar sind und wie es sein kann das man Tod automatisiert.
    515 | Unfassbar.

    516 |
    517 | 518 |
    519 |
  2. 520 |
  3. 521 |
    522 | 534 | 535 |
    536 |

    Was soll daraus werden? Autonome Truppen kämpfen gegen autonome Truppen? Das ist eine Fortführung der abstrusitäten im Krieg. Ab dem Moment wo der Oberbefehlshaber nicht mehr voranritt hat der Krieg seine „ehrenhaftigkeit“ verloren.

    537 |
    538 | 539 |
    540 |
      541 |
    1. 542 |
      543 | 555 | 556 |
      557 |

      Die Vorstellung von Krieg als „ehrenhaft“ ist Romantik. In der Realitaet geht es um das Durchsetzen von Zielen mit Gewalt, da will man primaer gewinnen und nicht ehrenvoller aber uU toter Zweiter werden.

      558 |

      Das ist nicht notwendigermassen ein Widerspruch zur Regulierung oder Aechtung von Waffensystemen, bei denen keiner wirklich etwas gewinnt. Weswegen wir teilautonome Waffensysteme bereits haben und behalten werden, und Verteidugungswaffen immer schneller und dafuer autonomer werden. Letzteres ist halt nicht so problematisch, da sich idR keine friedlichen Objekte mit Schallgeschwidigkeit auf Kollisionskurs begeben…

      559 |
      560 | 561 |
      562 |
        563 |
      1. 564 |
        565 | 577 | 578 |
        579 |

        Wie sagte Worf? Nur der Sieg ist ehrenhaft.
        580 | Mein Zusatz, in erster Linie deshalb, weil man danach die Geschichtsbücher schreiben kann.
        581 | Bitte jetzt nicht mit Nazis kommen. Ich weiß das das Verbrecher waren.

        582 |
        583 | 584 |
        585 |
      2. 586 |
      587 |
    2. 588 |
    589 |
  4. 590 |
  5. 591 |
    592 | 604 | 605 |
    606 |

    16. Februar 2018, von MSC

    607 |

    MSC 2018 – AGENDA UND TEILNEHMERLISTE VERÖFFENTLICHT
    608 | Von Freitag bis Sonntag kommen mehr als 30 Staats- und Regierungschefs und über 100 Minister aus aller Welt zur Münchner Sicherheitskonferenz (MSC) zusammen, um über aktuelle Herausforderungen in der internationalen…

    609 |

    Ergo :Die MSC- beginnt erst heute. Das ist wohl so eine Sache mit den vorgefertigten Pressemitteilungen. Meistens ist das Gegenteil richtig.

    610 |
    611 | 612 |
    613 |
      614 |
    1. 615 |
      616 | 628 | 629 |
      630 |

      Stimmt, die eigentliche Konferenz beginnt erst heute. Die Veranstaltung, bei der Leinhos sprach, fand im Vorfeld dazu statt.

      631 |
      632 | 633 |
      634 |
    2. 635 |
    636 |
  6. 637 |
  7. 638 |
    639 | 651 | 652 |
    653 |

    Wie immer ganz vorne mit dabei: die EU-Kommission.

    654 |

    https://www.heise.de/newsticker/meldung/Dual-Use-EU-soll-autonome-nicht-toedliche-Waffensysteme-vorantreiben-3975637.html

    655 |

    Weil nicht-toedliche Waffensysteme ja kein Problem sind, klar. Und dann wundern, dass Leute der EU-Kommission nicht ueber den Weg trauen oder aus dieser EU raus wollen.

    656 |
    657 | 658 |
    659 |
  8. 660 |
661 | 662 |
663 |

Schreibe einen Kommentar

664 |

Deine E-Mail-Adresse wird nicht veröffentlicht. Erforderliche Felder sind mit * markiert.

665 | 666 |

667 |

668 | 669 |

670 |
671 | 672 |
673 |
674 | 675 |
676 |
677 | 678 | 679 |
680 | 681 | 723 |
724 | 725 | 731 | 736 | 737 | 738 | 743 | 744 | 745 | 746 | 747 | 748 | -------------------------------------------------------------------------------- /tests/fixtures/businessinsider.com/dropbox-vp-todd-jackson-leaves-for-first-round-capital-2018-4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Dropbox VP Todd Jackson leaves for First Round Capital - Business Insider Deutschland 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 52 | 56 | 60 | 70 | 83 | 84 | 85 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 163 | 164 | 167 | 170 | 174 | 175 | 179 | 180 | 181 | 182 | 188 | 189 | 190 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 242 | 243 | 244 | 245 | 246 | 247 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 270 | 319 | 320 | 321 | 322 | 323 | 332 | 333 | 334 | 335 | 336 | 370 | 371 | 372 | 373 | 374 | 375 | 376 |
377 | 378 | 379 | 380 | 381 |
382 | 383 | 384 | 394 | 395 | 396 |
397 |
398 |
399 |
400 | 407 | 408 |
409 |
410 |

International

411 |
412 | 413 | 452 |
453 |
454 | 455 | 462 | 463 | 498 |
499 | 500 | 501 | 502 | 503 |
504 | 508 | 509 |
510 | 511 |
512 | 513 |
514 |
515 | 516 | 520 | 521 |
522 | 523 |
524 | 525 |
526 |
527 | 531 |
532 |
533 | 534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 | 543 | 544 | 545 | 546 |
547 | 588 |
589 | 590 | 591 | 592 | 593 | 594 |
595 |
596 | 597 | 598 | 599 | 600 | 601 |

A key Dropbox exec quit just weeks after its blockbuster IPO — but experts say it could be good in the long run

602 | 603 |
604 |
605 | 606 | 633 | 634 | 674 |
675 | 676 | 677 | 678 | 679 |
680 |
681 | 682 | 683 |
Dropbox CEO Drew HoustonDropbox CEO Drew Houston needs to find a new VP of product.Reuters/Mike Blake

684 |
    685 |
  • Dropbox VP of product Todd Jackson has left the company to join First Round Capital as a founder in residence. 
  • 686 |
  • His move comes on the heels of the cloud storage company's successful IPO.
  • 687 |
  • It also comes just as Dropbox is looking to expand its focus on enterprise products as a means of growing revenue and, eventually, reaching profitability.
  • 688 |
  • Despite Jackson's status a superstar in the consumer space, investors think his departure could actually help Dropbox accomplish its product strategy.
  • 689 |
690 |
691 |


Dropbox lost its superstar VP of product Todd Jackson, just two weeks after the file-sharing service's huge IPO. And analysts say it could be just what Dropbox needs to execute on its promise to refocus on selling its services to larger businesses. 

692 |

Jackson announced via Twitter on Monday a new role at First Round Capital as its first-ever founder in residence. His job will be to advise startup founders in the First Round portfolio, and to launch a Los Angeles edition of the firm's Product Program — a masterclass for early-career product managers.  

693 |

"We're grateful for all the contributions Todd made to Dropbox over the past 2.5 years. While we'll miss him greatly, we wish him all the best in his new role at First Round," a Dropbox spokesperson tells Business Insider. Jackson could not be reached for comment.

694 |

Dropbox hasn't named a replacement for Jackson yet, but for the time being, the company has confirmed that its product organization will be managed by Quentin Clark, Dropbox’s senior VP of engineering, product and design. Clark, who was Jackson's boss, joined Dropbox in September, after two years at SAP and two decades at Microsoft — which is to say he's extremely familiar with business-to-business technology products. 

695 |

"From the looks of things, one would say this news was a hard-to-turn-down opportunity for Todd, but also a leadership adjustment to put Dropbox on a more solid money-making path. That might be welcomed news for investors," Gartner research director Karen Hobert said. 

696 |

Before joining Dropbox in 2015, Jackson spent his career doing product management across some of the biggest companies in tech: Google, Facebook and Twitter. He's a big name in Silicon Valley — but not one associated with enterprise-grade subscription market that Dropbox is looking to conquer. 

697 |

Todd JacksonDropbox VP of product Todd Jackson left the company this week.First Round

"Todd’s background is predominantly startups for consumer products over pure business products," Hobert said. "Given that, his leaving might be an indication of more investment in monetizing and growing the enterprise side of the business — which Dropbox needs to do." 

698 |

When Dropbox filed for its IPO at the end of February, the 11-year-old company made public for the first time that the company isn't profitable. 

699 |

Only 11 million of Dropbox's 500 million registered users pay to use the service, according to its S-1. But the company has 300 million users that it has identified as likely to convert to the paid model, and winning over these users is part of its strategy to become profitable. 

700 |

"They need a solid enterprise leader for that plan," Hobert said.

701 |

And while it's a big deal for any company to lose an executive so soon after its IPO, analysts don't foresee there being a big impact on business in the near term.

702 |

"Customers are a trailing indicator and product changes will begin to show up in the roadmap in six months or beyond," IDC research director Terry Frazier said. "If the current product and roadmap are acceptable, they will buy. If that changes in six months or a year, they will not. That’s when customers will begin voting on the change. "

703 |

First Round was an early investor in Jackson's startup 

704 |

What may have been a liability at Dropbox is an asset in the eyes of First Round. 

705 |

"This career arc makes Todd a rare breed," First Round said in an announcement. "He’s experienced enough to have helped build some of the most iconic products in tech, including Gmail. But he’s also worked at the scrappiest end of the spectrum, scaling high-performance teams from scratch."

706 |

Jackson first met with First Round in 2013 when he was fundraising for Cover, a startup he founded. Cover, which made a smart Android lock screen, was acquired by Twitter in 2014, and Jackson became Twitter's director of product management. In 2014, his wife Arielle — a veteran of Google and Square — joined First Round as a marketing expert in residence. 

707 |

Jackson left Twitter for Dropbox in 2015, just a few months after CEO Jack Dorsey reclaimed his role at the helm of social media company.

708 |

Incidentally, when Jackson first joined Dropbox, he replaced Ilya Fushman, who left the company to join Index Ventures as a general partner. Fushman has since moved to another VC firm, Kleiner Perkins Caufield & Byers.

709 | 710 | 711 |
712 | 713 | 714 | 715 |
716 | 717 |
718 | Mehr: 719 | 720 | Dropbox 721 | First Round 722 | Venture Capital 723 | BI Prime 724 | 725 |
726 |
727 | 728 |
729 |
730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 |
739 | 742 |
743 | 744 |
745 | 746 | 747 | 748 | 749 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 |
802 |

A key Dropbox exec quit just weeks after its blockbuster IPO — but experts say it could be good in the long run

803 | 804 | 805 |

Dropbox VP of product Todd Jackson has left the...

806 |
807 | 808 | 809 | 810 |
811 |
812 |
813 |
814 | 815 |
816 |
817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 |
829 | 830 |
831 |
832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 866 | 867 |
868 | 869 |
870 | 875 |
876 | 877 |
878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 |
888 | 889 | 890 |
891 | 892 | 893 | 894 |
895 |

Junior-Depot

896 |
897 | 898 |
899 | 908 |
Anzeige
909 |
910 | 911 |

 

912 |

 

913 |

WhatsApp-Newsletter

914 |
915 | 922 |
923 |
924 | 925 | 926 | 927 |
928 | 929 | 930 |
931 |
932 | 933 |
934 | 935 |
936 |
937 |
938 |
939 | 940 |
941 |
942 | 943 | 944 |
945 |
946 | 947 |
948 | 949 |
950 | 951 |
952 | 1011 | 1012 | 1013 | 1014 | 1015 | 1017 | 1018 | 1019 | 1021 | 1026 | 1027 | 1028 | 1029 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1050 | 1053 | 1054 | 1055 | 1056 | 1081 | 1082 | 1083 | 1084 | 1085 | 1103 | 1104 | 1105 | 1106 | 1117 | 1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | 1128 | 1139 | 1140 | 1141 |
1142 | 1143 |
1144 |
1145 |
1146 | 1147 |
1148 |
1149 | 1150 | 1151 | 1152 | --------------------------------------------------------------------------------