├── .bumpversion.cfg ├── .cookiecutterrc ├── .coveragerc ├── .editorconfig ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── Notes.md ├── README.rst ├── TODO.rst ├── VERSION ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── readme.rst └── requirements.txt ├── pytest.ini ├── requirements-dev.txt ├── requirements-install.txt ├── requirements-setup.txt ├── requirements-tests.txt ├── setup.cfg ├── setup.py ├── src └── extract_social_media │ └── __init__.py ├── tests ├── test_package_import.py └── test_social.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.4.0 3 | commit = False 4 | tag = False 5 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P\w+))? 6 | serialize = 7 | {major}.{minor}.{patch}-{release} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = placeholder 12 | values = 13 | dev 14 | placeholder 15 | 16 | [bumpversion:file:VERSION] 17 | search = {current_version} 18 | replace = {new_version} 19 | 20 | [bumpversion:file:src/extract_social_media/__init__.py] 21 | search = __version__ = '{current_version}' 22 | replace = __version__ = '{new_version}' 23 | 24 | [bumpversion:file:.cookiecutterrc] 25 | search = version: {current_version} 26 | replace = version: {new_version} 27 | 28 | [bumpversion:file:HISTORY.rst] 29 | search = .. comment:: bumpversion marker 30 | replace = .. comment:: bumpversion marker 31 | 32 | {new_version} ({now:%Y-%m-%d}) 33 | ------------------ 34 | 35 | -------------------------------------------------------------------------------- /.cookiecutterrc: -------------------------------------------------------------------------------- 1 | # This file exists so you can easily regenerate your project. 
2 | # 3 | # `cookiepatcher` is a convenient shim around `cookiecutter` 4 | # for regenerating projects (it will generate a .cookiecutterrc 5 | # automatically for any template). To use it: 6 | # 7 | # pip install cookiepatcher 8 | # cookiepatcher gh:ionelmc/cookiecutter-pylibrary project-path 9 | # 10 | # See: 11 | # https://pypi.python.org/pypi/cookiecutter 12 | # 13 | # Alternatively, you can run: 14 | # 15 | # cookiecutter --overwrite-if-exists --config-file=project-path/.cookiecutterrc gh:ionelmc/cookiecutter-pylibrary 16 | 17 | default_context: 18 | email: johannes@fluquid.com 19 | full_name: Johannes Ahlmann 20 | github_username: fluquid 21 | project_name: Extract Social Media 22 | project_package: extract_social_media 23 | project_short_description: Extract social media links from websites 24 | project_slug: extract-social-media 25 | pypi_username: fluquid 26 | use_codecov: y 27 | use_cython: n 28 | use_landscape: n 29 | use_pypi_deployment_with_travis: y 30 | use_pytest: y 31 | use_requiresio: y 32 | version: 0.4.0 33 | year: 2017 34 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [paths] 2 | source = 3 | src 4 | 5 | [run] 6 | branch = true 7 | source = 8 | extract_social_media 9 | parallel = true 10 | 11 | [report] 12 | show_missing = true 13 | precision = 2 14 | omit = 15 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # rope-vim 63 | .ropeproject 64 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # This file was autogenerated and will overwrite each time you run travis_pypi_setup.py 2 | after_failure: 3 | - more .tox/log/* | cat 4 | - more .tox/*/log/* | cat 5 | after_success: 6 | - '# Codecov requires a single .coverage and will run ''coverage xml'' to 7 | 8 | # generate the report. 
9 | 10 | coverage combine 11 | 12 | bash <(curl -s https://codecov.io/bash) 13 | 14 | ' 15 | before_cache: 16 | - rm -fr $HOME/.cache/pip/log 17 | before_install: 18 | - python --version 19 | - uname -a 20 | - lsb_release -a 21 | cache: 22 | directories: 23 | - $HOME/.cache/pip 24 | deploy: 25 | provider: pypi 26 | true: 27 | condition: $TOXENV == py27 28 | repo: fluquid/extract-social-media 29 | tags: true 30 | distributions: sdist bdist_wheel 31 | user: fluquid 32 | password: 33 | secure: !!binary | 34 | d2wvRkVmOTQvcGxjQ0lhTTlNeUkxR1dRNEN2ckZ3SFptcytRSzMxSHRpQmRwSDVPTFNwMDFXTXlQ 35 | V21rMVg5MlZXeC9HWm5jQWF6dk9IbEwzczdmSHZ4aDRKZkZSUW1QNVpiUUxLL1U5RzIwSXNVZGRY 36 | L2swWUhOb080SVJFVFJValZKd2UwSDhEM25IbzRabFJuTzdQWFh2bDdUbWd3cUdGUGFuQjBtRGV1 37 | cUJWVEp4Z2h0Sjd6cDJBdVl1VExhelVaem5tZ2VPWVZMVDAwbFJYeHZGQ1MvUzRQclZxTDFVZmhR 38 | S3RVSGh2S1BLRDdjMm9SYThGQXlkT1d6bUVLbjRrcUgzM1FOUTVvYnpNcDRVREcyekMvUE1qQkR5 39 | UG9MNUJHRzZJRzJoaHVuWG5qWUxzMTlpQ3pNS0FxMUxGNUpRc09vZThoZGJJNUJXUmRpeWdpNHNM 40 | TVVIelFYYXgwVlVLRDMzc1JBMEUrWGtGNlNPTlkrOWc3MXU5c2s5WVNtNDhzQWpoM2dic2dvZ0d4 41 | M0dKTGl4a3NudUpHSmFCbmRqdjN3bGxkaEdCd3NjSWx4blBhNFlWKzZkcTRTdGNGR2pjKys2RU56 42 | dmJZUE5ldXZOTVpuODg0MHRjOTlMd2FMUE9yWWd2ZEFQbzR5ai8xU2k0VWJiRjF2UVNIdmlFTjN5 43 | UjI4ekdKWmV1L0ZRNW1DdFUxODNBaHg1clBMM1UvMWNxaEJ3ZXBXQkJVZ0pXRlVwS2RQMkd4ZStn 44 | eno2Y216RHFac0xwV2xCWDlVeFR0NHRSQUtpd2hXcGlLZHZ2NXRRS2F4V3N5Qmc3Qy9KY3hCTld4 45 | cU4vSWFNbzFwYlZKbXBUNUFpa21iWnVJYlo5dGR6V1czM2dYc1EzUWJydm4zdStyQzdaaEUwRjQ9 46 | env: 47 | - TOXENV=py35 48 | - TOXENV=py34 49 | - TOXENV=py27 50 | - TOXENV=pypy 51 | install: 52 | - pip install -U tox 53 | - pip install -U coverage 54 | - pip install -U twine 55 | - virtualenv --version 56 | - easy_install --version 57 | - pip --version 58 | - tox --version 59 | language: python 60 | matrix: 61 | allow_failures: 62 | - env: TOXENV=pypy 63 | notifications: 64 | email: 65 | on_failure: always 66 | on_sucess: never 67 | python: 3.5 68 | script: 69 | - tox 70 | sudo: false 71 | 
-------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Johannes Ahlmann 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every 8 | little bit helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/fluquid/extract-social-media/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" 30 | is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "feature" 36 | is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | Extract Social Media could always use more documentation, whether as part of the 42 | official Extract Social Media docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/fluquid/extract-social-media/issues. 
49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `extract-social-media` for local development. 61 | 62 | 1. Fork the `extract-social-media` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/extract-social-media.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ mkvirtualenv extract-social-media 70 | $ cd extract-social-media/ 71 | $ python setup.py develop 72 | 73 | 4. Create a branch for local development:: 74 | 75 | $ git checkout -b name-of-your-bugfix-or-feature 76 | 77 | Now you can make your changes locally. 78 | 79 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 80 | 81 | $ flake8 src tests 82 | $ python setup.py test or py.test 83 | $ tox 84 | 85 | To get flake8 and tox, just pip install them into your virtualenv. 86 | 87 | 6. Commit your changes and push your branch to GitHub:: 88 | 89 | $ git add . 90 | $ git commit -m "Your detailed description of your changes." 91 | $ git push origin name-of-your-bugfix-or-feature 92 | 93 | 7. Submit a pull request through the GitHub website. 94 | 95 | Pull Request Guidelines 96 | ----------------------- 97 | 98 | Before you submit a pull request, check that it meets these guidelines: 99 | 100 | 1. The pull request should include tests. 101 | 2. If the pull request adds functionality, the docs should be updated. Put 102 | your new functionality into a function with a docstring, and add the 103 | feature to the list in README.rst. 
104 | 3. The pull request should work for Python 2.6, 2.7, 3.3, 3.4 and 3.5, and for PyPy. Check 105 | https://travis-ci.org/fluquid/extract-social-media/pull_requests 106 | and make sure that the tests pass for all supported Python versions. 107 | 108 | Tips 109 | ---- 110 | 111 | To run a subset of tests:: 112 | 113 | $ py.test tests/test_social.py 114 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | .. comment:: bumpversion marker 6 | 7 | 0.4.0 (2017-08-18) 8 | ------------------ 9 | 10 | * naive blacklisting for photos, videos, search, tweets, etc. 11 | 12 | 0.3.0 (2017-08-18) 13 | ------------------ 14 | 15 | * fixed exception when "href" is empty or non-string 16 | 17 | 0.2.0 (2017-06-08) 18 | ------------------ 19 | 20 | * better test coverage 21 | * accepting data-href 22 | 23 | 0.1.0 (unreleased) 24 | ------------------ 25 | 26 | * First release on PyPI. 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Johannes Ahlmann 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft docs 2 | graft src 3 | graft tests 4 | 5 | include *.in 6 | include *.ini 7 | include *.rst 8 | include *.txt 9 | include *.md 10 | 11 | include LICENSE 12 | include VERSION 13 | include Makefile 14 | 15 | global-exclude __pycache__ *.py[cod] 16 | global-exclude *.so *.dylib 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-so clean-test clean-pyc clean-build clean-docs clean 2 | .PHONY: docs check check-manifest check-setup check-history lint 3 | .PHONY: test test-all coverage 4 | .PHONY: compile-reqs install-reqs 5 | .PHONY: release dist install build-inplace 6 | define BROWSER_PYSCRIPT 7 | import os, webbrowser, sys 8 | try: 9 | from urllib import pathname2url 10 | except: 11 | from urllib.request import pathname2url 12 | 13 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 14 | endef 15 | export BROWSER_PYSCRIPT 16 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 17 | 18 | SPHINX_BUILD := html 19 | 20 | help: 21 | @echo "check - check setup, code style, setup, etc" 22 | @echo "check-manifest - check manifest" 23 | @echo "check-setup - check setup" 24 | @echo "check-history - check history" 25 | @echo "clean - remove all 
build, test, coverage and Python artifacts" 26 | @echo "clean-build - remove build artifacts" 27 | @echo "clean-docs - remove docs artifacts" 28 | @echo "clean-pyc - remove Python file artifacts" 29 | @echo "clean-test - remove test and coverage artifacts" 30 | @echo "clean-so - remove compiled extensions" 31 | @echo "lint - check style with flake8" 32 | @echo "test - run tests quickly with the default Python" 33 | @echo "test-all - run tests on every Python version with tox" 34 | @echo "coverage - check code coverage quickly with the default Python" 35 | @echo "compile-reqs - compile requirements" 36 | @echo "install-reqs - install requirements" 37 | @echo "docs - generate Sphinx HTML documentation, including API docs" 38 | @echo "release - package and upload a release" 39 | @echo "dist - package" 40 | @echo "develop - install package in develop mode" 41 | @echo "install - install the package to the active Python's site-packages" 42 | 43 | check: check-setup check-manifest check-history lint 44 | 45 | check-setup: 46 | @echo "Checking package metadata (name, description, etc)" 47 | python setup.py check --strict --metadata --restructuredtext 48 | 49 | check-manifest: 50 | @echo "Checking MANIFEST.in" 51 | check-manifest --ignore ".*" 52 | 53 | check-history: 54 | @echo "Checking latest version in HISTORY" 55 | VERSION=`cat VERSION`; grep "^$${VERSION}\b" HISTORY.rst 56 | 57 | clean: clean-build clean-docs clean-pyc clean-test clean-so 58 | 59 | clean-build: 60 | rm -fr build/ 61 | rm -fr dist/ 62 | rm -fr .eggs/ 63 | find . -name '*.egg-info' -exec rm -fr {} + 64 | find . -name '*.egg' -exec rm -f {} + 65 | 66 | clean-docs: 67 | $(MAKE) -C docs clean 68 | 69 | clean-pyc: 70 | find . -name '*.pyc' -exec rm -f {} + 71 | find . -name '*.pyo' -exec rm -f {} + 72 | find . -name '*~' -exec rm -f {} + 73 | find . -name '__pycache__' -exec rm -fr {} + 74 | 75 | clean-test: 76 | rm -fr .tox/ 77 | rm -f .coverage 78 | rm -fr htmlcov/ 79 | 80 | clean-so: 81 | find . 
-name '*.so' -exec rm -f {} + 82 | 83 | lint: 84 | flake8 src tests 85 | 86 | build-inplace: 87 | python setup.py build_ext --inplace 88 | 89 | develop: clean 90 | pip install -e . 91 | 92 | test: develop 93 | py.test 94 | 95 | test-all: 96 | tox -v 97 | 98 | coverage: develop 99 | coverage run -m py.test 100 | coverage combine 101 | coverage report 102 | 103 | coverage-html: coverage 104 | coverage html 105 | $(BROWSER) htmlcov/index.html 106 | 107 | docs-build: develop 108 | rm -f docs/extract_social_media.rst 109 | rm -f docs/modules.rst 110 | sphinx-apidoc -o docs/ src/extract_social_media 111 | $(MAKE) -C docs clean 112 | $(MAKE) -C docs $(SPHINX_BUILD) 113 | 114 | docs: docs-build 115 | $(BROWSER) docs/_build/$(SPHINX_BUILD)/index.html 116 | 117 | servedocs: docs 118 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 119 | 120 | release: clean check dist 121 | git branch | grep '* master' 122 | # Tagging release. 123 | VERSION=`cat VERSION`; git tag -a v$$VERSION 124 | git push --follow-tags 125 | twine upload dist/* 126 | 127 | dist: clean 128 | python setup.py sdist 129 | python setup.py bdist_wheel 130 | ls -l dist 131 | 132 | install: clean 133 | pip install . 134 | 135 | REQUIREMENTS_IN := $(wildcard requirements*.in) 136 | .PHONY: $(REQUIREMENTS_IN) 137 | 138 | requirements%.txt: requirements%.in 139 | pip-compile -v $< -o $@ 140 | 141 | REQUIREMENTS_TXT := $(REQUIREMENTS_IN:.in=.txt) 142 | ifndef REQUIREMENTS_TXT 143 | REQUIREMENTS_TXT := $(wildcard requirements*.txt) 144 | endif 145 | 146 | compile-reqs: $(REQUIREMENTS_TXT) 147 | @test -z "$$REQUIREMENTS_TXT" && echo "No 'requirements*.in' files. Nothing to do" 148 | 149 | install-reqs: 150 | @test -z "$$REQUIREMENTS_TXT" && echo "No 'requirements*.txt' files. 
Nothing to do" 151 | $(foreach req,$(REQUIREMENTS_TXT),pip install -r $(req);) 152 | -------------------------------------------------------------------------------- /Notes.md: -------------------------------------------------------------------------------- 1 | 2 | Social Notes: 3 | - some use outbound link stuff, or bitly 4 | 5 | - "fa fa-facebook", 206 6 | - "fa fa-twitter", 203 7 | - "fa fa-instagram", 70 8 | - "fa fa-youtube", 52 9 | - "fa fa-google-plus", 98 10 | - "icon-twitter" 11 | - "icon-facebook" 12 | - "icon-linkedin" 13 | - "icon-youtube-large" 14 | 15 | `facebook.com/plugins/like` 16 | `` 17 | `` 18 | "//connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.9"; 19 |
22 | 82 | """ 83 | for link in tree.xpath('//*[@href or @data-href]'): 84 | href = link.get('href') or link.get('data-href') 85 | if (href and 86 | isinstance(href, (six.string_types, six.text_type)) and 87 | matches_string(href)): 88 | yield href 89 | 90 | for script in tree.xpath('//script[not(@src)]/text()'): 91 | for match in SOCIAL_REX.findall(script): 92 | if not BLACKLIST_RE.search(match): 93 | yield match 94 | 95 | for script in tree.xpath('//meta[contains(@name, "twitter:")]'): 96 | name = script.get('name') 97 | if name in ('twitter:site', 'twitter:creator'): 98 | # FIXME: track fact that source is twitter 99 | yield script.get('content') 100 | -------------------------------------------------------------------------------- /tests/test_package_import.py: -------------------------------------------------------------------------------- 1 | import extract_social_media 2 | 3 | 4 | def test_package_metadata(): 5 | assert extract_social_media.__author__ 6 | assert extract_social_media.__email__ 7 | assert extract_social_media.__version__ 8 | -------------------------------------------------------------------------------- /tests/test_social.py: -------------------------------------------------------------------------------- 1 | from extract_social_media import matches_string, find_links_tree 2 | from lxml import etree 3 | 4 | """ 5 | TODO POS: 6 | http://mp.weixin.qq.com/s?__biz=MzA5NjM5MjU2OA==&mid=249883494&idx=1&sn=fe698c9e27082fe5520777245a752d9e&3rd=MzA3MDU4NTYzMw==&scene=6#rd 7 | 8 | TODO NEG: 9 | https://www.youtube.com/yt/copyright/ 10 | """ 11 | 12 | LINK_SAMPLES = """ 13 | http://www.flickr.com/photos/lenovophotolibrary 14 | http://www.weibo.com/elletw?sudaref=data.elle.com.tw 15 | http://www.weibo.com/parentingcw 16 | http://facebook.com/tinybuddha 17 | http://www.facebook.com/3Ireland 18 | http://www.facebook.com/LenovoUKandIreland 19 | http://www.facebook.com/daft.ie 20 | 
http://www.facebook.com/pages/Marc-and-Angel-Hack-Life-Practical-Tips-for-Productive-Living/60187856377 21 | http://www.facebook.com/pages/SurveyMonkey/65225997627 22 | http://www.facebook.com/pages/Vodafone-Ireland/39948747919?utm_campaign=vfcontactusfb&utm_medium=facebook&utm_source=onlineteamjd&utm_content=vfcontactusfb 23 | http://www.facebook.com/positivelypositive/ 24 | http://www.facebook.com/thejournal.ie 25 | https://fr-fr.facebook.com/Caisse.Epargne 26 | https://fr-fr.facebook.com/SFR 27 | https://flipboard.com/@techcrunch 28 | http://instagram.com/lifehackorg?ref=footer-browse-instagram 29 | http://instagram.com/newegg/ 30 | https://instagram.com/okcupid 31 | https://instagram.com/pospositive/ 32 | https://instagram.com/snapdeal/?hl=en 33 | http://pinterest.com/bange16/marc-and-angel/ 34 | http://pinterest.com/lifehack/?ref=footer-browse-pinterest 35 | http://pinterest.com/mindbodygreen 36 | http://pinterest.com/thinksimplenow/ 37 | http://www.pinterest.com/instructables 38 | http://www.pinterest.com/thepositivepin/ 39 | https://pinterest.com/EmilySchuman 40 | http://plus.google.com/107307393263977088342/about 41 | https://plus.google.com/+Coursera 42 | https://plus.google.com/+Dropbox/posts 43 | https://plus.google.com/+P%C3%BAblico/posts 44 | https://plus.google.com/+duolingo 45 | https://plus.google.com/+snapdeal/posts 46 | https://plus.google.com/100371967013117528205 47 | https://plus.google.com/111984034088692092819?prsrc=3 48 | https://plus.google.com/116623388763634190489 49 | https://plus.google.com/117330593038325285345/posts 50 | https://plus.google.com/u/0/+eventbrite/ 51 | https://plus.google.com/u/1/115964001953967461416?pageId=114804279025961350651&authuser=1 52 | https://telegram.me/publico_es 53 | http://twitter.com/#!/SurveyMonkey 54 | http://twitter.com/ThreeIreland 55 | http://twitter.com/VodafoneIreland?utm_campaign=vfcontactustw&utm_medium=twitter&utm_source=onlinejd&utm_content=vfcontactustw 56 | http://twitter.com/thejournal_ie 57 
| http://www.twitter.com/MrjWells 58 | http://www.twitter.com/tinybuddha 59 | https://twitter.com/AskAIB 60 | https://twitter.com/Independent_ie 61 | https://twitter.com/XFINITY 62 | https://twitter.com/irishmirror 63 | https://twitter.com/lenovo_uki 64 | https://twitter.com/lifehackorg/?ref=footer-browse-twitter 65 | https://twitter.com/mindbodygreen 66 | https://twitter.com/rte 67 | https://www.twitter.com/Eventbrite 68 | https://www.facebook.com/Atlassian 69 | https://www.facebook.com/DoneDealIreland 70 | https://www.facebook.com/ExploreRTE/ 71 | https://www.facebook.com/Independent.ie 72 | https://www.facebook.com/TED 73 | https://www.facebook.com/cupcakesandcashmere 74 | https://www.facebook.com/eir 75 | https://www.facebook.com/lifehackorg/?ref=footer-browse-facebook 76 | https://www.facebook.com/monepositiveblog/ 77 | https://www.google.com/+Thechangeblog 78 | https://www.instagram.com/imdb/ 79 | https://www.instagram.com/mindvalley 80 | http://www.linkedin.com/company/362798 81 | http://www.linkedin.com/company/aib/products/ 82 | http://www.linkedin.com/company/investopedia-ulc 83 | http://www.linkedin.com/company/techcrunch 84 | http://www.linkedin.com/in/mrjwells 85 | https://www.linkedin.com/pub/whois-api/88/573/6b2 86 | https://www.periscope.tv/le_Parisien 87 | https://www.pinterest.com/snapdeal/ 88 | https://www.pinterest.com/tednews 89 | https://www.pinterest.com/tinybuddha/pins/ 90 | https://www.snapchat.com/add/positivepresent 91 | http://www.youtube.com/ThreeIreland 92 | http://www.youtube.com/aib 93 | http://www.youtube.com/positivelypositive1 94 | http://www.youtube.com/user/instructablestv 95 | http://www.youtube.com/user/mrjWells 96 | http://www.youtube.com/user/positivelypresent 97 | http://www.youtube.com/user/techcrunch 98 | https://www.youtube.com/channel/UCVimQoXNCZuEnZRVAbuYMiw 99 | https://www.youtube.com/channel/UCfHn_8-ehdem86fEvlFg-Gw 100 | https://www.youtube.com/ted 101 | https://www.youtube.com/user/DoneDealers 102 | 
https://www.youtube.com/user/LifehackOrg/?ref=footer-browse-youtube 103 | https://www.youtube.com/user/rte 104 | https://www.youtube.com/user/xfinity?feature=results_main 105 | https://soundcloud.com/uwebristol 106 | https://feeds.feedburner.com/TroyHunt 107 | https://vimeo.com/kadence 108 | https://eg.linkedin.com/in/sayed-gharib-51b05133?trk=pub-pbmap 109 | https://www.linkedin.com/company/dichter-&-neira-research-network?trk=fc_badge 110 | https://dk.linkedin.com/in/carolinehorten 111 | https://plus.google.com/u/0/111494755084642562984/posts 112 | http://www.slideshare.net/haystackinternational 113 | https://www.facebook.com/pages/Robas-Research/357181737690559 114 | http://www.facebook.com/pages/TNS-Global/55944527541 115 | http://plus.google.com/108198427863983309725/ 116 | http://www.youtube.com/tnsglobal 117 | http://feeds.feedburner.com/TnsGlobalPressReleases 118 | https://www.facebook.com/AMR-Advanced-Market-Research-GmbH-152914324834256/timeline/ 119 | https://uk.linkedin.com/in/mihajlopopesku 120 | http://www.vkontakte.ru/fom.media 121 | http://www.slideshare.net/fom-media/ 122 | https://www.pinterest.com/globalvoxpopuli/ 123 | http://fb.co/OReilly 124 | """ 125 | # tumblr, whatsapp, blogspot, PENGYOU, RENREN, KAIXIN 001, TENCENT WEIBO 126 | # SINA WEIBO, Baidu, WECHAT 127 | 128 | SOCIAL_NEGATIVE = """ 129 | https://www.linkedin.com/salary/ 130 | https://www.linkedin.com/learning/me 131 | https://about.twitter.com/company 132 | https://www.youtube.com/t/terms 133 | https://www.youtube.com/yt/policyandsafety/ 134 | https://www.facebook.com/privacy/explanation 135 | https://www.facebook.com/directory/celebrities/ 136 | https://www.facebook.com/mobile/?ref=pf 137 | https://www.facebook.com/directory/people/ 138 | https://www.facebook.com/places/ 139 | https://www.facebook.com/games/ 140 | https://www.facebook.com/careers/?ref=pf 141 | https://about.pinterest.com/en 142 | https://www.pinterest.com/_/_/about/ 143 | https://www.instagram.com/about/us/ 144 | 
https://www.instagram.com/developer/ 145 | https://www.instagram.com/legal/terms/ 146 | https://business.instagram.com/ 147 | https://www.snapchat.com/geofilters 148 | https://www.snapchat.com/jobs 149 | https://www.snapchat.com/terms 150 | https://www.snapchat.com/beta/ 151 | https://business.snapchat.com/ 152 | https://www.flickr.com/cameras 153 | https://www.flickr.com/about 154 | https://www.flickr.com/explore/ 155 | https://www.flickr.com/jobs 156 | https://www.xing.com/news/pages/f-a-z-wirtschaft-finanzen-90 157 | """ 158 | 159 | 160 | def split_lines(lines): 161 | return [x for x in lines.split('\n') if x.strip()] 162 | 163 | 164 | def test_positives(): 165 | for sample in split_lines(LINK_SAMPLES): 166 | assert matches_string(sample), (sample, ) 167 | 168 | 169 | def test_href(): 170 | href = etree.HTML(""" 171 | 172 | 173 | 177 |