├── .bumpversion.cfg
├── .cookiecutterrc
├── .coveragerc
├── .editorconfig
├── .gitignore
├── .travis.yml
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── Notes.md
├── README.rst
├── TODO.rst
├── VERSION
├── docs
    ├── Makefile
    ├── authors.rst
    ├── conf.py
    ├── contributing.rst
    ├── history.rst
    ├── index.rst
    ├── installation.rst
    ├── make.bat
    ├── readme.rst
    └── requirements.txt
├── pytest.ini
├── requirements-dev.txt
├── requirements-install.txt
├── requirements-setup.txt
├── requirements-tests.txt
├── setup.cfg
├── setup.py
├── src
    └── extract_social_media
    │   └── __init__.py
├── tests
    ├── test_package_import.py
    └── test_social.py
└── tox.ini


/.bumpversion.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.4.0
 3 | commit = False
 4 | tag = False
 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>\w+))?
 6 | serialize = 
 7 | 	{major}.{minor}.{patch}-{release}
 8 | 	{major}.{minor}.{patch}
 9 | 
10 | [bumpversion:part:release]
11 | optional_value = placeholder
12 | values = 
13 | 	dev
14 | 	placeholder
15 | 
16 | [bumpversion:file:VERSION]
17 | search = {current_version}
18 | replace = {new_version}
19 | 
20 | [bumpversion:file:src/extract_social_media/__init__.py]
21 | search = __version__ = '{current_version}'
22 | replace = __version__ = '{new_version}'
23 | 
24 | [bumpversion:file:.cookiecutterrc]
25 | search = version: {current_version}
26 | replace = version: {new_version}
27 | 
28 | [bumpversion:file:HISTORY.rst]
29 | search = .. comment:: bumpversion marker
30 | replace = .. comment:: bumpversion marker
31 | 	
32 | 	{new_version} ({now:%Y-%m-%d})
33 | 	------------------
34 | 
35 | 


--------------------------------------------------------------------------------
/.cookiecutterrc:
--------------------------------------------------------------------------------
 1 | # This file exists so you can easily regenerate your project.
 2 | #
 3 | # `cookiepatcher` is a convenient shim around `cookiecutter`
 4 | # for regenerating projects (it will generate a .cookiecutterrc 
 5 | # automatically for any template). To use it:
 6 | #
 7 | #    pip install cookiepatcher
 8 | #    cookiepatcher gh:ionelmc/cookiecutter-pylibrary project-path
 9 | #
10 | # See:
11 | #    https://pypi.python.org/pypi/cookiecutter
12 | #
13 | # Alternatively, you can run:
14 | #
15 | #    cookiecutter --overwrite-if-exists --config-file=project-path/.cookiecutterrc gh:ionelmc/cookiecutter-pylibrary
16 | 
17 | default_context:
18 |     email: johannes@fluquid.com
19 |     full_name: Johannes Ahlmann
20 |     github_username: fluquid
21 |     project_name: Extract Social Media
22 |     project_package: extract_social_media
23 |     project_short_description: Extract social media links from websites
24 |     project_slug: extract-social-media
25 |     pypi_username: fluquid
26 |     use_codecov: y
27 |     use_cython: n
28 |     use_landscape: n
29 |     use_pypi_deployment_with_travis: y
30 |     use_pytest: y
31 |     use_requiresio: y
32 |     version: 0.4.0
33 |     year: 2017
34 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [paths]
 2 | source =
 3 |     src
 4 | 
 5 | [run]
 6 | branch = true
 7 | source =
 8 |     extract_social_media
 9 | parallel = true
10 | 
11 | [report]
12 | show_missing = true
13 | precision = 2
14 | omit = 
15 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | indent_style = space
 7 | indent_size = 4
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 | 
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 | 
17 | [LICENSE]
18 | insert_final_newline = false
19 | 
20 | [Makefile]
21 | indent_style = tab
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | 
56 | # Sphinx documentation
57 | docs/_build/
58 | 
59 | # PyBuilder
60 | target/
61 | 
62 | # rope-vim
63 | .ropeproject
64 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # This file was autogenerated and will be overwritten each time you run travis_pypi_setup.py
 2 | after_failure:
 3 | - more .tox/log/* | cat
 4 | - more .tox/*/log/* | cat
 5 | after_success:
 6 | - '# Codecov requires a single .coverage and will run ''coverage xml'' to
 7 | 
 8 |   # generate the report.
 9 | 
10 |   coverage combine
11 | 
12 |   bash <(curl -s https://codecov.io/bash)
13 | 
14 |   '
15 | before_cache:
16 | - rm -fr $HOME/.cache/pip/log
17 | before_install:
18 | - python --version
19 | - uname -a
20 | - lsb_release -a
21 | cache:
22 |   directories:
23 |   - $HOME/.cache/pip
24 | deploy:
25 |   provider: pypi
26 |   on:
27 |     condition: $TOXENV == py27
28 |     repo: fluquid/extract-social-media
29 |     tags: true
30 |   distributions: sdist bdist_wheel
31 |   user: fluquid
32 |   password:
33 |     secure: !!binary |
34 |       d2wvRkVmOTQvcGxjQ0lhTTlNeUkxR1dRNEN2ckZ3SFptcytRSzMxSHRpQmRwSDVPTFNwMDFXTXlQ
35 |       V21rMVg5MlZXeC9HWm5jQWF6dk9IbEwzczdmSHZ4aDRKZkZSUW1QNVpiUUxLL1U5RzIwSXNVZGRY
36 |       L2swWUhOb080SVJFVFJValZKd2UwSDhEM25IbzRabFJuTzdQWFh2bDdUbWd3cUdGUGFuQjBtRGV1
37 |       cUJWVEp4Z2h0Sjd6cDJBdVl1VExhelVaem5tZ2VPWVZMVDAwbFJYeHZGQ1MvUzRQclZxTDFVZmhR
38 |       S3RVSGh2S1BLRDdjMm9SYThGQXlkT1d6bUVLbjRrcUgzM1FOUTVvYnpNcDRVREcyekMvUE1qQkR5
39 |       UG9MNUJHRzZJRzJoaHVuWG5qWUxzMTlpQ3pNS0FxMUxGNUpRc09vZThoZGJJNUJXUmRpeWdpNHNM
40 |       TVVIelFYYXgwVlVLRDMzc1JBMEUrWGtGNlNPTlkrOWc3MXU5c2s5WVNtNDhzQWpoM2dic2dvZ0d4
41 |       M0dKTGl4a3NudUpHSmFCbmRqdjN3bGxkaEdCd3NjSWx4blBhNFlWKzZkcTRTdGNGR2pjKys2RU56
42 |       dmJZUE5ldXZOTVpuODg0MHRjOTlMd2FMUE9yWWd2ZEFQbzR5ai8xU2k0VWJiRjF2UVNIdmlFTjN5
43 |       UjI4ekdKWmV1L0ZRNW1DdFUxODNBaHg1clBMM1UvMWNxaEJ3ZXBXQkJVZ0pXRlVwS2RQMkd4ZStn
44 |       eno2Y216RHFac0xwV2xCWDlVeFR0NHRSQUtpd2hXcGlLZHZ2NXRRS2F4V3N5Qmc3Qy9KY3hCTld4
45 |       cU4vSWFNbzFwYlZKbXBUNUFpa21iWnVJYlo5dGR6V1czM2dYc1EzUWJydm4zdStyQzdaaEUwRjQ9
46 | env:
47 | - TOXENV=py35
48 | - TOXENV=py34
49 | - TOXENV=py27
50 | - TOXENV=pypy
51 | install:
52 | - pip install -U tox
53 | - pip install -U coverage
54 | - pip install -U twine
55 | - virtualenv --version
56 | - easy_install --version
57 | - pip --version
58 | - tox --version
59 | language: python
60 | matrix:
61 |   allow_failures:
62 |   - env: TOXENV=pypy
63 | notifications:
64 |   email:
65 |     on_failure: always
66 |     on_success: never
67 | python: 3.5
68 | script:
69 | - tox
70 | sudo: false
71 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | Development Lead
 6 | ----------------
 7 | 
 8 | * Johannes Ahlmann <johannes@fluquid.com>
 9 | 
10 | Contributors
11 | ------------
12 | 
13 | None yet. Why not be the first?
14 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | .. highlight:: shell
  2 | 
  3 | ============
  4 | Contributing
  5 | ============
  6 | 
  7 | Contributions are welcome, and they are greatly appreciated! Every
  8 | little bit helps, and credit will always be given.
  9 | 
 10 | You can contribute in many ways:
 11 | 
 12 | Types of Contributions
 13 | ----------------------
 14 | 
 15 | Report Bugs
 16 | ~~~~~~~~~~~
 17 | 
 18 | Report bugs at https://github.com/fluquid/extract-social-media/issues.
 19 | 
 20 | If you are reporting a bug, please include:
 21 | 
 22 | * Your operating system name and version.
 23 | * Any details about your local setup that might be helpful in troubleshooting.
 24 | * Detailed steps to reproduce the bug.
 25 | 
 26 | Fix Bugs
 27 | ~~~~~~~~
 28 | 
 29 | Look through the GitHub issues for bugs. Anything tagged with "bug"
 30 | is open to whoever wants to implement it.
 31 | 
 32 | Implement Features
 33 | ~~~~~~~~~~~~~~~~~~
 34 | 
 35 | Look through the GitHub issues for features. Anything tagged with "feature"
 36 | is open to whoever wants to implement it.
 37 | 
 38 | Write Documentation
 39 | ~~~~~~~~~~~~~~~~~~~
 40 | 
 41 | Extract Social Media could always use more documentation, whether as part of the
 42 | official Extract Social Media docs, in docstrings, or even on the web in blog posts,
 43 | articles, and such.
 44 | 
 45 | Submit Feedback
 46 | ~~~~~~~~~~~~~~~
 47 | 
 48 | The best way to send feedback is to file an issue at https://github.com/fluquid/extract-social-media/issues.
 49 | 
 50 | If you are proposing a feature:
 51 | 
 52 | * Explain in detail how it would work.
 53 | * Keep the scope as narrow as possible, to make it easier to implement.
 54 | * Remember that this is a volunteer-driven project, and that contributions
 55 |   are welcome :)
 56 | 
 57 | Get Started!
 58 | ------------
 59 | 
 60 | Ready to contribute? Here's how to set up `extract-social-media` for local development.
 61 | 
 62 | 1. Fork the `extract-social-media` repo on GitHub.
 63 | 2. Clone your fork locally::
 64 | 
 65 |     $ git clone git@github.com:your_name_here/extract-social-media.git
 66 | 
 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
 68 | 
 69 |     $ mkvirtualenv extract-social-media
 70 |     $ cd extract-social-media/
 71 |     $ python setup.py develop
 72 | 
 73 | 4. Create a branch for local development::
 74 | 
 75 |     $ git checkout -b name-of-your-bugfix-or-feature
 76 | 
 77 |    Now you can make your changes locally.
 78 | 
 79 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
 80 | 
 81 |     $ flake8 src tests
 82 |     $ python setup.py test  # or: py.test
 83 |     $ tox
 84 | 
 85 |    To get flake8 and tox, just pip install them into your virtualenv.
 86 | 
 87 | 6. Commit your changes and push your branch to GitHub::
 88 | 
 89 |     $ git add .
 90 |     $ git commit -m "Your detailed description of your changes."
 91 |     $ git push origin name-of-your-bugfix-or-feature
 92 | 
 93 | 7. Submit a pull request through the GitHub website.
 94 | 
 95 | Pull Request Guidelines
 96 | -----------------------
 97 | 
 98 | Before you submit a pull request, check that it meets these guidelines:
 99 | 
100 | 1. The pull request should include tests.
101 | 2. If the pull request adds functionality, the docs should be updated. Put
102 |    your new functionality into a function with a docstring, and add the
103 |    feature to the list in README.rst.
104 | 3. The pull request should work for Python 2.7, 3.4 and 3.5, and for PyPy. Check
105 |    https://travis-ci.org/fluquid/extract-social-media/pull_requests
106 |    and make sure that the tests pass for all supported Python versions.
107 | 
108 | Tips
109 | ----
110 | 
111 | To run a subset of tests::
112 | 
113 |     $ py.test tests/test_social.py
114 | 


--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | History
 3 | =======
 4 | 
 5 | .. comment:: bumpversion marker
 6 | 
 7 | 0.4.0 (2017-08-18)
 8 | ------------------
 9 | 
10 | * naive blacklisting for photos, videos, search, tweets, etc.
11 | 
12 | 0.3.0 (2017-08-18)
13 | ------------------
14 | 
15 | * fixed exception when "href" is empty or non-string
16 | 
17 | 0.2.0 (2017-06-08)
18 | ------------------
19 | 
20 | * better test coverage
21 | * accepting data-href
22 | 
23 | 0.1.0 (unreleased)
24 | ------------------
25 | 
26 | * First release on PyPI.
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017, Johannes Ahlmann
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 7 | of the Software, and to permit persons to whom the Software is furnished to do
 8 | so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | graft docs
 2 | graft src
 3 | graft tests
 4 | 
 5 | include *.in
 6 | include *.ini
 7 | include *.rst
 8 | include *.txt
 9 | include *.md
10 | 
11 | include LICENSE
12 | include VERSION
13 | include Makefile
14 | 
15 | global-exclude __pycache__ *.py[cod]
16 | global-exclude *.so *.dylib
17 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | .PHONY: help clean-so clean-test clean-pyc clean-build clean-docs clean
  2 | .PHONY: docs docs-build servedocs check check-manifest check-setup check-history lint
  3 | .PHONY: test test-all coverage coverage-html
  4 | .PHONY: compile-reqs install-reqs
  5 | .PHONY: release dist install develop build-inplace
  6 | define BROWSER_PYSCRIPT
  7 | import os, webbrowser, sys
  8 | try:
  9 | 	from urllib import pathname2url
 10 | except ImportError:
 11 | 	from urllib.request import pathname2url
 12 | # Open the path given as the first argument in the default web browser.
 13 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
 14 | endef
 15 | export BROWSER_PYSCRIPT
 16 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
 17 | 
 18 | SPHINX_BUILD := html
 19 | 
 20 | help:
 21 | 	@echo "check - check setup, code style, setup, etc"
 22 | 	@echo "check-manifest - check manifest"
 23 | 	@echo "check-setup - check setup"
 24 | 	@echo "check-history - check history"
 25 | 	@echo "clean - remove all build, test, coverage and Python artifacts"
 26 | 	@echo "clean-build - remove build artifacts"
 27 | 	@echo "clean-docs - remove docs artifacts"
 28 | 	@echo "clean-pyc - remove Python file artifacts"
 29 | 	@echo "clean-test - remove test and coverage artifacts"
 30 | 	@echo "clean-so - remove compiled extensions"
 31 | 	@echo "lint - check style with flake8"
 32 | 	@echo "test - run tests quickly with the default Python"
 33 | 	@echo "test-all - run tests on every Python version with tox"
 34 | 	@echo "coverage - check code coverage quickly with the default Python"
 35 | 	@echo "compile-reqs - compile requirements"
 36 | 	@echo "install-reqs - install requirements"
 37 | 	@echo "docs - generate Sphinx HTML documentation, including API docs"
 38 | 	@echo "release - package and upload a release"
 39 | 	@echo "dist - package"
 40 | 	@echo "develop - install package in develop mode"
 41 | 	@echo "install - install the package to the active Python's site-packages"
 42 | 
 43 | check: check-setup check-manifest check-history lint
 44 | 
 45 | check-setup:
 46 | 	@echo "Checking package metadata (name, description, etc)"
 47 | 	python setup.py check --strict --metadata --restructuredtext
 48 | 
 49 | check-manifest:
 50 | 	@echo "Checking MANIFEST.in"
 51 | 	check-manifest --ignore ".*"
 52 | 
 53 | check-history:
 54 | 	@echo "Checking latest version in HISTORY"
 55 | 	VERSION=`cat VERSION`; grep "^$${VERSION}\b" HISTORY.rst
 56 | 
 57 | clean: clean-build clean-docs clean-pyc clean-test clean-so
 58 | 
 59 | clean-build:
 60 | 	rm -fr build/
 61 | 	rm -fr dist/
 62 | 	rm -fr .eggs/
 63 | 	find . -name '*.egg-info' -exec rm -fr {} +
 64 | 	find . -name '*.egg' -exec rm -f {} +
 65 | 
 66 | clean-docs:
 67 | 	$(MAKE) -C docs clean
 68 | 
 69 | clean-pyc:
 70 | 	find . -name '*.pyc' -exec rm -f {} +
 71 | 	find . -name '*.pyo' -exec rm -f {} +
 72 | 	find . -name '*~' -exec rm -f {} +
 73 | 	find . -name '__pycache__' -exec rm -fr {} +
 74 | 
 75 | clean-test:
 76 | 	rm -fr .tox/
 77 | 	rm -f .coverage
 78 | 	rm -fr htmlcov/
 79 | 
 80 | clean-so:
 81 | 	find . -name '*.so' -exec rm -f {} +
 82 | 
 83 | lint:
 84 | 	flake8 src tests
 85 | 
 86 | build-inplace:
 87 | 	python setup.py build_ext --inplace
 88 | 
 89 | develop: clean
 90 | 	pip install -e .
 91 | 
 92 | test: develop
 93 | 	py.test
 94 | 
 95 | test-all:
 96 | 	tox -v
 97 | 
 98 | coverage: develop
 99 | 	coverage run -m py.test
100 | 	coverage combine
101 | 	coverage report
102 | 
103 | coverage-html: coverage
104 | 	coverage html
105 | 	$(BROWSER) htmlcov/index.html
106 | 
107 | docs-build: develop
108 | 	rm -f docs/extract_social_media.rst
109 | 	rm -f docs/modules.rst
110 | 	sphinx-apidoc -o docs/ src/extract_social_media
111 | 	$(MAKE) -C docs clean
112 | 	$(MAKE) -C docs $(SPHINX_BUILD)
113 | 
114 | docs: docs-build
115 | 	$(BROWSER) docs/_build/$(SPHINX_BUILD)/index.html
116 | 
117 | servedocs: docs
118 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
119 | 
120 | release: clean check dist
121 | 	git branch | grep '* master'
122 | 	# Tagging release.
123 | 	VERSION=`cat VERSION`; git tag -a v$$VERSION
124 | 	git push --follow-tags
125 | 	twine upload dist/*
126 | 
127 | dist: clean
128 | 	python setup.py sdist
129 | 	python setup.py bdist_wheel
130 | 	ls -l dist
131 | 
132 | install: clean
133 | 	pip install .
134 | 
135 | REQUIREMENTS_IN := $(wildcard requirements*.in)
136 | .PHONY: $(REQUIREMENTS_IN)
137 | # Pattern rule: compile each requirements*.in into the matching requirements*.txt.
138 | requirements%.txt: requirements%.in
139 | 	pip-compile -v $< -o $@
140 | # Use the compiled outputs of the *.in files, or any existing *.txt files if there are none.
141 | REQUIREMENTS_TXT := $(REQUIREMENTS_IN:.in=.txt)
142 | ifndef REQUIREMENTS_TXT
143 | REQUIREMENTS_TXT := $(wildcard requirements*.txt)
144 | endif
145 | # Make variables are not exported to the recipe shell, so make expands them in the recipes below.
146 | compile-reqs: $(REQUIREMENTS_TXT)
147 | 	@if [ -z "$(REQUIREMENTS_IN)" ]; then echo "No 'requirements*.in' files. Nothing to do"; fi
148 | # Install every requirements file, stopping at the first failure.
149 | install-reqs:
150 | 	@if [ -z "$(REQUIREMENTS_TXT)" ]; then echo "No 'requirements*.txt' files. Nothing to do"; fi
151 | 	$(foreach req,$(REQUIREMENTS_TXT),pip install -r $(req) &&) true
152 | 


--------------------------------------------------------------------------------
/Notes.md:
--------------------------------------------------------------------------------
 1 | 
 2 | Social Notes:
 3 | - some use outbound link stuff, or bitly
 4 | 
 5 | - "fa fa-facebook", 206
 6 | - "fa fa-twitter", 203
 7 | - "fa fa-instagram", 70
 8 | - "fa fa-youtube", 52
 9 | - "fa fa-google-plus", 98
10 | - "icon-twitter"
11 | - "icon-facebook"
12 | - "icon-linkedin"
13 | - "icon-youtube-large"
14 | 
15 | `facebook.com/plugins/like`
16 | `<script src="https://apis.google.com/js/plusone.js"></script>`
17 | `<script type="text/javascript" src="http://platform.twitter.com/widgets.js"></script>`
18 | "//connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.9";
19 | <div class="fb-like" data-href="https://developers.facebook.com/docs/plugins/"
20 | data-layout="standard" data-action="like" data-size="small"
21 | data-show-faces="true" data-share="true"></div>
22 | <a class="twitter-follow-button" href="https://twitter.com/NASA">
23 | <iframe src="http://www.facebook.com/plugins/like.php?app_id=242508025777888&amp;href=http%3A%2F%2Fwidgetsplus.com&amp;send=false&amp;layout=button_count&amp;width=250&amp;show_faces=true&amp;action=like"></iframe>
24 | 
25 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | ====================
 2 | Extract Social Media
 3 | ====================
 4 | 
 5 | .. image:: https://img.shields.io/pypi/v/extract-social-media.svg
 6 |         :target: https://pypi.python.org/pypi/extract-social-media
 7 | 
 8 | .. image:: https://img.shields.io/pypi/pyversions/extract-social-media.svg
 9 |         :target: https://pypi.python.org/pypi/extract-social-media
10 | 
11 | .. image:: https://img.shields.io/travis/fluquid/extract-social-media.svg
12 |         :target: https://travis-ci.org/fluquid/extract-social-media
13 | 
14 | .. image:: https://codecov.io/github/fluquid/extract-social-media/coverage.svg?branch=master
15 |     :alt: Coverage Status
16 |     :target: https://codecov.io/github/fluquid/extract-social-media
17 | 
18 | .. image:: https://requires.io/github/fluquid/extract-social-media/requirements.svg?branch=master
19 |     :alt: Requirements Status
20 |     :target: https://requires.io/github/fluquid/extract-social-media/requirements/?branch=master
21 | 
22 | Extract social media links from websites.
23 | 
24 | Many websites reference their facebook, twitter, linkedin, youtube accounts
25 | and these can be invaluable to gather 360 degree information about a company.
26 | 
27 | This library lets you extract links or handles for the most commonly used
28 | international social media networks.
29 | 
30 | * Free software: MIT license
31 | * Python versions: 2.7, 3.4+
32 | 
33 | Features
34 | --------
35 | 
36 | * Extract social media links/handles from html content
37 | * Attempts to extract links/handles also from widgets, scripts, etc.
38 | * Supports most widely used social networks
39 | 
40 |   * facebook
41 |   * linkedin
42 |   * twitter
43 |   * youtube
44 |   * github
45 |   * google plus
46 |   * pinterest
47 |   * instagram
48 |   * snapchat
49 |   * flipboard
50 |   * flickr
51 |   * weibo
52 |   * periscope
53 |   * telegram
54 |   * soundcloud
55 |   * feedburner
56 |   * vimeo
57 |   * slideshare
58 |   * vkontakte
59 |   * xing
60 | 
61 | Quickstart
62 | ----------
63 | 
64 | .. code:: python
65 | 
66 |    import requests
67 |    from html_to_etree import parse_html_bytes
68 |    from extract_social_media import find_links_tree
69 |    res = requests.get('https://techcrunch.com/contact/')
70 |    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
71 |    set(find_links_tree(tree))
72 | 
73 |    {'http://pinterest.com/techcrunch/',
74 |     'http://www.youtube.com/user/techcrunch',
75 |     'http://www.linkedin.com/company/techcrunch',
76 |     'https://www.facebook.com/techcrunch',
77 |     'https://flipboard.com/@techcrunch',
78 |     'http://instagram.com/techcrunch',
79 |     'https://plus.google.com/+TechCrunch',
80 |     'https://instagram.com/techcrunch',
81 |     'https://twitter.com/techcrunch'}
82 | 
83 | Caveats
84 | -------
85 | 
86 | * currently finds all social media links on a page
87 | 
88 |   * need to look into finding most relevant links based on link location,
89 |     link context, company name, etc.
90 | 
91 | Credits
92 | -------
93 | 
94 | This package was created with Cookiecutter_ and the `fluquid/cookiecutter-pypackage`_ project template.
95 | 
96 | .. _Cookiecutter: https://github.com/audreyr/cookiecutter
97 | .. _`fluquid/cookiecutter-pypackage`: https://github.com/fluquid/cookiecutter-pypackage
98 | 


--------------------------------------------------------------------------------
/TODO.rst:
--------------------------------------------------------------------------------
1 | TODO
2 | ====
3 | 
4 | * Make a TODO
5 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.4.0
2 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | # Every target below is a command, not a file, so all of them are declared phony.
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext xml pseudoxml
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 
 49 | clean:
 50 | 	rm -rf $(BUILDDIR)/*
 51 | 
 52 | html:
 53 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 54 | 	@echo
 55 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 56 | 
 57 | dirhtml:
 58 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 59 | 	@echo
 60 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 61 | 
 62 | singlehtml:
 63 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 64 | 	@echo
 65 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 66 | 
 67 | pickle:
 68 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 69 | 	@echo
 70 | 	@echo "Build finished; now you can process the pickle files."
 71 | 
 72 | json:
 73 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 74 | 	@echo
 75 | 	@echo "Build finished; now you can process the JSON files."
 76 | 
 77 | htmlhelp:
 78 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 79 | 	@echo
 80 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 81 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 82 | 
 83 | qthelp:
 84 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 85 | 	@echo
 86 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 87 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 88 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/extract-social-media.qhcp"
 89 | 	@echo "To view the help file:"
 90 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/extract-social-media.qhc"
 91 | 
 92 | devhelp:
 93 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 94 | 	@echo
 95 | 	@echo "Build finished."
 96 | 	@echo "To view the help file:"
 97 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/extract-social-media"
 98 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/extract-social-media"
 99 | 	@echo "# devhelp"
100 | 
101 | epub:
102 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | 	@echo
104 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 | 
106 | latex:
107 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | 	@echo
109 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | 	      "(use \`make latexpdf' here to do that automatically)."
112 | 
113 | latexpdf:
114 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | 	@echo "Running LaTeX files through pdflatex..."
116 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 | 
119 | latexpdfja:
120 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
122 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 | 
125 | text:
126 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | 	@echo
128 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
129 | 
130 | man:
131 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | 	@echo
133 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 | 
135 | texinfo:
136 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | 	@echo
138 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
140 | 	      "(use \`make info' here to do that automatically)."
141 | 
# Build the Texinfo sources, then run them through makeinfo.
# The sub-make is invoked via $(MAKE) (not a literal `make`) so that the
# same make program and its command-line flags are passed on to it.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 | 
148 | gettext:
149 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | 	@echo
151 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 | 
153 | changes:
154 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | 	@echo
156 | 	@echo "The overview file is in $(BUILDDIR)/changes."
157 | 
158 | linkcheck:
159 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | 	@echo
161 | 	@echo "Link check complete; look for any errors in the above output " \
162 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
163 | 
164 | doctest:
165 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | 	@echo "Testing of doctests in the sources finished, look at the " \
167 | 	      "results in $(BUILDDIR)/doctest/output.txt."
168 | 
169 | xml:
170 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | 	@echo
172 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 | 
174 | pseudoxml:
175 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | 	@echo
177 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 | 


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # extract-social-media documentation build configuration file, created by
  5 | # sphinx-quickstart on Tue Jul  9 22:26:36 2013.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | import os
 17 | import re
 18 | 
 19 | # If extensions (or modules to document with autodoc) are in another
 20 | # directory, add these directories to sys.path here. If the directory is
 21 | # relative to the documentation root, use os.path.abspath to make it
 22 | # absolute, like shown here.
 23 | #sys.path.insert(0, os.path.abspath('.'))
 24 | 
 25 | # Get the project root dir, which is the parent dir of this
 26 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 27 | 
 28 | # -- General configuration ---------------------------------------------
 29 | 
 30 | # If your documentation needs a minimal Sphinx version, state it here.
 31 | #needs_sphinx = '1.0'
 32 | 
 33 | # Add any Sphinx extension module names here, as strings. They can be
 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 35 | extensions = [
 36 |     'sphinx.ext.autodoc',
 37 |     'sphinx.ext.napoleon',
 38 |     'sphinx.ext.viewcode',
 39 | ]
 40 | 
 41 | # Add any paths that contain templates here, relative to this directory.
 42 | templates_path = ['_templates']
 43 | 
 44 | # The suffix of source filenames.
 45 | source_suffix = '.rst'
 46 | 
 47 | # The encoding of source files.
 48 | #source_encoding = 'utf-8-sig'
 49 | 
 50 | # The master toctree document.
 51 | master_doc = 'index'
 52 | 
 53 | # General information about the project.
 54 | project = u'Extract Social Media'
 55 | copyright = u'2017, Johannes Ahlmann'
 56 | 
 57 | # The version info for the project you're documenting, acts as replacement
 58 | # for |version| and |release|, also used in various other places throughout
 59 | # the built documents.
 60 | #
 61 | # The full version, including alpha/beta/rc tags.
 62 | release = open(os.path.join(project_root, 'VERSION')).read().strip()
 63 | # The short X.Y version.
 64 | version = re.findall(r'\d+\.\d+\.\d+', release)[0]
 65 | 
 66 | # The language for content autogenerated by Sphinx. Refer to documentation
 67 | # for a list of supported languages.
 68 | #language = None
 69 | 
 70 | # There are two options for replacing |today|: either, you set today to
 71 | # some non-false value, then it is used:
 72 | #today = ''
 73 | # Else, today_fmt is used as the format for a strftime call.
 74 | #today_fmt = '%B %d, %Y'
 75 | 
 76 | # List of patterns, relative to source directory, that match files and
 77 | # directories to ignore when looking for source files.
 78 | exclude_patterns = ['_build']
 79 | 
 80 | # The reST default role (used for this markup: `text`) to use for all
 81 | # documents.
 82 | #default_role = None
 83 | 
 84 | # If true, '()' will be appended to :func: etc. cross-reference text.
 85 | #add_function_parentheses = True
 86 | 
 87 | # If true, the current module name will be prepended to all description
 88 | # unit titles (such as .. function::).
 89 | #add_module_names = True
 90 | 
 91 | # If true, sectionauthor and moduleauthor directives will be shown in the
 92 | # output. They are ignored by default.
 93 | #show_authors = False
 94 | 
 95 | # The name of the Pygments (syntax highlighting) style to use.
 96 | pygments_style = 'sphinx'
 97 | 
 98 | # A list of ignored prefixes for module index sorting.
 99 | #modindex_common_prefix = []
100 | 
101 | # If true, keep warnings as "system message" paragraphs in the built
102 | # documents.
103 | #keep_warnings = False
104 | 
105 | 
106 | # -- Options for HTML output -------------------------------------------
107 | 
108 | # The theme to use for HTML and HTML Help pages.  See the documentation for
109 | # a list of builtin themes.
110 | html_theme = 'default'
111 | 
112 | # Theme options are theme-specific and customize the look and feel of a
113 | # theme further.  For a list of options available for each theme, see the
114 | # documentation.
115 | #html_theme_options = {}
116 | 
117 | # Add any paths that contain custom themes here, relative to this directory.
118 | #html_theme_path = []
119 | 
120 | # The name for this set of Sphinx documents.  If None, it defaults to
121 | # "<project> v<release> documentation".
122 | #html_title = None
123 | 
124 | # A shorter title for the navigation bar.  Default is the same as
125 | # html_title.
126 | #html_short_title = None
127 | 
128 | # The name of an image file (relative to this directory) to place at the
129 | # top of the sidebar.
130 | #html_logo = None
131 | 
132 | # The name of an image file (within the static path) to use as favicon
133 | # of the docs.  This file should be a Windows icon file (.ico) being
134 | # 16x16 or 32x32 pixels large.
135 | #html_favicon = None
136 | 
137 | # Add any paths that contain custom static files (such as style sheets)
138 | # here, relative to this directory. They are copied after the builtin
139 | # static files, so a file named "default.css" will overwrite the builtin
140 | # "default.css".
141 | html_static_path = ['_static']
142 | 
143 | # If not '', a 'Last updated on:' timestamp is inserted at every page
144 | # bottom, using the given strftime format.
145 | #html_last_updated_fmt = '%b %d, %Y'
146 | 
147 | # If true, SmartyPants will be used to convert quotes and dashes to
148 | # typographically correct entities.
149 | #html_use_smartypants = True
150 | 
151 | # Custom sidebar templates, maps document names to template names.
152 | #html_sidebars = {}
153 | 
154 | # Additional templates that should be rendered to pages, maps page names
155 | # to template names.
156 | #html_additional_pages = {}
157 | 
158 | # If false, no module index is generated.
159 | #html_domain_indices = True
160 | 
161 | # If false, no index is generated.
162 | #html_use_index = True
163 | 
164 | # If true, the index is split into individual pages for each letter.
165 | #html_split_index = False
166 | 
167 | # If true, links to the reST sources are added to the pages.
168 | #html_show_sourcelink = True
169 | 
170 | # If true, "Created using Sphinx" is shown in the HTML footer.
171 | # Default is True.
172 | #html_show_sphinx = True
173 | 
174 | # If true, "(C) Copyright ..." is shown in the HTML footer.
175 | # Default is True.
176 | #html_show_copyright = True
177 | 
178 | # If true, an OpenSearch description file will be output, and all pages
179 | # will contain a <link> tag referring to it.  The value of this option
180 | # must be the base URL from which the finished HTML is served.
181 | #html_use_opensearch = ''
182 | 
183 | # This is the file name suffix for HTML files (e.g. ".xhtml").
184 | #html_file_suffix = None
185 | 
186 | # Output file base name for HTML help builder.
187 | htmlhelp_basename = 'extract_social_mediadoc'
188 | 
189 | 
190 | # -- Options for LaTeX output ------------------------------------------
191 | 
192 | latex_elements = {
193 |     # The paper size ('letterpaper' or 'a4paper').
194 |     #'papersize': 'letterpaper',
195 | 
196 |     # The font size ('10pt', '11pt' or '12pt').
197 |     #'pointsize': '10pt',
198 | 
199 |     # Additional stuff for the LaTeX preamble.
200 |     #'preamble': '',
201 | }
202 | 
203 | # Grouping the document tree into LaTeX files. List of tuples
204 | # (source start file, target name, title, author, documentclass
205 | # [howto/manual]).
206 | latex_documents = [
207 |     ('index', 'extract_social_media.tex',
208 |      u'Extract Social Media Documentation',
209 |      u'Johannes Ahlmann', 'manual'),
210 | ]
211 | 
212 | # The name of an image file (relative to this directory) to place at
213 | # the top of the title page.
214 | #latex_logo = None
215 | 
216 | # For "manual" documents, if this is true, then toplevel headings
217 | # are parts, not chapters.
218 | #latex_use_parts = False
219 | 
220 | # If true, show page references after internal links.
221 | #latex_show_pagerefs = False
222 | 
223 | # If true, show URL addresses after external links.
224 | #latex_show_urls = False
225 | 
226 | # Documents to append as an appendix to all manuals.
227 | #latex_appendices = []
228 | 
229 | # If false, no module index is generated.
230 | #latex_domain_indices = True
231 | 
232 | 
233 | # -- Options for manual page output ------------------------------------
234 | 
235 | # One entry per manual page. List of tuples
236 | # (source start file, name, description, authors, manual section).
237 | man_pages = [
238 |     ('index', 'extract_social_media',
239 |      u'Extract Social Media Documentation',
240 |      [u'Johannes Ahlmann'], 1)
241 | ]
242 | 
243 | # If true, show URL addresses after external links.
244 | #man_show_urls = False
245 | 
246 | 
247 | # -- Options for Texinfo output ----------------------------------------
248 | 
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    ('index', 'extract_social_media',
     u'Extract Social Media Documentation',
     u'Johannes Ahlmann',
     'extract-social-media',
     # Real project summary instead of the sphinx-quickstart placeholder.
     'Extract social media links from websites.',
     'Miscellaneous'),
]
260 | 
261 | # Documents to append as an appendix to all manuals.
262 | #texinfo_appendices = []
263 | 
264 | # If false, no module index is generated.
265 | #texinfo_domain_indices = True
266 | 
267 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
268 | #texinfo_show_urls = 'footnote'
269 | 
270 | # If true, do not generate a @detailmenu in the "Top" node's menu.
271 | #texinfo_no_detailmenu = False
272 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 


--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. extract-social-media documentation master file, created by
 2 |    sphinx-quickstart on Tue Jul  9 22:26:36 2013.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to Extract Social Media's documentation!
 7 | ================================================
 8 | 
 9 | Contents:
10 | 
11 | .. toctree::
12 |    :maxdepth: 2
13 | 
14 |    readme
15 |    installation
16 |    history
17 | 
18 | Indices and tables
19 | ==================
20 | 
21 | * :ref:`genindex`
22 | * :ref:`modindex`
23 | * :ref:`search`
24 | 
25 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | .. highlight:: shell
 2 | 
 3 | ============
 4 | Installation
 5 | ============
 6 | 
 7 | 
 8 | Stable release
 9 | --------------
10 | 
11 | To install Extract Social Media, run this command in your terminal:
12 | 
13 | .. code-block:: console
14 | 
15 |     $ pip install extract-social-media
16 | 
17 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
18 | you through the process.
19 | 
20 | .. _pip: https://pip.pypa.io
21 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
22 | 
23 | 
24 | From sources
25 | ------------
26 | 
27 | The sources for Extract Social Media can be downloaded from the `Github repo`_.
28 | 
29 | You can either clone the public repository:
30 | 
31 | .. code-block:: console
32 | 
33 |     $ git clone https://github.com/fluquid/extract-social-media
34 | 
35 | Or download the `tarball`_:
36 | 
37 | .. code-block:: console
38 | 
39 |     $ curl  -OL https://github.com/fluquid/extract-social-media/tarball/master
40 | 
41 | Once you have a copy of the source, you can install it with:
42 | 
43 | .. code-block:: console
44 | 
45 |     $ pip install -e .
46 | 
47 | 
48 | .. _Github repo: https://github.com/fluquid/extract-social-media
49 | .. _tarball: https://github.com/fluquid/extract-social-media/tarball/master
50 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  xml        to make Docutils-native XML files
 37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
 38 | 	echo.  linkcheck  to check all external links for integrity
 39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 40 | 	goto end
 41 | )
 42 | 
 43 | if "%1" == "clean" (
 44 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 45 | 	del /q /s %BUILDDIR%\*
 46 | 	goto end
 47 | )
 48 | 
 49 | 
 50 | %SPHINXBUILD% 2> nul
 51 | if errorlevel 9009 (
 52 | 	echo.
 53 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 54 | 	echo.installed, then set the SPHINXBUILD environment variable to point
 55 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 56 | 	echo.may add the Sphinx directory to PATH.
 57 | 	echo.
 58 | 	echo.If you don't have Sphinx installed, grab it from
 59 | 	echo.http://sphinx-doc.org/
 60 | 	exit /b 1
 61 | )
 62 | 
 63 | if "%1" == "html" (
 64 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 65 | 	if errorlevel 1 exit /b 1
 66 | 	echo.
 67 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 68 | 	goto end
 69 | )
 70 | 
 71 | if "%1" == "dirhtml" (
 72 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 73 | 	if errorlevel 1 exit /b 1
 74 | 	echo.
 75 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 76 | 	goto end
 77 | )
 78 | 
 79 | if "%1" == "singlehtml" (
 80 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 81 | 	if errorlevel 1 exit /b 1
 82 | 	echo.
 83 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 84 | 	goto end
 85 | )
 86 | 
 87 | if "%1" == "pickle" (
 88 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 89 | 	if errorlevel 1 exit /b 1
 90 | 	echo.
 91 | 	echo.Build finished; now you can process the pickle files.
 92 | 	goto end
 93 | )
 94 | 
 95 | if "%1" == "json" (
 96 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 97 | 	if errorlevel 1 exit /b 1
 98 | 	echo.
 99 | 	echo.Build finished; now you can process the JSON files.
100 | 	goto end
101 | )
102 | 
103 | if "%1" == "htmlhelp" (
104 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
105 | 	if errorlevel 1 exit /b 1
106 | 	echo.
107 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
108 | .hhp project file in %BUILDDIR%/htmlhelp.
109 | 	goto end
110 | )
111 | 
REM qthelp: build the Qt help project and print the follow-up commands.
REM The collection file opened by "assistant" has the .qhc extension.
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\extract-social-media.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\extract-social-media.qhc
	goto end
)
123 | 
124 | if "%1" == "devhelp" (
125 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "epub" (
133 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "latex" (
141 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
145 | 	goto end
146 | )
147 | 
REM latexpdf: build the LaTeX sources, then run "make all-pdf" inside them.
REM pushd/popd restore the starting directory; the previous
REM "cd %BUILDDIR%/.." was resolved from inside the latex directory and so
REM did not lead back to where the script started.
if "%1" == "latexpdf" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	pushd %BUILDDIR%\latex
	make all-pdf
	popd
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)
157 | 
REM latexpdfja: build the LaTeX sources, then run "make all-pdf-ja" inside
REM them. pushd/popd restore the starting directory; the previous
REM "cd %BUILDDIR%/.." was resolved from inside the latex directory and so
REM did not lead back to where the script started.
if "%1" == "latexpdfja" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	pushd %BUILDDIR%\latex
	make all-pdf-ja
	popd
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)
167 | 
168 | if "%1" == "text" (
169 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
170 | 	if errorlevel 1 exit /b 1
171 | 	echo.
172 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
173 | 	goto end
174 | )
175 | 
176 | if "%1" == "man" (
177 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
178 | 	if errorlevel 1 exit /b 1
179 | 	echo.
180 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
181 | 	goto end
182 | )
183 | 
184 | if "%1" == "texinfo" (
185 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
186 | 	if errorlevel 1 exit /b 1
187 | 	echo.
188 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | 	goto end
190 | )
191 | 
192 | if "%1" == "gettext" (
193 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
194 | 	if errorlevel 1 exit /b 1
195 | 	echo.
196 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
197 | 	goto end
198 | )
199 | 
200 | if "%1" == "changes" (
201 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
202 | 	if errorlevel 1 exit /b 1
203 | 	echo.
204 | 	echo.The overview file is in %BUILDDIR%/changes.
205 | 	goto end
206 | )
207 | 
208 | if "%1" == "linkcheck" (
209 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
210 | 	if errorlevel 1 exit /b 1
211 | 	echo.
212 | 	echo.Link check complete; look for any errors in the above output ^
213 | or in %BUILDDIR%/linkcheck/output.txt.
214 | 	goto end
215 | )
216 | 
217 | if "%1" == "doctest" (
218 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
219 | 	if errorlevel 1 exit /b 1
220 | 	echo.
221 | 	echo.Testing of doctests in the sources finished, look at the ^
222 | results in %BUILDDIR%/doctest/output.txt.
223 | 	goto end
224 | )
225 | 
226 | if "%1" == "xml" (
227 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
228 | 	if errorlevel 1 exit /b 1
229 | 	echo.
230 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | 	goto end
232 | )
233 | 
234 | if "%1" == "pseudoxml" (
235 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
236 | 	if errorlevel 1 exit /b 1
237 | 	echo.
238 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
239 | 	goto end
240 | )
241 | 
242 | :end
243 | 


--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Readthedocs workaround.
2 | # This should be installed using pip from the root directory.
3 | -e .
4 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
 1 | [pytest]
 2 | norecursedirs =
 3 |     .*
 4 |     dist
 5 |     build
 6 | python_files =
 7 |     test_*.py
 8 |     *_test.py
 9 |     tests.py
10 | ignore =
11 |     setup.py
12 | addopts =
13 |     -rxEfsw -v
14 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | # These packages are required only for development and release management.
 2 | Sphinx
 3 | bumpversion
 4 | check-manifest
 5 | pip-tools
 6 | twine
 7 | watchdog
 8 | wheel
 9 | pyyaml
10 | cryptography
11 | docutils
12 | coverage
13 | pytest
14 | 


--------------------------------------------------------------------------------
/requirements-install.txt:
--------------------------------------------------------------------------------
1 | # These packages are required to install and run our package.
2 | lxml
3 | requests
4 | html_to_etree
5 | six
6 | 


--------------------------------------------------------------------------------
/requirements-setup.txt:
--------------------------------------------------------------------------------
1 | # These packages are required before running setup (i.e. build commands need
2 | # to import these packages).
3 | 


--------------------------------------------------------------------------------
/requirements-tests.txt:
--------------------------------------------------------------------------------
1 | # These packages are required to run all the tests and perform checks.
2 | coverage
3 | flake8
4 | pytest
5 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
# Options for the bdist_wheel command ([wheel] is the legacy section name).
[bdist_wheel]
universal = 1
3 | 
4 | [flake8]
5 | exclude = docs, tests
6 | max-line-length = 120
7 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import io
 4 | from pkgutil import walk_packages
 5 | from setuptools import setup
 6 | 
 7 | 
 8 | def find_packages(path):
 9 |     # This method returns packages and subpackages as well.
10 |     return [name for _, name, is_pkg in walk_packages([path]) if is_pkg]
11 | 
12 | 
def read_file(filename):
    """Return the text of ``filename`` with surrounding whitespace stripped.

    The file is decoded as UTF-8 explicitly; without an ``encoding``
    argument ``io.open`` falls back to a locale-dependent default, so any
    non-ASCII character could break the read on some machines.

    Args:
        filename: Path of the text file to read.

    Returns:
        The decoded file content, stripped of leading/trailing whitespace.
    """
    with io.open(filename, encoding='utf-8') as fp:
        return fp.read().strip()
16 | 
17 | 
def read_rst(filename):
    """Return the reST text of ``filename`` without ``.. comment::`` lines.

    Lines that start with the ``.. comment::`` directive are dropped because
    PyPI does not support that directive; every other line is kept as is.
    """
    kept_lines = []
    for line in io.StringIO(read_file(filename)):
        if line.startswith('.. comment::'):
            continue
        kept_lines.append(line)
    return ''.join(kept_lines)
23 | 
24 | 
def read_requirements(filename):
    """Return the requirement specifiers listed in ``filename``.

    Each line is stripped first; blank lines and comment lines (starting
    with ``#``, even when indented) are skipped so that no empty string
    ends up among the requirements.

    Args:
        filename: Path of a requirements file with one specifier per line.

    Returns:
        List of non-empty requirement strings in file order.
    """
    requirements = []
    for line in read_file(filename).splitlines():
        requirement = line.strip()
        if requirement and not requirement.startswith('#'):
            requirements.append(requirement)
    return requirements
28 | 
29 | 
# Keyword arguments handed to setuptools.setup(). Version, long description
# and requirement lists are read from files in the current directory.
setup_attrs = {
    'name': 'extract-social-media',
    'version': read_file('VERSION'),
    'description': 'Extract social media links from websites',
    # README followed by the changelog, separated by a blank line.
    'long_description': (
        read_rst('README.rst') + '\n\n' + read_rst('HISTORY.rst')),
    'author': 'Johannes Ahlmann',
    'author_email': 'johannes@fluquid.com',
    'url': 'https://github.com/fluquid/extract-social-media',
    # Packages are looked up under src/.
    'packages': list(find_packages('src')),
    'package_dir': {'': 'src'},
    'setup_requires': read_requirements('requirements-setup.txt'),
    'install_requires': read_requirements('requirements-install.txt'),
    'include_package_data': True,
    'license': 'MIT',
    'keywords': 'extract-social-media',
    'classifiers': [
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
    ],
}


# Only call setup() when executed as a script, not when imported.
if __name__ == '__main__':
    setup(**setup_attrs)
61 | 


--------------------------------------------------------------------------------
/src/extract_social_media/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | import re
  5 | 
  6 | import six
  7 | 
  8 | __author__ = 'Johannes Ahlmann'
  9 | __email__ = 'johannes@fluquid.com'
 10 | __version__ = '0.4.0'
 11 | 
 12 | 
 13 | # FIXME: a lot wrong with the below
 14 | # - too permissive
 15 | # - likely too slow
 16 | PREFIX = r'https?://(?:www\.)?'
 17 | SITES = ['twitter.com/', 'youtube.com/',
 18 |          '(?:[a-z]{2}\.)?linkedin.com/(?:company/|in/|pub/)',
 19 |          'github.com/', '(?:[a-z]{2}-[a-z]{2}\.)?facebook.com/', 'fb.co',
 20 |          'plus\.google.com/', 'pinterest.com/', 'instagram.com/',
 21 |          'snapchat.com/', 'flipboard.com/', 'flickr.com',
 22 |          'google.com/+', 'weibo.com/', 'periscope.tv/',
 23 |          'telegram.me/', 'soundcloud.com', 'feeds.feedburner.com',
 24 |          'vimeo.com', 'slideshare.net', 'vkontakte.ru']
 25 | BETWEEN = ['user/', 'add/', 'pages/', '#!/', 'photos/',
 26 |            'u/0/']
 27 | ACCOUNT = r'[\w\+_@\.\-/%]+'
 28 | PATTERN = (
 29 |     r'%s(?:%s)(?:%s)?%s' %
 30 |     (PREFIX, '|'.join(SITES), '|'.join(BETWEEN), ACCOUNT))
 31 | SOCIAL_REX = re.compile(PATTERN, flags=re.I)
 32 | BLACKLIST_RE = re.compile(
 33 |     """
 34 |     sharer.php|
 35 |     /photos/.*\d{6,}|
 36 |     google.com/(?:ads/|
 37 |                   analytics$|
 38 |                   chrome$|
 39 |                   intl/|
 40 |                   maps/|
 41 |                   policies/|
 42 |                   search$
 43 |                )|
 44 |     instagram.com/p/|
 45 |     /share\?|
 46 |     /status/|
 47 |     /hashtag/|
 48 |     home\?status=|
 49 |     twitter.com/intent/|
 50 |     twitter.com/share|
 51 |     search\?|
 52 |     /search/|
 53 |     pinterest.com/pin/create/|
 54 |     vimeo.com/\d+$|
 55 |     /watch\?""",
 56 |     flags=re.VERBOSE)
 57 | 
 58 | 
 59 | def _from_url(url):  # pragma: no cover
 60 |     """ get list of social media links/handles given a url """
 61 |     import requests
 62 |     from html_to_etree import parse_html_bytes
 63 |     res = requests.get(url)
 64 |     tree = parse_html_bytes(res.content, res.headers.get('content-type'))
 65 | 
 66 |     return set(find_links_tree(tree))
 67 | 
 68 | 
 69 | def matches_string(string):
 70 |     """ check if a given string matches known social media url patterns """
 71 |     return SOCIAL_REX.match(string) and not BLACKLIST_RE.search(string)
 72 | 
 73 | 
 74 | def find_links_tree(tree):
 75 |     """
 76 |     find social media links/handles given an lxml etree.
 77 | 
 78 |     TODO:
 79 |     - `<fb:like href="http://www.facebook.com/elDiarioEs"`
 80 |     - `<g:plusone href="http://widgetsplus.com/"></g:plusone>`
 81 |     - <a class="reference external" href="https://twitter.com/intent/follow?screen_name=NASA">
 82 |     """
 83 |     for link in tree.xpath('//*[@href or @data-href]'):
 84 |         href = link.get('href') or link.get('data-href')
 85 |         if (href and
 86 |                 isinstance(href, (six.string_types, six.text_type)) and
 87 |                 matches_string(href)):
 88 |             yield href
 89 | 
 90 |     for script in tree.xpath('//script[not(@src)]/text()'):
 91 |         for match in SOCIAL_REX.findall(script):
 92 |             if not BLACKLIST_RE.search(match):
 93 |                 yield match
 94 | 
 95 |     for script in tree.xpath('//meta[contains(@name, "twitter:")]'):
 96 |         name = script.get('name')
 97 |         if name in ('twitter:site', 'twitter:creator'):
 98 |             # FIXME: track fact that source is twitter
 99 |             yield script.get('content')
100 | 


--------------------------------------------------------------------------------
/tests/test_package_import.py:
--------------------------------------------------------------------------------
1 | import extract_social_media
2 | 
3 | 
def test_package_metadata():
    """The package exposes non-empty author, email and version metadata."""
    for attribute in ('__author__', '__email__', '__version__'):
        assert getattr(extract_social_media, attribute)
8 | 


--------------------------------------------------------------------------------
/tests/test_social.py:
--------------------------------------------------------------------------------
  1 | from extract_social_media import matches_string, find_links_tree
  2 | from lxml import etree
  3 | 
  4 | """
  5 | TODO POS:
  6 | http://mp.weixin.qq.com/s?__biz=MzA5NjM5MjU2OA==&mid=249883494&idx=1&sn=fe698c9e27082fe5520777245a752d9e&3rd=MzA3MDU4NTYzMw==&scene=6#rd
  7 | 
  8 | TODO NEG:
  9 | https://www.youtube.com/yt/copyright/
 10 | """
 11 | 
 12 | LINK_SAMPLES = """
 13 | http://www.flickr.com/photos/lenovophotolibrary
 14 | http://www.weibo.com/elletw?sudaref=data.elle.com.tw
 15 | http://www.weibo.com/parentingcw
 16 | http://facebook.com/tinybuddha
 17 | http://www.facebook.com/3Ireland
 18 | http://www.facebook.com/LenovoUKandIreland
 19 | http://www.facebook.com/daft.ie
 20 | http://www.facebook.com/pages/Marc-and-Angel-Hack-Life-Practical-Tips-for-Productive-Living/60187856377
 21 | http://www.facebook.com/pages/SurveyMonkey/65225997627
 22 | http://www.facebook.com/pages/Vodafone-Ireland/39948747919?utm_campaign=vfcontactusfb&utm_medium=facebook&utm_source=onlineteamjd&utm_content=vfcontactusfb
 23 | http://www.facebook.com/positivelypositive/
 24 | http://www.facebook.com/thejournal.ie
 25 | https://fr-fr.facebook.com/Caisse.Epargne
 26 | https://fr-fr.facebook.com/SFR
 27 | https://flipboard.com/@techcrunch
 28 | http://instagram.com/lifehackorg?ref=footer-browse-instagram
 29 | http://instagram.com/newegg/
 30 | https://instagram.com/okcupid
 31 | https://instagram.com/pospositive/
 32 | https://instagram.com/snapdeal/?hl=en
 33 | http://pinterest.com/bange16/marc-and-angel/
 34 | http://pinterest.com/lifehack/?ref=footer-browse-pinterest
 35 | http://pinterest.com/mindbodygreen
 36 | http://pinterest.com/thinksimplenow/
 37 | http://www.pinterest.com/instructables
 38 | http://www.pinterest.com/thepositivepin/
 39 | https://pinterest.com/EmilySchuman
 40 | http://plus.google.com/107307393263977088342/about
 41 | https://plus.google.com/+Coursera
 42 | https://plus.google.com/+Dropbox/posts
 43 | https://plus.google.com/+P%C3%BAblico/posts
 44 | https://plus.google.com/+duolingo
 45 | https://plus.google.com/+snapdeal/posts
 46 | https://plus.google.com/100371967013117528205
 47 | https://plus.google.com/111984034088692092819?prsrc=3
 48 | https://plus.google.com/116623388763634190489
 49 | https://plus.google.com/117330593038325285345/posts
 50 | https://plus.google.com/u/0/+eventbrite/
 51 | https://plus.google.com/u/1/115964001953967461416?pageId=114804279025961350651&authuser=1
 52 | https://telegram.me/publico_es
 53 | http://twitter.com/#!/SurveyMonkey
 54 | http://twitter.com/ThreeIreland
 55 | http://twitter.com/VodafoneIreland?utm_campaign=vfcontactustw&utm_medium=twitter&utm_source=onlinejd&utm_content=vfcontactustw
 56 | http://twitter.com/thejournal_ie
 57 | http://www.twitter.com/MrjWells
 58 | http://www.twitter.com/tinybuddha
 59 | https://twitter.com/AskAIB
 60 | https://twitter.com/Independent_ie
 61 | https://twitter.com/XFINITY
 62 | https://twitter.com/irishmirror
 63 | https://twitter.com/lenovo_uki
 64 | https://twitter.com/lifehackorg/?ref=footer-browse-twitter
 65 | https://twitter.com/mindbodygreen
 66 | https://twitter.com/rte
 67 | https://www.twitter.com/Eventbrite
 68 | https://www.facebook.com/Atlassian
 69 | https://www.facebook.com/DoneDealIreland
 70 | https://www.facebook.com/ExploreRTE/
 71 | https://www.facebook.com/Independent.ie
 72 | https://www.facebook.com/TED
 73 | https://www.facebook.com/cupcakesandcashmere
 74 | https://www.facebook.com/eir
 75 | https://www.facebook.com/lifehackorg/?ref=footer-browse-facebook
 76 | https://www.facebook.com/monepositiveblog/
 77 | https://www.google.com/+Thechangeblog
 78 | https://www.instagram.com/imdb/
 79 | https://www.instagram.com/mindvalley
 80 | http://www.linkedin.com/company/362798
 81 | http://www.linkedin.com/company/aib/products/
 82 | http://www.linkedin.com/company/investopedia-ulc
 83 | http://www.linkedin.com/company/techcrunch
 84 | http://www.linkedin.com/in/mrjwells
 85 | https://www.linkedin.com/pub/whois-api/88/573/6b2
 86 | https://www.periscope.tv/le_Parisien
 87 | https://www.pinterest.com/snapdeal/
 88 | https://www.pinterest.com/tednews
 89 | https://www.pinterest.com/tinybuddha/pins/
 90 | https://www.snapchat.com/add/positivepresent
 91 | http://www.youtube.com/ThreeIreland
 92 | http://www.youtube.com/aib
 93 | http://www.youtube.com/positivelypositive1
 94 | http://www.youtube.com/user/instructablestv
 95 | http://www.youtube.com/user/mrjWells
 96 | http://www.youtube.com/user/positivelypresent
 97 | http://www.youtube.com/user/techcrunch
 98 | https://www.youtube.com/channel/UCVimQoXNCZuEnZRVAbuYMiw
 99 | https://www.youtube.com/channel/UCfHn_8-ehdem86fEvlFg-Gw
100 | https://www.youtube.com/ted
101 | https://www.youtube.com/user/DoneDealers
102 | https://www.youtube.com/user/LifehackOrg/?ref=footer-browse-youtube
103 | https://www.youtube.com/user/rte
104 | https://www.youtube.com/user/xfinity?feature=results_main
105 | https://soundcloud.com/uwebristol
106 | https://feeds.feedburner.com/TroyHunt
107 | https://vimeo.com/kadence
108 | https://eg.linkedin.com/in/sayed-gharib-51b05133?trk=pub-pbmap
109 | https://www.linkedin.com/company/dichter-&-neira-research-network?trk=fc_badge
110 | https://dk.linkedin.com/in/carolinehorten
111 | https://plus.google.com/u/0/111494755084642562984/posts
112 | http://www.slideshare.net/haystackinternational
113 | https://www.facebook.com/pages/Robas-Research/357181737690559
114 | http://www.facebook.com/pages/TNS-Global/55944527541
115 | http://plus.google.com/108198427863983309725/
116 | http://www.youtube.com/tnsglobal
117 | http://feeds.feedburner.com/TnsGlobalPressReleases
118 | https://www.facebook.com/AMR-Advanced-Market-Research-GmbH-152914324834256/timeline/
119 | https://uk.linkedin.com/in/mihajlopopesku
120 | http://www.vkontakte.ru/fom.media
121 | http://www.slideshare.net/fom-media/
122 | https://www.pinterest.com/globalvoxpopuli/
123 | http://fb.co/OReilly
124 | """
125 | # tumblr, whatsapp, blogspot, PENGYOU, RENREN, KAIXIN 001, TENCENT WEIBO
126 | # SINA WEIBO, Baidu, WECHAT
127 | 
128 | SOCIAL_NEGATIVE = """
129 | https://www.linkedin.com/salary/
130 | https://www.linkedin.com/learning/me
131 | https://about.twitter.com/company
132 | https://www.youtube.com/t/terms
133 | https://www.youtube.com/yt/policyandsafety/
134 | https://www.facebook.com/privacy/explanation
135 | https://www.facebook.com/directory/celebrities/
136 | https://www.facebook.com/mobile/?ref=pf
137 | https://www.facebook.com/directory/people/
138 | https://www.facebook.com/places/
139 | https://www.facebook.com/games/
140 | https://www.facebook.com/careers/?ref=pf
141 | https://about.pinterest.com/en
142 | https://www.pinterest.com/_/_/about/
143 | https://www.instagram.com/about/us/
144 | https://www.instagram.com/developer/
145 | https://www.instagram.com/legal/terms/
146 | https://business.instagram.com/
147 | https://www.snapchat.com/geofilters
148 | https://www.snapchat.com/jobs
149 | https://www.snapchat.com/terms
150 | https://www.snapchat.com/beta/
151 | https://business.snapchat.com/
152 | https://www.flickr.com/cameras
153 | https://www.flickr.com/about
154 | https://www.flickr.com/explore/
155 | https://www.flickr.com/jobs
156 | https://www.xing.com/news/pages/f-a-z-wirtschaft-finanzen-90
157 | """
158 | 
159 | 
160 | def split_lines(lines):
161 |     return [x for x in lines.split('\n') if x.strip()]
162 | 
163 | 
164 | def test_positives():
165 |     for sample in split_lines(LINK_SAMPLES):
166 |         assert matches_string(sample), (sample, )
167 | 
168 | 
169 | def test_href():
170 |     href = etree.HTML("""
171 |         <a href="http://feeds.feedburner.com/TnsGlobalPressReleases">
172 |         <fb:like href="http://www.facebook.com/elDiarioEs">
173 |         <a class="twitter-follow-button" href="https://twitter.com/NASA">
174 |         <a class="github-button"
175 |             href="https://github.com/igrigorik/githubarchive.org"
176 |             data-count-href="/igrigorik/githubarchive.org/stargazers">
177 |         <div class="fb-page" data-href="https://www.facebook.com/facebook"
178 |               data-tabs="timeline" data-small-header="false">
179 |     """)
180 |     assert len(list(find_links_tree(href))) == 5, href
181 | 
182 | 
183 | def test_script():
184 |     # FIXME: need examples for `script` or `data-href`, etc.
185 |     pass
186 | 
187 | 
188 | def test_twitter():
189 |     href = etree.HTML("""
190 |         <meta name="twitter:site" content="@fluquid_ds">
191 |         <meta name="twitter:creator" content="@fluquid_ds">
192 |     """)
193 |     assert len(list(find_links_tree(href))) == 2, href
194 | 
195 | 
196 | def test_broken_href():
197 |     href = etree.HTML("""
198 |         <a href>
199 |     """)
200 |     assert len(list(find_links_tree(href))) == 0, href
201 | 
202 | 
203 | def not_running_negatives():
204 |     for sample in split_lines(SOCIAL_NEGATIVE):
205 |         assert not matches_string(sample), (sample, )
206 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27, py35
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     -rrequirements-setup.txt
 7 |     -rrequirements-install.txt
 8 |     -rrequirements-tests.txt
 9 | commands =
10 |     {posargs:coverage run -m py.test}
11 | 


--------------------------------------------------------------------------------