├── .flake8
├── .gitattributes
├── .github
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── index.rst
│   ├── instascrape.core.rst
│   ├── instascrape.exceptions.rst
│   ├── instascrape.rst
│   ├── instascrape.scrapers.rst
│   ├── make.bat
│   └── modules.rst
├── instascrape
│   ├── __init__.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── _mappings.py
│   │   ├── _static_scraper.py
│   │   └── json_algos.py
│   ├── exceptions
│   │   ├── __init__.py
│   │   └── exceptions.py
│   └── scrapers
│       ├── __init__.py
│       ├── comment.py
│       ├── hashtag.py
│       ├── igtv.py
│       ├── location.py
│       ├── post.py
│       ├── profile.py
│       ├── reel.py
│       └── scrape_tools.py
├── media
│   ├── 6x6scatter_matrix.png
│   ├── instascrape.gif
│   ├── likes_heatmap.png
│   ├── logo.png
│   ├── logopic.png
│   ├── realpython.png
│   ├── scatter_matrix.png
│   └── techprofiles.gif
├── pypi.bash
├── pyproject.toml
├── pytest.ini
├── requirements.txt
├── setup.py
├── tests
│   ├── __init__.py
│   └── scrapers
│       ├── __init__.py
│       ├── test_hashtag.py
│       ├── test_igtv.py
│       ├── test_location.py
│       ├── test_post.py
│       ├── test_profile.py
│       └── test_reel.py
└── tutorial
    ├── examples
    │   ├── DonaldTrump
    │   │   ├── Donald Trump.ipynb
    │   │   ├── donald_trump.csv
    │   │   └── plots
    │   │       ├── comments_per_post.png
    │   │       ├── hashtags.png
    │   │       ├── likes_per_post.png
    │   │       ├── likes_vs_comments.png
    │   │       ├── locations.png
    │   │       ├── views_and_likes_per_view.png
    │   │       └── views_per_video.png
    │   ├── JoeBiden
    │   │   ├── joebiden.csv
    │   │   ├── joebiden.png
    │   │   ├── joebiden.py
    │   │   └── joebiden_urls.txt
    │   ├── README.md
    │   ├── download_recent_photos
    │   │   ├── 2020-09-08 09h06m.png
    │   │   ├── 2020-09-09 10h24m.png
    │   │   ├── 2020-09-14 10h05m.png
    │   │   ├── 2020-09-17 17h49m.png
    │   │   ├── 2020-09-24 11h01m.png
    │   │   ├── 2020-09-25 10h18m.png
    │   │   ├── 2020-09-26 11h38m.png
    │   │   ├── 2020-09-27 09h27m.png
    │   │   ├── 2020-09-28 12h17m.png
    │   │   ├── 2020-10-14 12h36m.png
    │   │   ├── 2020-10-15 13h11m.png
    │   │   ├── 2020-10-16 14h39m.png
    │   │   └── download_recent_photos.ipynb
    │   ├── max_liked_post.ipynb
    │   └── simple_hashtag_comparison
    │       └── simple_hashtag_comparison.ipynb
    └── tutorial
        ├── Part 0 - Orientation.ipynb
        └── Part 1 - Intro to the API.ipynb
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = media,docs
3 | ignore=E402,F401,F403,F405,F821
4 | max-line-length=120
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | tutorial/** linguist-language=Python
2 | tutorial/examples/** linguist-language=Python
3 |
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at chris@christophergreening.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to instascrape
2 | We love developers and want to hear your input! Contributing to this project should be as easy and transparent as possible, whether it's:
3 |
4 | - Reporting a bug
5 | - Discussing the current state of the code
6 | - Submitting a fix
7 | - Proposing new features
8 | - Becoming a maintainer
9 | - etc.
10 |
11 | ## We develop with GitHub
12 | We use GitHub to host code, track issues and feature requests, and accept pull requests. Changes accepted to `master` are also uploaded to the instascrape PyPI package.
13 |
14 | ## We use the [GitHub flow](https://guides.github.com/introduction/flow/), so all code changes happen through pull requests
15 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests:
16 |
17 | 1. Fork the repo and create your branch from `master`.
18 | 2. If you've added code that should be tested, add tests.
19 | 3. If you've changed APIs, update the documentation.
20 | 4. Ensure the test suite passes.
21 | 5. Make sure your code lints.
22 | 6. Issue that pull request!
23 |
24 | ## Report bugs using GitHub's [issues](https://github.com/chris-greening/instascrape/issues)
25 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/chris-greening/instascrape/issues/new/choose).
26 |
27 | ## Write bug reports with detail, background, and sample code
28 |
29 | **Great Bug Reports** tend to have:
30 |
31 | - A quick summary and/or background
32 | - Steps to reproduce
33 |   - Be specific!
34 |   - Give sample code if you can.
35 | - What you expected would happen
36 | - What actually happens
37 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
38 |
39 |
46 |
47 | ## Any contributions you make will be under the MIT Software License
48 | In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project. Feel free to contact the maintainers if that's a concern.
49 |
50 | ## References
51 | This document was adapted from [briandk's](https://gist.github.com/briandk/3d2e8b3ec8daf5a27a62) CONTRIBUTING.md template.
52 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
3 |
4 | Do not include any personal data.
5 |
6 | Fixes # (issue)
7 |
8 | ## Checklist
9 |
10 | * [ ] I followed the guidelines in our Contributing document
11 | * [ ] I added an explanation of my changes
12 | * [ ] I have written new tests for my changes, as applicable
13 | * [ ] I successfully ran tests with my changes locally
14 |
15 | ## Additional notes (optional)
16 |
17 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | name: "CodeQL"
7 |
8 | on:
9 | push:
10 | branches: [master]
11 | pull_request:
12 | # The branches below must be a subset of the branches above
13 | branches: [master]
14 | schedule:
15 | - cron: '0 18 * * 3'
16 |
17 | jobs:
18 | analyze:
19 | name: Analyze
20 | runs-on: ubuntu-latest
21 |
22 | strategy:
23 | fail-fast: false
24 | matrix:
25 | # Override automatic language detection by changing the below list
26 | # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
27 | language: ['python']
28 | # Learn more...
29 | # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
30 |
31 | steps:
32 | - name: Checkout repository
33 | uses: actions/checkout@v2
34 | with:
35 | # We must fetch at least the immediate parents so that if this is
36 | # a pull request then we can checkout the head.
37 | fetch-depth: 2
38 |
39 | # If this run was triggered by a pull request event, then checkout
40 | # the head of the pull request instead of the merge commit.
41 | - run: git checkout HEAD^2
42 | if: ${{ github.event_name == 'pull_request' }}
43 |
44 | # Initializes the CodeQL tools for scanning.
45 | - name: Initialize CodeQL
46 | uses: github/codeql-action/init@v1
47 | with:
48 | languages: ${{ matrix.language }}
49 | # If you wish to specify custom queries, you can do so here or in a config file.
50 | # By default, queries listed here will override any specified in a config file.
51 | # Prefix the list here with "+" to use these queries and those in the config file.
52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 |
54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
55 | # If this step fails, then you should remove it and run the build manually (see below)
56 | - name: Autobuild
57 | uses: github/codeql-action/autobuild@v1
58 |
59 | # ℹ️ Command-line programs to run using the OS shell.
60 | # 📚 https://git.io/JvXDl
61 |
62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 | # and modify them (or add more) to build your code if your project
64 | # uses a compiled language
65 |
66 | #- run: |
67 | # make bootstrap
68 | # make release
69 |
70 | - name: Perform CodeQL Analysis
71 | uses: github/codeql-action/analyze@v1
72 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | .idea/
132 |
133 | # Large file ignores
134 | tutorial/examples/Donald\ Trump/*.db*
135 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/timothycrosley/isort
3 | rev: 5.5.4
4 | hooks:
5 | - id: isort
6 | - repo: https://github.com/ambv/black
7 | rev: 20.8b1
8 | hooks:
9 | - id: black
10 | - repo: https://gitlab.com/pycqa/flake8
11 | rev: 3.8.4
12 | hooks:
13 | - id: flake8
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Christopher Greening
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # _instascrape_: powerful Instagram data scraping toolkit
7 |
8 | ## Note: This module is no longer actively maintained.
9 |
10 | ## DISCLAIMER:
11 |
12 | Instagram has become increasingly strict about scraping, and using this library can get you flagged for botting AND POSSIBLY HAVE YOUR INSTAGRAM ACCOUNT DISABLED. This is a research project and I am not responsible for how you use it. The library itself is designed to be responsible and respectful, but what you do with it is up to you; I claim no responsibility if your Instagram account is affected by how you use this library.
13 |
14 | [](https://www.python.org/downloads/release/python-360/)
15 | [](https://pepy.tech/project/insta-scrape)
16 | [](https://pypi.org/project/insta-scrape/)
17 | [](https://opensource.org/licenses/MIT)
18 |
19 | [](https://github.com/chris-greening/instascrape)
20 | [](https://github.com/chris-greening/instascrape/blob/master/requirements.txt)
21 | [](https://github.com/chris-greening/instascrape/issues)
22 |
23 | ## What is it?
24 | _instascrape_ is a lightweight Python package that provides an expressive and flexible API for scraping Instagram data. It is geared towards being a high-level building block in the data scientist's toolchain and can be seamlessly integrated and extended with industry-standard tools for web scraping, data science, and analysis.
25 |
26 |
27 |
28 | ## Key features
29 | Here are a few of the things that `instascrape` does well:
30 |
31 | * Powerful, object-oriented scraping tools for profiles, posts, hashtags, reels, and IGTV
32 | * Scrapes from a URL, raw HTML, a BeautifulSoup object, or a JSON dictionary
33 | * Download content to your computer as _png_, _jpg_, _mp4_, and _mp3_ (see the sketch below this list)
34 | * Dynamically retrieve HTML embed code for posts
35 | * Expressive and consistent API for concise and elegant code
36 | * Designed for seamless integration with [_Selenium_](https://selenium-python.readthedocs.io/), [_Pandas_](https://pandas.pydata.org/), and other industry standard tools for data collection and analysis
37 | * Lightweight; no boilerplate or configurations necessary
38 | * The only hard dependencies are [_Requests_](https://requests.readthedocs.io/en/master/) and [_Beautiful Soup_](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
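For the download and embed features above, here is a minimal sketch (the `download` and `embed` helper names and signatures are assumptions inferred from the feature list, not confirmed by this README):

```python
from instascrape import Post

google_post = Post("https://www.instagram.com/p/CG0UU3ylXnv/")
google_post.scrape()

# Save the post's media to disk (assumed helper; the extension picks the format)
google_post.download(fp="google_post.png")

# Dynamically retrieve the post's HTML embed code (assumed helper)
embed_html = google_post.embed()
```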
39 | ---
40 |
41 | ## Table of Contents
42 | * [Installation](#installation)
43 | * [Sample Usage](#sample-usage)
44 | * [Documentation](#documentation)
45 | * [Blog Posts](#blog-posts)
46 | * [Contributing](#contributing)
47 | * [Dependencies](#dependencies)
48 | * [License](#license)
49 | * [Support](#support)
50 |
51 | ---
52 |
53 | ## :computer: Installation
54 |
55 | ### Minimum Python version
56 |
57 | This library currently requires [Python 3.7](https://www.python.org/downloads/release/python-370/) or higher.
58 |
59 |
60 | ### pip
61 | Install from PyPI using
62 | ```shell
63 | $ pip3 install insta-scrape
64 | ```
65 | WARNING: make sure you install _insta-scrape_ and not a package with a similar name!
66 |
67 | ---
68 |
69 | ## :mag_right: Sample Usage
70 | All top-level, ready-to-use features can be imported using:
71 | ```python
72 | from instascrape import *
73 | ```
74 |
75 | _instascrape_ uses clean, consistent, and expressive syntax to make the developer experience as _painless_ as possible.
76 |
77 | ```python
78 | # Instantiate the scraper objects
79 | google = Profile('https://www.instagram.com/google/')
80 | google_post = Post('https://www.instagram.com/p/CG0UU3ylXnv/')
81 | google_hashtag = Hashtag('https://www.instagram.com/explore/tags/google/')
82 |
83 | # Scrape their respective data
84 | google.scrape()
85 | google_post.scrape()
86 | google_hashtag.scrape()
87 |
88 | print(google.followers)
89 | print(google_post['hashtags'])
90 | print(google_hashtag.amount_of_posts)
91 | >>> 12262794
92 | >>> ['growwithgoogle']
93 | >>> 9053408
94 | ```
95 |
96 | See the [Scraped data points](https://github.com/chris-greening/instascrape/wiki/Scraped-data-points) section of the [Wiki](https://github.com/chris-greening/instascrape/wiki) for a complete list of the scraped attributes provided by each scraper.
97 |
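Every scraper also hands its data back programmatically. Below is a minimal sketch built on the `to_dict` method defined in `instascrape/core/_static_scraper.py`; the [_Pandas_](https://pandas.pydata.org/) step is optional and assumes you have it installed:

```python
import pandas as pd

from instascrape import Profile

google = Profile("https://www.instagram.com/google/")
google.scrape()

# All scraped data points as a plain dictionary
data = google.to_dict()

# One-row DataFrame, ready for the usual pandas analysis workflow
df = pd.DataFrame([data])
```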
98 | ## :books: Documentation
99 | The official documentation can be found on [Read The Docs](https://instascrape.readthedocs.io/en/latest/index.html)
100 |
101 | ---
102 |
103 | ## :newspaper: Blog Posts
104 |
105 |
106 | Check out blog posts on the [official site](https://chris-greening.github.io/instascrape/blog/) or [DEV](https://dev.to/) for ideas and tutorials!
107 |
108 | - [Scrape data from Instagram with instascrape](https://dev.to/chrisgreening/scrape-data-from-instagram-with-instascrape-5e3e)
109 | - [Visualizing Instagram engagement with instascrape](https://dev.to/chrisgreening/visualizing-instagram-engagement-with-instascrape-326h)
110 | - [Exploratory data analysis of Instagram using instascrape and Python](https://dev.to/chrisgreening/exploratory-data-analysis-of-instagram-using-python-1o5c)
111 | - [Creating a scatter matrix of Instagram data using Python](https://dev.to/chrisgreening/visualizing-the-relationship-between-instagram-variables-using-python-55gg)
112 | - [Downloading an Instagram profile's recent photos using Python](https://dev.to/chrisgreening/downloading-an-instagram-profile-s-recent-photos-using-python-25b2)
113 | - [Scraping 25,000 data points from Joe Biden's Instagram using instascrape](https://dev.to/chrisgreening/scraping-25-000-data-points-from-joe-biden-s-instagram-using-instascrape-1026)
114 | - [Compare major tech Instagram page's with instascrape](https://dev.to/chrisgreening/compare-major-tech-instagram-page-s-with-instascrape-2419)
115 | - [Tracking an Instagram posts engagement in real time with instascrape](https://dev.to/chrisgreening/tracking-an-instagram-posts-engagement-in-real-time-with-instascrape-1m1j)
116 | - [Dynamically generate embeddable Instagram HTML with instascrape](https://dev.to/chrisgreening/dynamically-generate-embeddable-instagram-html-using-instascrape-3o4b)
117 | - [Scraping an Instagram location tag with instascrape](https://dev.to/chrisgreening/scraping-an-instagram-location-tag-with-instascrape-554f)
118 | - [Scraping Instagram reels with instascrape](https://dev.to/chrisgreening/scraping-instagram-reels-with-instascrape-3khb)
119 | - [Scraping IGTV data with instascrape](https://dev.to/chrisgreening/scraping-igtv-data-with-instascrape-595f)
120 | - [Scraping 10,000 data points from Donald Trump's Instagram with Python](https://dev.to/chrisgreening/scraping-10-000-data-points-from-donald-trump-s-instagram-page-with-python-2jcg)
121 | ---
122 |
123 | ## :pray: Contributing
124 | All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome!
125 |
126 | Feel free to [open an Issue](https://github.com/chris-greening/instascrape/issues/new/choose), check out existing [Issues](https://github.com/chris-greening/instascrape/issues), or [start a discussion](https://github.com/chris-greening/instascrape/discussions).
127 |
128 | Beginners to open source are highly encouraged to participate and ask questions if you're unsure what to do/where to start :heart:
129 |
130 | ---
131 |
132 | ## :spider_web: Dependencies
133 |
134 | - [Requests](https://requests.readthedocs.io/en/master/)
135 | - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
136 |
137 | ---
138 |
139 |
140 | ## :credit_card: License
141 | This library operates under the [MIT](LICENSE) license.
142 |
143 | ---
144 |
145 | ## :grey_question: Support
146 |
147 | Check out the [FAQ](https://github.com/chris-greening/instascrape/wiki/Frequently-Asked-Questions)
148 |
149 | Reach out to me if you want to connect or have any questions, and I will do my best to get back to you.
150 | * Email:
151 | * chris@christophergreening.com
152 | * Twitter:
153 | * [@ChrisGreening](https://twitter.com/ChrisGreening)
154 | * LinkedIn
155 | * [Chris Greening](https://www.linkedin.com/in/chris-greening-646411139/)
156 | * Personal contact form:
157 | * [www.christophergreening.com](https://www.christophergreening.com/contact)
158 | ---
159 |
160 |
161 |
162 |
163 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.abspath(".."))
17 | # sys.path.insert(0, r'D:\Programming\pythonstuff\instascrape')
18 |
19 |
20 | # -- Project information -----------------------------------------------------
21 |
22 | project = "instascrape"
23 | copyright = "2020, Chris Greening"
24 | author = "Chris Greening"
25 |
26 | # The full version, including alpha/beta/rc tags
27 | release = "0.0.7"
28 |
29 |
30 | # -- General configuration ---------------------------------------------------
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
36 |
37 | master_doc = "index"
38 |
39 | # Add any paths that contain templates here, relative to this directory.
40 | templates_path = ["_templates"]
41 |
42 | # List of patterns, relative to source directory, that match files and
43 | # directories to ignore when looking for source files.
44 | # This pattern also affects html_static_path and html_extra_path.
45 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
46 |
47 |
48 | # -- Options for HTML output -------------------------------------------------
49 |
50 | # The theme to use for HTML and HTML Help pages. See the documentation for
51 | # a list of builtin themes.
52 | #
53 | html_theme = "sphinx_rtd_theme"
54 |
55 | # Add any paths that contain custom static files (such as style sheets) here,
56 | # relative to this directory. They are copied after the builtin static files,
57 | # so a file named "default.css" will overwrite the builtin "default.css".
58 | html_static_path = ["_static"]
59 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. instascrape documentation master file, created by
2 | sphinx-quickstart on Sat Sep 26 16:24:31 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to instascrape's documentation!
7 | =======================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 | instascrape.scrapers
14 | instascrape.exceptions
15 |
16 | Indices and tables
17 | ==================
18 |
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 |
--------------------------------------------------------------------------------
/docs/instascrape.core.rst:
--------------------------------------------------------------------------------
1 | instascrape.core package
2 | ========================
3 |
4 | Module contents
5 | ---------------
6 |
7 | .. automodule:: instascrape.core
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
--------------------------------------------------------------------------------
/docs/instascrape.exceptions.rst:
--------------------------------------------------------------------------------
1 | instascrape.exceptions package
2 | ==============================
3 |
4 | Submodules
5 | ----------
6 |
7 | instascrape.exceptions.exceptions module
8 | ----------------------------------------
9 |
10 | .. automodule:: instascrape.exceptions.exceptions
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 |
16 | Module contents
17 | ---------------
18 |
19 | .. automodule:: instascrape.exceptions
20 | :members:
21 | :undoc-members:
22 | :show-inheritance:
23 |
--------------------------------------------------------------------------------
/docs/instascrape.rst:
--------------------------------------------------------------------------------
1 | instascrape package
2 | ===================
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | instascrape.core
10 | instascrape.scrapers
11 |
12 | Submodules
13 | ----------
14 |
15 | instascrape.instascrape module
16 | ------------------------------
17 |
18 | .. automodule:: instascrape.instascrape
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: instascrape
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/docs/instascrape.scrapers.rst:
--------------------------------------------------------------------------------
1 | instascrape.scrapers package
2 | ============================
3 |
4 | Submodules
5 | ----------
6 |
7 | instascrape.scrapers.hashtag module
8 | -----------------------------------
9 |
10 | .. automodule:: instascrape.scrapers.hashtag
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 | :inherited-members:
15 |
16 | instascrape.scrapers.post module
17 | --------------------------------
18 |
19 | .. automodule:: instascrape.scrapers.post
20 | :members:
21 | :undoc-members:
22 | :show-inheritance:
23 | :inherited-members:
24 |
25 | instascrape.scrapers.profile module
26 | -----------------------------------
27 |
28 | .. automodule:: instascrape.scrapers.profile
29 | :members:
30 | :undoc-members:
31 | :show-inheritance:
32 | :inherited-members:
33 |
34 | instascrape.scrapers.reel module
35 | --------------------------------
36 |
37 | .. automodule:: instascrape.scrapers.reel
38 | :members:
39 | :undoc-members:
40 | :show-inheritance:
41 | :inherited-members:
42 |
43 | instascrape.scrapers.location module
44 | ------------------------------------
45 |
46 | .. automodule:: instascrape.scrapers.location
47 | :members:
48 | :undoc-members:
49 | :show-inheritance:
50 | :inherited-members:
51 |
52 | instascrape.scrapers.igtv module
53 | --------------------------------
54 |
55 | .. automodule:: instascrape.scrapers.igtv
56 | :members:
57 | :undoc-members:
58 | :show-inheritance:
59 | :inherited-members:
60 |
61 | instascrape.scrapers.scrape_tools module
62 | ----------------------------------------
63 |
64 | .. automodule:: instascrape.scrapers.scrape_tools
65 | :members:
66 | :undoc-members:
67 | :show-inheritance:
68 | :inherited-members:
69 |
70 | Module contents
71 | ---------------
72 |
73 | .. automodule:: instascrape.scrapers
74 | :members:
75 | :undoc-members:
76 | :show-inheritance:
77 | :inherited-members:
78 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | instascrape
2 | ===========
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | instascrape
8 |
--------------------------------------------------------------------------------
/instascrape/__init__.py:
--------------------------------------------------------------------------------
1 | from instascrape.scrapers import *
--------------------------------------------------------------------------------
/instascrape/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chris-greening/instascrape/a720355474b2a0506bdbe32dc67f19d464de3556/instascrape/core/__init__.py
--------------------------------------------------------------------------------
/instascrape/core/_mappings.py:
--------------------------------------------------------------------------------
1 | """
2 | Mappings that tell the _JsonEngine the user facing attribute names and the
3 | steps needed to get there in a JSON dictionary
4 | """
5 |
6 | from __future__ import annotations
7 |
8 | from abc import ABC
9 | from collections import deque
10 | from copy import deepcopy
11 | from typing import Dict, List, Union
12 |
13 | # pylint: disable=used-before-assignment
14 |
15 |
16 | MappingObject = Union["_PostMapping", "_ProfileMapping", "_HashtagMapping", "_LoginMapping"]
17 |
18 |
19 | class _GeneralMapping(ABC):
20 | """
21 | Maps the user interfacing attribute names with their keys as given in a JSON
22 | dict that has been flattened using
23 | instascrape.core._json_flattener.JsonFlattener
24 |
25 | Attributes
26 | ----------
27 | mapping : Dict[str, deque]
28 | Each key: val pair represents one data point and the directive for
29 | traversing a JSON dict and accessing that value
30 |
31 | Methods
32 | -------
33 |     return_mapping(keys: List[str] = None, exclude: List[str] = None) -> Dict[str, deque]
34 |         Interface for returning only the mapping directives specified in
35 |         a list of keys, optionally excluding some of them
36 |
37 | """
38 |
39 | mapping = {
40 | # "csrf_token": deque(["csrf_token"]),
41 | # "viewer_id": deque(["viewerId"]),
42 | # "country_code": deque(["country_code"]),
43 | # "language_code": deque(["language_code"]),
44 | # "locale": deque(["locale"]),
45 | # "device_id": deque(["device_id"]),
46 | # "browser_push_pub_key": deque(["browser_push_pub_key"]),
47 | # "key_id": deque(["key_id"]),
48 | # "public_key": deque(["public_key"]),
49 | # "version": deque(["version"]),
50 | # "is_dev": deque(["is_dev"]),
51 | # "rollout_hash": deque(["rollout_hash"]),
52 | # "bundle_variant": deque(["bundle_variant"]),
53 | # "frontend_dev": deque(["frontend_env"]),
54 | }
55 |
56 | @classmethod
57 | def return_mapping(cls, keys: List[str] = None, exclude: List[str] = None) -> Dict[str, deque]:
58 | """
59 | Return key-directive pairs specified by key names. If no keys are
60 | specified, return all
61 |
62 | Parameters
63 | ----------
64 | keys : List[str]
65 |         Keys that specify what directives to return; if empty, return all (minus any listed in exclude)
66 |
67 | Returns
68 | -------
69 | directive_dict : Dict[str, deque]
70 | Dictionary of keys and their directives
71 | """
72 | if keys is None:
73 | keys = []
74 | if exclude is None:
75 | exclude = []
76 | if isinstance(keys, str):
77 | keys = [keys]
78 | if isinstance(exclude, str):
79 | exclude = [exclude]
80 |
81 | if not keys:
82 | keys = list(cls.mapping)
83 | if exclude:
84 | keys = [key for key in keys if key not in exclude]
85 | directive_dict = {key: deepcopy(cls.mapping[key]) for key in keys}
86 | return directive_dict
87 |
88 |
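# Illustrative sketch (comment only): with the _PostMapping subclass below,
# _PostMapping.return_mapping(keys=["likes", "shortcode"]) returns
# {"likes": deque(["edge_media_preview_like_count"]),
#  "shortcode": deque(["shortcode"])},
# i.e. only the directives for the requested attributes, each deepcopied so
# callers are free to consume the deques.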
89 | class _PostMapping(_GeneralMapping):
90 | """Mapping specific to Instagram post pages"""
91 |
92 | mapping = _GeneralMapping.return_mapping().copy()
93 | mapping.update(
94 | {
95 | "id": deque(["id"]),
96 | "shortcode": deque(["shortcode"]),
97 | "height": deque(["height"]),
98 | "width": deque(["width"]),
99 | "gating_info": deque(["gating_info"]),
100 | "fact_check_overall_rating": deque(["fact_check_overall_rating"]),
101 | "fact_check_information": deque(["fact_check_information"]),
102 | "sensitivity_friction_info": deque(["sensitivity_friction_info"]),
103 | "media_overlay_info": deque(["media_overlay_info"]),
104 | "media_preview": deque(["media_preview"]),
105 | "display_url": deque(["display_url"]),
106 | "accessibility_caption": deque(["accessibility_caption"]),
107 | "is_video": deque(["is_video"]),
108 | "tracking_token": deque(["tracking_token"]),
109 | "tagged_users": deque(["edge_media_to_tagged_user"]),
110 | "caption": deque(["text"]),
111 | "caption_is_edited": deque(["caption_is_edited"]),
112 | "has_ranked_comments": deque(["has_ranked_comments"]),
113 | "comments": deque(["count"]),
114 | "comments_disabled": deque(["comments_disabled"]),
115 | "commenting_disabled_for_viewer": deque(["commenting_disabled_for_viewer"]),
116 | "timestamp": deque(["taken_at_timestamp"]),
117 | "likes": deque(["edge_media_preview_like_count"]),
118 | "location": deque(["name"]),
119 | "viewer_has_liked": deque(["viewer_has_liked"]),
120 | "viewer_has_saved": deque(["viewer_has_saved"]),
121 | "viewer_has_saved_to_collection": deque(["viewer_has_saved_to_collection"]),
122 | "viewer_in_photo_of_you": deque(["viewer_in_photo_of_you"]),
123 | "viewer_can_reshare": deque(["viewer_can_reshare"]),
124 | "video_url": deque(["video_url"]),
125 | "has_audio": deque(["has_audio"]),
126 | "video_view_count": deque(["video_view_count"]),
127 | "username": deque(["shortcode_media_owner_username"]),
128 | "full_name": deque(['owner_full_name']),
129 | }
130 | )
131 |
132 | @classmethod
133 | def post_from_profile_mapping(cls):
134 | """
135 | Return the mapping needed for parsing a post's JSON data from the JSON
136 | served back after requesting a Profile page.
137 | """
138 | return {
139 | "id": deque(["id"]),
140 | "shortcode": deque(["shortcode"]),
141 | "dimensions": deque(["dimensions"]),
142 | "display_url": deque(["display_url"]),
143 | "tagged_users": deque(["edge_media_to_tagged_user", "edges"]),
144 | "fact_check_overall_rating": deque(["fact_check_overall_rating"]),
145 | "fact_check_information": deque(["fact_check_information"]),
146 | "is_video": deque(["is_video"]),
147 | "accessibility_caption": deque(["accessibility_caption"]),
148 | "caption": deque(["edge_media_to_caption", "edges", 0, "node", "text"]),
149 | "comments": deque(["count"]),
150 | "comments_disabled": deque(["comments_disabled"]),
151 | "timestamp": deque(["taken_at_timestamp"]),
152 | "likes": deque(["edge_media_preview_like_count"]),
153 | "location": deque(["location"]),
154 | }
155 |
156 | @classmethod
157 | def post_from_hashtag_mapping(cls):
158 | """
159 | Return the mapping needed for parsing a post's JSON data from the JSON
160 | served back after requesting a Hashtag page.
161 | """
162 | return {
163 | "comments_disabled": deque(["comments_disabled"]),
164 | "id": deque(["id"]),
165 | "caption": deque(["edge_media_to_caption", "edges", 0, "node", "text"]),
166 | "shortcode": deque(["shortcode"]),
167 | "comments": deque(["edge_media_to_comment", "count"]),
168 | "upload_date": deque(["taken_at_timestamp"]),
169 | "dimensions": deque(["dimensions"]),
170 | "display_url": deque(["display_url"]),
171 | "likes": deque(["edge_media_preview_like", "count"]),
172 | "owner": deque(["owner", "id"]),
173 | "is_video": deque(["is_video"]),
174 | "accessibility_caption": deque(["accessibility_caption"]),
175 | }
176 |
177 |
178 | class _ReelMapping(_PostMapping):
179 | mapping = _PostMapping.return_mapping().copy()
180 | mapping.update(
181 | {
182 | "video_play_count": deque(["video_play_count"]),
183 | }
184 | )
185 |
186 |
187 | class _IGTVMapping(_PostMapping):
188 | mapping = _PostMapping.return_mapping().copy()
189 |
190 |
191 | class _ProfileMapping(_GeneralMapping):
192 | """Mapping specific to Instagram profile pages"""
193 |
194 | mapping = _GeneralMapping.return_mapping().copy()
195 | mapping.update(
196 | {
197 | "logging_page_id": deque(["logging_page_id"]),
198 | "show_suggested_profiles": deque(["show_suggested_profiles"]),
199 | "show_follow_dialog": deque(["show_follow_dialog"]),
200 | "biography": deque(["biography"]),
201 | "blocked_by_viewer": deque(["blocked_by_viewer"]),
202 | "restricted_by_viewer": deque(["restricted_by_viewer"]),
203 | "country_block": deque(["country_block"]),
204 | "external_url": deque(["external_url"]),
205 | "external_url_linkshimmed": deque(["external_url_linkshimmed"]),
206 | "followers": deque(["count"]),
207 | "followed_by_viewer": deque(["followed_by_viewer"]),
208 | "following": deque(["edge_follow_count"]),
209 | "follows_viewer": deque(["follows_viewer"]),
210 | "full_name": deque(["user_full_name"]),
211 | "has_ar_effects": deque(["has_ar_effects"]),
212 | "has_clips": deque(["has_clips"]),
213 | "has_guides": deque(["has_guides"]),
214 | "has_channel": deque(["has_channel"]),
215 | "has_blocked_viewer": deque(["has_blocked_viewer"]),
216 | "highlight_reel_count": deque(["highlight_reel_count"]),
217 | "has_requested_viewer": deque(["has_requested_viewer"]),
218 | "id": deque(["id"]),
219 | "is_business_account": deque(["is_business_account"]),
220 | "is_joined_recently": deque(["is_joined_recently"]),
221 | "business_category_name": deque(["business_category_name"]),
222 | "overall_category_name": deque(["overall_category_name"]),
223 | "category_enum": deque(["category_enum"]),
224 | "is_private": deque(["is_private"]),
225 | "is_verified": deque(["is_verified"]),
226 | "mutual_followers": deque(["edge_mutual_followed_by_count"]),
227 | "profile_pic_url": deque(["profile_pic_url"]),
228 | "profile_pic_url_hd": deque(["profile_pic_url_hd"]),
229 | "requested_by_viewer": deque(["requested_by_viewer"]),
230 | "username": deque(["user_username"]),
231 | "connected_fb_page": deque(["connected_fb_page"]),
232 | "posts": deque(["edge_owner_to_timeline_media_count"]),
233 | }
234 | )
235 |
236 |
237 | class _HashtagMapping(_GeneralMapping):
238 | """Mapping specific to Instagram hashtag pages"""
239 |
240 | mapping = _GeneralMapping.return_mapping().copy()
241 | mapping.update(
242 | {
243 | "id": deque(["id"]),
244 | "name": deque(["name"]),
245 | "allow_following": deque(["allow_following"]),
246 | "is_following": deque(["is_following"]),
247 | "is_top_media_only": deque(["is_top_media_only"]),
248 | "profile_pic_url": deque(["profile_pic_url"]),
249 | "amount_of_posts": deque(["count"]),
250 | }
251 | )
252 |
253 |
254 | class _LocationMapping(_GeneralMapping):
255 |     """Mapping specific to Instagram location pages"""
256 |
257 | mapping = _GeneralMapping.return_mapping().copy()
258 | mapping.update(
259 | {
260 | "id": deque(["id"]),
261 | "name": deque(["name"]),
262 | "has_public_page": deque(["has_public_page"]),
263 | "latitude": deque(["lat"]),
264 | "longitude": deque(["lng"]),
265 | "slug": deque(["slug"]),
266 | "blurb": deque(["blurb"]),
267 | "website": deque(["website"]),
268 | "phone": deque(["phone"]),
269 | "primary_alias_on_fb": deque(["primary_alias_on_fb"]),
270 |             "street_address": deque(["street_address"]),
271 | "zip_code": deque(["zip_code"]),
272 | "city_name": deque(["city_name"]),
273 | "region_name": deque(["region_name"]),
274 | "country_code": deque(["country_code"]),
275 | "amount_of_posts": deque(["count"]),
276 | }
277 | )
278 |
279 |
280 | class _LoginMapping(_GeneralMapping):
281 | """Mapping specific to Instagram login page"""
282 |
283 | mapping = _GeneralMapping.return_mapping().copy()
284 |
285 |
286 | class _HttpErrorMapping(_GeneralMapping):
287 |     """Mapping specific to Instagram HTTP error pages"""
288 |
289 | mapping = _GeneralMapping.return_mapping().copy()
290 |
291 |
292 | class _MetaMapping:
293 | """
294 | Map the string in the Instagram JSON that indicates the type of page the
295 | JSON was scraped from
296 |
297 | Attributes
298 | ----------
299 | str_to_mapper_obj : Dict[str, Any]
300 | Dictionary that maps the string name of the JSON type to the specific
301 | mapping object
302 |
303 | Methods
304 | -------
305 | get_mapper(page_type: str)
306 | Return the mapping object that correlates to the string
307 | """
308 |
309 | str_to_mapper_obj = {
310 | "ProfilePage": _ProfileMapping,
311 | "TagPage": _HashtagMapping,
312 | "PostPage": _PostMapping,
313 | "LoginAndSignupPage": _LoginMapping,
314 | "LocationsPage": _LocationMapping
315 | }
316 |
317 | @classmethod
318 | def get_mapper(cls, page_type: str) -> MappingObject:
319 | """
320 | Return the appropriate mapper that corresponds to the page_type as
321 | given in the requested Instagram JSON data
322 | """
323 | return cls.str_to_mapper_obj[page_type]
324 |
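# For example, _MetaMapping.get_mapper("ProfilePage") returns _ProfileMapping,
# per the str_to_mapper_obj table above.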
--------------------------------------------------------------------------------
/instascrape/core/_static_scraper.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import datetime
4 | import json
5 | import csv
6 | from abc import ABC, abstractmethod
7 | from typing import Union, Dict, List, Any
8 | import sys
9 | import os
10 | from collections import namedtuple, deque
11 | import warnings
12 |
13 | import requests
14 | from bs4 import BeautifulSoup
15 |
16 | from instascrape.scrapers.scrape_tools import parse_data_from_json, determine_json_type, flatten_dict, json_from_soup
17 | from instascrape.exceptions.exceptions import InstagramLoginRedirectError, MissingSessionIDWarning, MissingCookiesWarning
18 |
19 | # pylint: disable=no-member
20 |
21 | JSONDict = Dict[str, Any]
22 |
23 | class _StaticHtmlScraper(ABC):
24 | """
25 | Base class for all of the scrapers, handles general functionality that all
26 | scraper objects will have
27 | """
28 |
29 | # Keys that represent metadata attr that the user doesn't necessarily need
30 | # to worry about
31 | _METADATA_KEYS = [
32 | "json_dict",
33 | "url",
34 | "_json_scraper",
35 | "scrape_timestamp",
36 | "map_dict",
37 | "json_data",
38 | "json_flattener",
39 | "flat_json_dict",
40 | "soup",
41 | "html",
42 | "source",
43 | ]
44 | _ASSOCIATED_JSON_TYPE = None
45 |
46 | session = requests.Session()
47 |
48 | def __init__(self, source: Union[str, BeautifulSoup, JSONDict]) -> None:
49 | """
50 | Parameters
51 | ----------
52 | source : Union[str, BeautifulSoup, JSONDict]
53 | The given source for scraping the data from. Available sources are
54 | a URL, HTML, JSON dictionary, BeautifulSoup, etc.
55 | """
56 | self.source = source
57 |
58 | # Instance variables that are given values elsewhere
59 | self.url = None
60 | self.html = None
61 | self.soup = None
62 | self.json_dict = None
63 | self.flat_json_dict = None
64 | self.scrape_timestamp = None
65 |
66 | def __getitem__(self, key: str) -> Any:
67 | return getattr(self, key)
68 |
69 | def __repr__(self) -> str:
70 | return f"<{type(self).__name__}>"
71 |
72 | def scrape(
73 | self,
74 | mapping=None,
75 | keys: List[str] = None,
76 | exclude: List[str] = None,
77 | headers={
78 | "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36 Edg/87.0.664.57"
79 | },
80 | inplace=True,
81 | session=None,
82 | webdriver=None
83 | ) -> None:
84 | """
85 | Scrape data from the source
86 |
87 | Parameters
88 | ----------
89 | mapping : Dict[str, deque]
90 | Dictionary of parsing queue's that tell the JSON engine how to
91 | process the JSON data
92 | keys : List[str]
93 | List of strings that correspond to desired attributes for scraping
94 | exclude : List[str]
95 | List of strings that correspond to which attributes to exclude from
96 | being scraped
97 | headers : Dict[str, str]
98 | Dictionary of request headers to be passed on the GET request
99 | inplace : bool
100 | Determines if data modified inplace or return a new object with the
101 | scraped data
102 | session : requests.Session
103 | Session for making the GET request
104 | webdriver : selenium.webdriver.chrome.webdriver.WebDriver
105 | Webdriver for scraping the page, overrides any default or passed
106 | session
107 |
108 | Returns
109 | -------
110 | return_instance
111 | Optionally returns a scraped instance instead of modifying inplace
112 | if inplace arg is True
113 | """
114 |
115 | if mapping is None:
116 | mapping = self._Mapping.return_mapping(keys=keys, exclude=exclude)
117 | if session is None:
118 | session = self.session
119 | if webdriver is not None:
120 | session = webdriver
121 | if keys is None:
122 | keys = []
123 | if exclude is None:
124 | exclude = []
125 |
126 | if webdriver is None:
127 | try:
128 | if "sessionid" not in headers["cookie"]:
129 | warnings.warn(
130 | "Session ID not in cookies! It's recommended you pass a valid sessionid otherwise Instagram will likely redirect you to their login page.",
131 | MissingSessionIDWarning
132 | )
133 | except KeyError:
134 | warnings.warn(
135 | "Request header does not contain cookies! It's recommended you pass at least a valid sessionid otherwise Instagram will likely redirect you to their login page.",
136 | MissingCookiesWarning
137 | )
138 |
139 | # If the passed source was already an object, construct data from
140 | # source else parse it
141 | if isinstance(self.source, type(self)):
142 | scraped_dict = self.source.to_dict()
143 | else:
144 | return_data = self._get_json_from_source(self.source, headers=headers, session=session)
145 | flat_json_dict = flatten_dict(return_data["json_dict"])
146 |
147 | #HACK: patch mapping to fix the profile pic scrape when a sessionid is present
148 | try:
149 | if "sessionid" in headers["cookie"]:
150 | mapping["profile_pic_url"] = deque(["user_profile_pic_url"])
151 | mapping["profile_pic_url_hd"] = deque(["user_profile_pic_url_hd"])
152 | except KeyError:
153 | pass
154 |
155 | scraped_dict = parse_data_from_json(
156 | json_dict=flat_json_dict,
157 | map_dict=mapping,
158 | )
159 | return_data["scrape_timestamp"] = datetime.datetime.now()
160 | return_data["flat_json_dict"] = flat_json_dict
161 | return_instance = self._load_into_namespace(
162 | scraped_dict=scraped_dict,
163 | return_data=return_data,
164 | inplace=inplace
165 | )
166 | return None if return_instance is self else return_instance
167 |
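# Illustrative call (a sketch; the sessionid value is a placeholder you would
# copy from a logged-in browser session):
#
#   profile = Profile("https://www.instagram.com/google/")
#   profile.scrape(headers={
#       "user-agent": "Mozilla/5.0 ...",
#       "cookie": "sessionid=<your sessionid>",
#   })
#
# Passing the cookie avoids the MissingSessionIDWarning emitted above and
# makes an InstagramLoginRedirectError less likely.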
168 | def to_dict(self, metadata: bool = False) -> Dict[str, Any]:
169 | """
170 | Return a dictionary containing all of the data that has been scraped
171 |
172 | Parameters
173 | ----------
174 | metadata : bool
175 | Boolean value that determines if metadata specified in self._METADATA_KEYS
176 | will be included in the dictionary.
177 |
178 | Returns
179 | -------
180 | data_dict : Dict[str, Any]
181 | Dictionary containing the scraped data
182 | """
183 | data_dict = (
184 | {key: val for key, val in self.__dict__.items() if key not in self._METADATA_KEYS}
185 | if not metadata
186 | else self.__dict__
187 | )
188 | return data_dict
189 |
190 | def to_csv(self, fp: str) -> None:
191 | """
192 | Write scraped data to .csv at the given filepath
193 |
194 | Parameters
195 | ----------
196 | fp : str
197 | Filepath to write data to
198 | """
199 | with open(fp, "w", newline="", encoding="utf-8") as csv_file:
200 | writer = csv.writer(csv_file)
201 | for key, value in self.to_dict().items():
202 | writer.writerow([key, str(value)])
203 |
204 | def to_json(self, fp: str) -> None:
205 | """
206 | Write scraped data to .json file at the given filepath
207 |
208 | Parameters
209 | ----------
210 | fp : str
211 | Filepath to write data to
212 | """
213 | outdict = {key: str(val) for key, val in self.to_dict().items()}
214 | with open(fp, "w") as outjson:
215 | json.dump(outdict, outjson)
216 |
217 | @abstractmethod
218 | def _url_from_suburl(self, suburl: str) -> str:
219 | pass
220 |
221 | def _get_json_from_source(self, source: Any, headers: dict, session: requests.Session) -> JSONDict:
222 | """Parses the JSON data out from the source based on what type the source is"""
223 | initial_type = True
224 | return_data = {"source": self.source}
225 | if isinstance(source, str):
226 | source_type = self._determine_string_type(source)
227 | elif isinstance(source, dict):
228 | json_dict = source
229 | source_type = "json dict"
230 | elif isinstance(source, BeautifulSoup):
231 | source_type = "soup"
232 |
233 | if source_type == "suburl":
234 | if initial_type:
235 | suburl = self.source
236 | url = self._url_from_suburl(suburl=suburl)
237 | source_type = "url"
238 | initial_type = False
239 | return_data["url"] = url
240 |
241 | if source_type == "url":
242 | if initial_type:
243 | url = self.source
244 | html = self._html_from_url(url=url, headers=headers, session=session)
245 | source_type = "html"
246 | initial_type = False
247 | return_data["html"] = html
248 |
249 | if source_type == "html":
250 | if initial_type:
251 | html = self.source
252 | soup = self._soup_from_html(html)
253 | source_type = "soup"
254 | initial_type = False
255 | return_data["soup"] = soup
256 |
257 | if source_type == "soup":
258 | if initial_type:
259 | soup = self.source
260 | json_dict_arr = json_from_soup(soup)
261 | if len(json_dict_arr) == 1:
262 | json_dict = json_dict_arr[0]
263 | else:
264 | json_dict = json_dict_arr[1]
265 | self._validate_scrape(json_dict)
266 |
267 | return_data["json_dict"] = json_dict
268 |
269 | return return_data
270 |
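# The branches above form a cascade that progressively normalizes the source:
# a suburl becomes a URL, the URL becomes HTML, the HTML becomes soup, and
# the soup yields the JSON dict, so every source type exits the same way.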
271 | def _load_into_namespace(self, scraped_dict: dict, return_data, inplace) -> None:
272 | """Loop through the scraped dictionary and set them as instance attr"""
273 | instance = self if inplace else type(self)(return_data["source"])
274 | for key, val in scraped_dict.items():
275 | setattr(instance, key, val)
276 | for key, val in return_data.items():
277 | setattr(instance, key, val)
278 | return instance
279 |
280 |
281 | @staticmethod
282 | def _html_from_url(url: str, headers: dict, session: requests.Session) -> str:
283 | """Return HTML from requested URL"""
284 | if isinstance(session, requests.Session):
285 | response = session.get(url, headers=headers)
286 | page_source = response.text
287 | else:
288 | session.get(url)
289 | page_source = session.page_source
290 | return page_source
291 |
292 | @staticmethod
293 | def _soup_from_html(html: str) -> BeautifulSoup:
294 | """Return BeautifulSoup from source HTML"""
295 | return BeautifulSoup(html, features="html.parser")
296 |
297 |     def _validate_scrape(self, json_dict: JSONDict) -> None:
298 | """Raise exceptions if the scrape did not properly execute"""
299 | json_type = determine_json_type(json_dict)
300 | if json_type == "LoginAndSignupPage" and not type(self).__name__ == "LoginAndSignupPage":
301 | raise InstagramLoginRedirectError
302 | elif json_type == "HttpErrorPage" and not type(self).__name__ == "HttpErrorPage":
303 | source_str = self.url if hasattr(self, "url") else "Source"
304 | raise ValueError(f"{source_str} is not a valid Instagram page. Please provide a valid argument.")
305 |
306 | @staticmethod
307 | def _determine_string_type(string_data: str) -> str:
308 | """Match and return string representation of appropriate source"""
309 | string_type_map = [("https://", "url"), ("window._sharedData", "html"), ('{"config"', "JSON dict str")]
310 | for substr, str_type in string_type_map:
311 | if substr in string_data:
312 | #BUG: !DOCTYPE isnt returned in selenium source code, use