├── twpy ├── config │ ├── __init__.py │ └── config.py ├── core │ ├── __init__.py │ ├── request.py │ └── grabber.py ├── models │ ├── __init__.py │ └── data_model.py ├── serializers │ ├── __init__.py │ ├── __to_json.py │ ├── __to_pandas.py │ └── __to_list.py ├── exceptions │ └── __init__.py ├── __init__.py └── utils │ └── __init__.py ├── MANIFEST.in ├── setup.cfg ├── requirements.txt ├── .gitignore ├── LICENSE.txt ├── CHANGELOG.md ├── setup.py └── README.md /twpy/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /twpy/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # documentation 2 | include README.md -------------------------------------------------------------------------------- /twpy/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_model import * -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | beautifulsoup4 3 | pandas 4 | -------------------------------------------------------------------------------- /twpy/serializers/__init__.py: -------------------------------------------------------------------------------- 1 | from .__to_json import to_json 2 | from .__to_list import to_list 3 | from .__to_pandas import to_pandas 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore pycharm files 2 | .idea/ 3 | .idea/vcs.xml 4 | 5 | # ignore venv 6 | venv/ 7 | 8 | # ignore mac cache 9 | .DS_Store/ 10 | 11 | # pipenv 12 | Pipfile 13 | Pipfile.lock 14 | -------------------------------------------------------------------------------- /twpy/config/config.py: -------------------------------------------------------------------------------- 1 | VERSION = "1.2.5" 2 | BASE_URL = "https://twitter.com/" 3 | MOBILE_URL = "https://mobile.twitter.com/" 4 | APIV1_URL = "https://help.twitter.com/api/v1/username_lookups?username=" 5 | TIMELINE_WITH_TOKEN_QUERY = "i/search/timeline?vertical=default&src=unkn&include_available_features=1&include_entities=1" \ 6 | "&max_position=%TOKEN%&reset_error_state=false&f=tweets&q=" 7 | 8 | -------------------------------------------------------------------------------- /twpy/serializers/__to_json.py: -------------------------------------------------------------------------------- 1 | # to json serializer 2 | 3 | import json 4 | 5 | 6 | def to_json(objects_list: list) -> list: 7 | """ 8 | Get objects and convert it to json 9 | :param objects_list: 10 | :return: 11 | """ 12 | try: 13 | if objects_list[0].__class__.__name__ == "FF": 14 | return [obj.__dict__ for obj in objects_list] 15 | 16 | elif objects_list[0].__class__.__name__ == "Timeline": 17 | return [obj.__dict__ for obj in objects_list] 18 | elif objects_list[0].__class__.__name__ == "Profile": 19 | return [obj.__dict__ for obj in objects_list] 20 | except IndexError: 21 | return [] 22 | -------------------------------------------------------------------------------- /twpy/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | class IndexError(Exception): 3 | def __init__(self, message): 4 | super().__init__(message) 5 | 6 | 7 | class ConnectionTimeout(Exception): 8 | def __init__(self, message): 9 | super().__init__(message) 10 | 11 | 12 | class InvalidValue(Exception): 13 | def __init__(self, message): 14 | super().__init__(message) 15 | 16 | 17 | class QueryError(Exception): 18 | def __init__(self, message): 19 | super().__init__(message) 20 | 21 | 22 | class ParameterRequired(Exception): 23 | def __init__(self, message): 24 | super().__init__(message) 25 | -------------------------------------------------------------------------------- /twpy/serializers/__to_pandas.py: -------------------------------------------------------------------------------- 1 | # to pandas serializer 2 | 3 | import pandas as pd 4 | from .__to_json import to_json 5 | 6 | 7 | def to_pandas(objects_list: list) -> pd.DataFrame: 8 | """ 9 | Get objects and convert it pandas DataFrame 10 | :param objects_list: 11 | :return: 12 | """ 13 | try: 14 | if objects_list[0].__class__.__name__ == "FF": 15 | return pd.DataFrame(to_json(objects_list)) 16 | elif objects_list[0].__class__.__name__ == "Timeline": 17 | return pd.DataFrame(to_json(objects_list)) 18 | elif objects_list[0].__class__.__name__ == "Profile": 19 | return pd.DataFrame(to_json(objects_list)) 20 | except IndexError: 21 | return pd.DataFrame() 22 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2017-2019 TWPY 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /twpy/serializers/__to_list.py: -------------------------------------------------------------------------------- 1 | # to list serializer 2 | def to_list(objects_list: list) -> list: 3 | """ 4 | Get objects and convert it to list 5 | :param objects_list: 6 | :return: 7 | """ 8 | try: 9 | if objects_list[0].__class__.__name__ == "FF": 10 | return [[obj.username, obj.avatar, obj.fullname] for obj in objects_list] 11 | 12 | elif objects_list[0].__class__.__name__ == "Timeline": 13 | return [[ 14 | obj.tweet_id, 15 | obj.tweet_link, 16 | obj.conversation_id, 17 | obj.is_reply, 18 | obj.has_parent, 19 | obj.screen_name, 20 | obj.user_id, 21 | obj.user_mentions, 22 | obj.content, 23 | obj.reply_count, 24 | obj.retweet_count, 25 | obj.likes_count, 26 | obj.created_at] for obj in objects_list] 27 | elif objects_list[0].__class__.__name__ == "Profile": 28 | return [[ 29 | obj.name, 30 | obj.verified, 31 | obj.protected, 32 | obj.username, 33 | obj.bio, 34 | obj.location, 35 | obj.url, 36 | obj.joined_date, 37 | obj.birthday, 38 | obj.user_id, 39 | obj.tweet_count, 40 | obj.following_count, 41 | obj.follower_count, 42 | obj.likes_count 43 | ] for obj in objects_list] 44 | except IndexError: 45 | return [] 46 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | ## [1.2.4] - 2020-02-25 11 | 12 | ### Added 13 | - get_user_id method added . 14 | 15 | ### Changed 16 | 17 | - Fixed get_user problem with new style of twitter . 18 | - joined_date, birthday, like_count removed from get_user data model due to twitter changes . 19 | 20 | 21 | ## [1.2.2] - 2020-01-04 22 | 23 | ### Changed 24 | 25 | - Fixed tweet count bug . 26 | - Moved from reStructured Text to MarkDown for README . 27 | - Fixed bug in get_followers/get_followings method, which doesn't get first page tweets . 28 | 29 | ## [1.2.1] - 2019-12-23 30 | 31 | ### Added 32 | 33 | - Search tweets with username and query string possible now . 34 | - Filter tweets with `since` and `until` parameters in the search method . 35 | - `__version__` property added to TwpyClient . 36 | 37 | 38 | ### Changed 39 | 40 | - Fixed infinite loop while getting timeline . 41 | - Improved get_timeline method . 42 | - Fixed setup.py packages and description . 43 | 44 | 45 | 46 | ### Removed 47 | 48 | - First request controller 49 | -------------------------------------------------------------------------------- /twpy/core/request.py: -------------------------------------------------------------------------------- 1 | # request handler 2 | import requests 3 | from ..utils import header_maker 4 | import json 5 | 6 | 7 | class RequestHandler: 8 | """ 9 | Handle all requests with specific user-agent 10 | """ 11 | def __init__(self, user_agent: str, ret: str = "text") -> None: 12 | self.user_agent = user_agent 13 | self.current_proxy = None 14 | self.ret = ret 15 | 16 | @property 17 | def proxy(self): 18 | return self.proxy 19 | 20 | @proxy.setter 21 | def proxy(self, new_proxy): 22 | self.current_proxy = new_proxy 23 | 24 | def get(self, url: str): 25 | """ 26 | make request 27 | :param url: 28 | :return: 29 | """ 30 | proxies = { 31 | "http": f"http://{self.current_proxy}", 32 | "https": f"https://{self.current_proxy}" 33 | } 34 | headers = { 35 | "User-Agent": header_maker(self.user_agent) 36 | } 37 | try: 38 | s = requests.Session() 39 | if self.current_proxy: 40 | res = s.get(url, headers=headers, proxies=proxies) 41 | else: 42 | res = s.get(url, headers=headers) 43 | if res.status_code == 200: 44 | # check return mode 45 | if self.ret == "text": 46 | return res.text 47 | else: 48 | return res.json() 49 | else: 50 | return None 51 | except requests.exceptions.ConnectionError: 52 | raise requests.exceptions.ConnectionError 53 | except json.decoder.JSONDecodeError: 54 | return None 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from setuptools import setup 3 | 4 | TWPY_VERSION = '1.2.5' 5 | TWPY_DOWNLOAD = ('https://github.com/0x0ptim0us/twpy/tarball/' + TWPY_VERSION) 6 | 7 | 8 | def read_file(filename): 9 | """ 10 | Read a utf8 encoded text file and return its contents. 11 | """ 12 | with codecs.open(filename, 'r', 'utf8') as f: 13 | return f.read() 14 | 15 | 16 | setup( 17 | name='twpy', 18 | packages=[ 19 | 'twpy', 20 | 'twpy.config', 21 | 'twpy.core', 22 | 'twpy.exceptions', 23 | 'twpy.models', 24 | 'twpy.serializers', 25 | 'twpy.utils'], 26 | 27 | version=TWPY_VERSION, 28 | description='Twitter High level scraper for humans. ', 29 | long_description=read_file('README.md'), 30 | long_description_content_type='text/markdown', 31 | license='MIT', 32 | author='Fardin Allahverdinazhand', 33 | author_email='0x0ptim0us@gmail.com', 34 | url='https://github.com/0x0ptim0us/twpy', 35 | download_url=TWPY_DOWNLOAD, 36 | keywords=['python3', 'twitter', 'twitter api', 'twpy', 'twitter scraper'], 37 | classifiers=[ 38 | 'Intended Audience :: Developers', 39 | 'License :: OSI Approved :: MIT License', 40 | 'Programming Language :: Python :: 3.5', 41 | 'Programming Language :: Python :: 3.6', 42 | 'Programming Language :: Python :: 3.7', 43 | 'Programming Language :: Python :: 3.8', 44 | 'Natural Language :: English', 45 | ], 46 | 47 | install_requires=[ 48 | 'requests', 49 | 'beautifulsoup4', 50 | 'pandas' 51 | ], 52 | setup_requires=[ 53 | 'requests', 54 | 'beautifulsoup4', 55 | 'pandas' 56 | ], 57 | 58 | ) 59 | -------------------------------------------------------------------------------- /twpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .config.config import VERSION 2 | from .core.grabber import follower_following, timeline, profile, search, get_user_id 3 | 4 | 5 | class TwpyClient(object): 6 | 7 | def __init__(self, proxy: str = "") -> None: 8 | """ 9 | Twpy client 10 | :param proxy: 11 | """ 12 | self.proxy = proxy 13 | 14 | @property 15 | def __version__(self): 16 | return VERSION 17 | 18 | def get_followers(self, username: str, interval: int = 0, limit: int = 0) -> list: 19 | """ 20 | get user followers 21 | :param username: 22 | :param interval: 23 | :param limit: 24 | :return: 25 | """ 26 | return follower_following(username=username, limit=limit, proxy=self.proxy, interval=interval) 27 | 28 | def get_friends(self, username: str, limit: int = 0, interval: int = 0) -> list: 29 | """ 30 | get user friends 31 | :param username: 32 | :param limit: 33 | :param interval: 34 | :return: 35 | """ 36 | return follower_following(username=username, limit=limit, proxy=self.proxy, type_="followings", interval=interval) 37 | 38 | def get_timeline(self, username: str, limit: int = 0, interval: int = 0) -> list: 39 | """ 40 | get user timeline 41 | :param username: 42 | :param limit: 43 | :param interval: 44 | :return: 45 | """ 46 | return timeline(username=username, limit=limit, proxy=self.proxy, interval=interval) 47 | 48 | def get_user(self, username: str): 49 | """ 50 | get user profile info 51 | :param username: 52 | :return: 53 | """ 54 | return profile(username=username, proxy=self.proxy) 55 | 56 | def search(self, username: str = "", since: str = "", until: str = "", query: str = "", limit: int = 0, verified: bool = False, interval: int = 0): 57 | """ 58 | search tweets by given parameters 59 | :param username: 60 | :param since: 61 | :param until: 62 | :param query: 63 | :param limit: 64 | :param verified: 65 | :param interval: 66 | :return: 67 | """ 68 | return search(username=username, since=since, until=until, query=query, limit=limit, verified=verified, proxy=self.proxy, interval=interval) 69 | 70 | def get_user_id(self, username: str) -> str: 71 | """ 72 | Get user_id of user 73 | :param username 74 | :return: str 75 | """ 76 | return get_user_id(username=username, proxy=self.proxy) 77 | -------------------------------------------------------------------------------- /twpy/models/data_model.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | 4 | class FF: 5 | def __init__(self, username: str, avatar: str, fullname: str) -> None: 6 | """Data model for followers/followings 7 | 8 | Arguments: 9 | username {str} -- [description] 10 | user_id {int} -- [description] 11 | avatar {str} -- [description] 12 | fullname {str} -- [description] 13 | """ 14 | self.username = username 15 | self.avatar = avatar 16 | self.fullname = fullname 17 | 18 | 19 | class Timeline: 20 | def __init__(self, tweet_id: int, 21 | tweet_link: str, 22 | conversation_id: str, 23 | is_reply: str, 24 | has_parent: str, 25 | screen_name: str, 26 | name: str, 27 | user_id: str, 28 | user_mentions: Union[str, list], 29 | content: str, 30 | reply_count: int, 31 | retweet_count: int, 32 | likes_count: int, 33 | created_at: str) -> None: 34 | """ 35 | 36 | :param tweet_id: 37 | :param tweet_link: 38 | :param conversation_id: 39 | :param is_reply: 40 | :param has_parent: 41 | :param screen_name: 42 | :param name: 43 | :param user_id: 44 | :param user_mentions: 45 | :param content: 46 | :param reply_count: 47 | :param retweet_count: 48 | :param likes_count: 49 | 50 | """ 51 | self.tweet_id = tweet_id 52 | self.tweet_link = tweet_link 53 | self.conversation_id = conversation_id 54 | self.is_reply = is_reply 55 | self.has_parent = has_parent 56 | self.screen_name = screen_name 57 | self.name = name 58 | self.user_id = user_id 59 | self.user_mentions = user_mentions 60 | self.content = content 61 | self.reply_count = reply_count 62 | self.retweet_count = retweet_count 63 | self.likes_count = likes_count 64 | self.created_at = created_at 65 | 66 | 67 | class Profile: 68 | def __init__(self, name: str, 69 | verified: str, 70 | protected: str, 71 | username: str, 72 | bio: str, 73 | location: str, 74 | url: str, 75 | tweet_count: int, 76 | following_count: int, 77 | follower_count: int) -> None: 78 | """ 79 | User profile data model 80 | """ 81 | self.name = name 82 | self.verified = verified 83 | self.protected = protected 84 | self.username = username 85 | self.bio = bio 86 | self.location = location 87 | self.url = url 88 | self.tweet_count = tweet_count 89 | self.following_count = following_count 90 | self.follower_count = follower_count 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Twpy 3 | ==== 4 | 5 | ![image](https://pepy.tech/badge/twpy) ![image](https://pepy.tech/badge/twpy/week) 6 | 7 | Twitter High level scraper for humans. 8 | 9 | Features 10 | -------- 11 | 12 | - NO LIMIT, NO API required 13 | - Fast and easy to use 14 | - Working with python 3.5+ 15 | - Integrated with pandas for data science research 16 | 17 | Installation 18 | ------------ 19 | 20 | Manual install via git : 21 | 22 | ```bash 23 | $ git clone https://github.com/0x0ptim0us/twpy.git 24 | $ cd twpy 25 | $ python setup.py install 26 | ``` 27 | 28 | Install using pip: 29 | 30 | ```bash 31 | $ pip3 install twpy 32 | # or 33 | $ python3 -m pip install twpy 34 | ``` 35 | 36 | Usage 37 | ----- 38 | 39 | Create Twpy object : 40 | 41 | ```python 42 | from twpy import TwpyClient 43 | 44 | # create twpy client object 45 | tc = TwpyClient() 46 | ``` 47 | 48 | with proxy : 49 | 50 | ```python 51 | # or you can pass proxy 52 | tc = TwpyClient(proxy="127.0.0.1:8080") 53 | ``` 54 | 55 | Get twpy current version : 56 | 57 | ```python 58 | tc.__version__ 59 | # '1.2.4' 60 | ``` 61 | 62 | 63 | Get user followers: 64 | 65 | ```python 66 | # get user followers, limited up to 50 67 | # interval : delay between each request, default is 0 for no delay 68 | # proxy : send traffic through proxy, default is none 69 | followers_data = tc.get_followers(username="elonmusk", limit=50, interval=1) 70 | ``` 71 | 72 | Get user timeline: 73 | 74 | ```python 75 | tweets = tc.get_timeline(username="elonmusk", limit=50) 76 | ``` 77 | 78 | Get user profile: 79 | 80 | ```python 81 | user_info = tc.get_user(username="elonmusk") 82 | ``` 83 | 84 | Convert result object to other data structures : 85 | 86 | ```python 87 | from twpy.serializers import to_pandas, to_json, to_list 88 | # convert result to pandas data frame, json and list 89 | # pandas 90 | pandas_sample = to_pandas(followers_data) 91 | # json 92 | json_sample = to_json(followers_data) 93 | # list 94 | list_sample = to_list(followers_data) 95 | ``` 96 | 97 | Search example: 98 | 99 | ```python 100 | # search user tweets until 2015 101 | tweets = tc.search(username="elonmusk", until="2015") 102 | 103 | # add limit and interval 104 | tweets = tc.search(username="elonmusk", until="2015", limit=100, interval=1) 105 | 106 | # search tweets contains `love` word 107 | tweets = tc.search(query="love", limit=100, interval=1) 108 | 109 | # search tweets which contains `love` word and were tweeted since 2015-01-01 110 | tweets = tc.search(query="love", since="2015-01-01", limit=10) 111 | ``` 112 | 113 | # Supported methods 114 | | method | description | 115 | |--|--| 116 | | get_friends() | get user followings/friends | 117 | | get_timeline() | get user timeline/tweets | 118 | | get_user() | get user profile info | 119 | | search() | search tweets with query and username | 120 | 121 | Meta 122 | ---- 123 | 124 | Fardin Allahverdinazhand - [\@0x0ptim0us](https://twitter.com/0x0ptim0us) - <0x0ptim0us@gmail.com> Distributed under the MIT license. see [LICENSE.txt](https://github.com/0x0ptim0us/twpy/blob/master/LICENSE.txt) 125 | for more information. 126 | 127 | 128 | -------------------------------------------------------------------------------- /twpy/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from ..models import FF, Timeline, Profile 4 | import json 5 | 6 | 7 | def header_maker(mode: str) -> str: 8 | """ 9 | make header and return as dict 10 | :param mode: 11 | :return: 12 | """ 13 | user_agents = { 14 | "FF": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36", 15 | "TIMELINE": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36", 16 | "MOBILE": "Opera/9.80 (Android 4.1.2; Linux; Opera Mobi/ADR-1305251841) Presto/2.11.355 Version/12.10" 17 | } 18 | 19 | return user_agents[mode] 20 | 21 | 22 | def extract_cursor(html) -> str: 23 | """ 24 | extract token for next page 25 | :param html: 26 | :return: 27 | """ 28 | cursor = re.findall('cursor=(\d+)', html) 29 | if len(cursor) > 0: 30 | return cursor[0] 31 | else: 32 | return "" 33 | 34 | 35 | def extract_timeline_cursor(response: json) -> tuple: 36 | """ 37 | Extract cursor from json 38 | :param response: 39 | :return: 40 | """ 41 | return response['min_position'], response['has_more_items'] 42 | 43 | 44 | def extract_ff(html: str) -> list: 45 | """ 46 | Extract followers/followings data from html 47 | :param html: 48 | :return: 49 | """ 50 | result: list = [] 51 | soup = BeautifulSoup(html, 'html.parser') 52 | user_tables = soup.find_all('table', attrs={"class": "user-item"}) 53 | if user_tables: 54 | for user in user_tables: 55 | avatar = user.find("img", attrs={"class": "profile-image"})['src'] 56 | username = user.find("span", attrs={"class": "username"}).text.strip("@") 57 | fullname = user.find("strong", attrs={"class": "fullname"}).text 58 | # append to result list 59 | result.append(FF(username=username, avatar=avatar, fullname=fullname)) 60 | return result 61 | 62 | 63 | def extract_timeline(html: str) -> list: 64 | """ 65 | Extract tweets from timeline data 66 | :param html: 67 | :return: 68 | """ 69 | result: list = [] 70 | soup = BeautifulSoup(html, 'html.parser') 71 | 72 | for li in soup.find_all('li', attrs={'class': 'js-stream-item stream-item stream-item'}): 73 | # find first div 74 | first_div = li.find('div') 75 | # user and tweet info 76 | tweet_id = first_div['data-tweet-id'] 77 | tweet_link = first_div['data-permalink-path'] 78 | conversation_id = first_div['data-conversation-id'] 79 | is_reply = first_div.get('data-is-reply-to', "false") 80 | has_parent = first_div.get('data-has-parent-tweet', "false") 81 | screen_name = first_div['data-screen-name'] 82 | name = first_div['data-name'] 83 | user_id = first_div['data-user-id'] 84 | user_mentions = first_div.get('data-mentions', 'false') 85 | if ' ' in user_mentions: 86 | user_mentions = [user for user in user_mentions.split(" ")] 87 | # get content info 88 | content = li.find('div', attrs={'class': 'js-tweet-text-container'}).text 89 | # tweet statistics 90 | reply_count = li.find('span', attrs={'class': 'ProfileTweet-action--reply u-hiddenVisually'}).text 91 | reply_count = reply_count.split(" ")[0].strip() 92 | retweet_count = li.find('span', attrs={'class': 'ProfileTweet-action--retweet u-hiddenVisually'}).text 93 | retweet_count = retweet_count.split(" ")[0].strip() 94 | likes_count = li.find('span', attrs={'class': 'ProfileTweet-action--favorite u-hiddenVisually'}).text 95 | likes_count = likes_count.split(" ")[0].strip() 96 | # time 97 | created_at = li.find('a', attrs={'class': 'tweet-timestamp js-permalink js-nav js-tooltip'})['title'] 98 | # 99 | # add data to result 100 | result.append(Timeline( 101 | tweet_id=tweet_id, 102 | tweet_link=tweet_link, 103 | conversation_id=conversation_id, 104 | is_reply=is_reply, 105 | has_parent=has_parent, 106 | screen_name=screen_name, 107 | name=name, 108 | user_id=user_id, 109 | user_mentions=user_mentions, 110 | content=content, 111 | reply_count=reply_count, 112 | retweet_count=retweet_count, 113 | likes_count=likes_count, 114 | created_at=created_at 115 | )) 116 | return result 117 | 118 | 119 | def extract_profile(html: str) -> object: 120 | """ 121 | extract profile data 122 | :param html: 123 | :return: 124 | """ 125 | result: list = [] 126 | 127 | soup = BeautifulSoup(html, 'html.parser') 128 | # get name 129 | name = soup.find('div', class_='fullname').text.strip() 130 | # verified account 131 | verified = soup.find('img', attrs={'alt': 'Verified Account'}) 132 | if verified: 133 | verified = "true" 134 | else: 135 | verified = "false" 136 | # protected account 137 | protected = soup.find('div', class_='protected') 138 | if protected: 139 | protected = "true" 140 | else: 141 | protected = "false" 142 | # screen name 143 | username = soup.find('span', class_='screen-name').text.strip() 144 | # bio 145 | bio = soup.find('div', class_='bio').text.strip() 146 | # location 147 | location = soup.find('div', class_='location').text.strip() 148 | # url 149 | url = soup.find('div', class_='url').text.strip() 150 | # find profile info 151 | info = soup.find_all('div', class_='statnum') 152 | if info: 153 | tweet_count = info[0].text 154 | following_count = info[1].text 155 | follower_count = info[2].text 156 | else: 157 | tweet_count = 0 158 | following_count = 0 159 | follower_count = 0 160 | 161 | result.append(Profile( 162 | name=name, 163 | verified=verified, 164 | protected=protected, 165 | username=username, 166 | bio=bio, 167 | location=location, 168 | url=url, 169 | tweet_count=tweet_count, 170 | following_count=following_count, 171 | follower_count=follower_count 172 | )) 173 | return result 174 | -------------------------------------------------------------------------------- /twpy/core/grabber.py: -------------------------------------------------------------------------------- 1 | 2 | # Grabber Engine 3 | import time 4 | from datetime import datetime 5 | from ..exceptions import QueryError, ParameterRequired 6 | from .request import RequestHandler 7 | from ..config.config import BASE_URL, MOBILE_URL, TIMELINE_WITH_TOKEN_QUERY, APIV1_URL 8 | from ..utils import extract_cursor, extract_ff, extract_timeline_cursor, extract_timeline, extract_profile 9 | from time import sleep 10 | 11 | 12 | def follower_following( 13 | username: str, limit: int = 0, 14 | type_: str = "followers", 15 | proxy: str = None, 16 | interval: int = 0) -> list: 17 | """ 18 | Followers/Followings scraper 19 | :param username: 20 | :param limit: 21 | :param type_: 22 | :param proxy: 23 | :param interval: 24 | :return: 25 | """ 26 | result: list = [] 27 | cursor: str = str() 28 | first_request: bool = True 29 | has_more: bool = True 30 | # mode = FF -> followers/followings user-agent 31 | req = RequestHandler(user_agent="FF") 32 | # if proxy enabled set it 33 | if proxy: 34 | req.proxy = proxy 35 | while has_more: 36 | if first_request: 37 | url = MOBILE_URL + f"/{username}/{type_}/?lang=en" 38 | res = req.get(url) 39 | first_request = False 40 | else: 41 | url = MOBILE_URL + f"/{username}/{type_}/?lang=en&cursor={cursor}" 42 | res = req.get(url) 43 | if res: 44 | # extract cursor 45 | cursor = extract_cursor(res) 46 | if cursor: 47 | has_more = True 48 | else: 49 | has_more = False 50 | 51 | # parse followers/followings 52 | extracted_ff = extract_ff(res) 53 | result.extend(extracted_ff) 54 | # if there was limit 55 | if limit > 0: 56 | if len(result) > limit: 57 | return result[:limit] 58 | else: 59 | sleep(interval) 60 | continue 61 | 62 | else: 63 | return result 64 | # interval 65 | sleep(interval) 66 | 67 | return result 68 | 69 | 70 | def timeline(username: str, limit: int = 0, proxy: str = None, interval: int = 0) -> list: 71 | """ 72 | timeline scraper 73 | :param username: 74 | :param limit: 75 | :param proxy: 76 | :param interval: 77 | :return: 78 | """ 79 | result: list = [] 80 | cursor = "-1" 81 | has_more = True 82 | req = RequestHandler(user_agent="TIMELINE", ret="json") 83 | if proxy: 84 | req.proxy = proxy 85 | 86 | while has_more: 87 | 88 | url = BASE_URL+TIMELINE_WITH_TOKEN_QUERY+f"+from:{username}" 89 | url = url.replace("%TOKEN%", cursor) 90 | res = req.get(url) 91 | if res: 92 | cursor, has_more = extract_timeline_cursor(response=res) 93 | extracted_tweets = extract_timeline(res['items_html']) 94 | result.extend(extracted_tweets) 95 | # check limitation 96 | if limit > 0: 97 | if len(result) > limit: 98 | return result[:limit] 99 | else: 100 | sleep(interval) 101 | continue 102 | else: 103 | return result 104 | 105 | sleep(interval) 106 | 107 | return result 108 | 109 | 110 | def profile(username: str, proxy: str): 111 | """ 112 | get user profile 113 | """ 114 | 115 | req = RequestHandler(user_agent="MOBILE") 116 | if proxy: 117 | req.proxy = proxy 118 | url = BASE_URL+username+"/?lang=en" 119 | res = req.get(url=url) 120 | if res: 121 | return extract_profile(res) 122 | else: 123 | return None 124 | 125 | 126 | def get_user_id(username: str, proxy: str): 127 | """ 128 | get user id 129 | """ 130 | 131 | req = RequestHandler(user_agent="TIMELINE", ret="json") 132 | if proxy: 133 | req.proxy = proxy 134 | url = APIV1_URL + username 135 | res = req.get(url=url) 136 | if res: 137 | return res.get('user_id', '') 138 | else: 139 | return '' 140 | 141 | 142 | def search(username: str = "", since: str = "", until: str = "", query: str = "", limit: int = 0, verified: bool = False, proxy: str = "", interval: int = 0): 143 | """Advanced search engine""" 144 | 145 | cursor: str = "-1" 146 | has_more: bool = True 147 | result: list = [] 148 | req = RequestHandler(user_agent="TIMELINE", ret="json") 149 | if proxy: 150 | req.proxy = proxy 151 | 152 | if since: 153 | since = int(time.mktime(datetime.strptime(since, "%Y-%m-%d").timetuple())) 154 | 155 | if until: 156 | if len(until) == 4: 157 | until = f"{until}-01-01" 158 | 159 | query_structure = { 160 | "from": f"+from:{username}", 161 | "since": f"+since:{since}", 162 | "verified": ":verified", 163 | "until": f"+until:{until}", 164 | "query": f"+{query}" 165 | } 166 | 167 | if username and query: 168 | """ not allowed """ 169 | raise QueryError("`username` and `query` parameter not allowed together.") 170 | 171 | if since and until: 172 | """ not allowed """ 173 | raise QueryError("`since` and `until` parameter not allowed together.") 174 | 175 | url = BASE_URL+TIMELINE_WITH_TOKEN_QUERY 176 | url = url.replace("%TOKEN%", cursor) 177 | 178 | # if there was username or query 179 | if username or query: 180 | if username: 181 | url += query_structure['from'] 182 | else: 183 | url += query_structure['query'] 184 | 185 | # if username and query aren't set properly raise error 186 | else: 187 | raise ParameterRequired("`username` or `query` required for search.") 188 | 189 | if since or until: 190 | if since: 191 | url += query_structure['since'] 192 | elif until: 193 | url += query_structure['until'] 194 | 195 | if verified: 196 | url += query_structure['verified'] 197 | 198 | while has_more: 199 | res = req.get(url=url) 200 | if res: 201 | cursor, has_more = extract_timeline_cursor(response=res) 202 | if cursor: 203 | extracted_tweets = extract_timeline(res['items_html']) 204 | result.extend(extracted_tweets) 205 | url = url.replace("%TOKEN%", cursor) 206 | # check limitation 207 | if limit > 0: 208 | if len(result) > limit: 209 | return result[:limit] 210 | else: 211 | sleep(interval) 212 | continue 213 | else: 214 | break 215 | sleep(interval) 216 | else: 217 | return result 218 | 219 | return result 220 | --------------------------------------------------------------------------------