├── twpy
    ├── config
    │   ├── __init__.py
    │   └── config.py
    ├── core
    │   ├── __init__.py
    │   ├── request.py
    │   └── grabber.py
    ├── models
    │   ├── __init__.py
    │   └── data_model.py
    ├── serializers
    │   ├── __init__.py
    │   ├── __to_json.py
    │   ├── __to_pandas.py
    │   └── __to_list.py
    ├── exceptions
    │   └── __init__.py
    ├── __init__.py
    └── utils
    │   └── __init__.py
├── MANIFEST.in
├── setup.cfg
├── requirements.txt
├── .gitignore
├── LICENSE.txt
├── CHANGELOG.md
├── setup.py
└── README.md


/twpy/config/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/twpy/core/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # documentation
2 | include README.md


--------------------------------------------------------------------------------
/twpy/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_model import *


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | pandas
4 | 


--------------------------------------------------------------------------------
/twpy/serializers/__init__.py:
--------------------------------------------------------------------------------
1 | from .__to_json import to_json
2 | from .__to_list import to_list
3 | from .__to_pandas import to_pandas
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # ignore pycharm files
 2 | .idea/
 3 | .idea/vcs.xml
 4 | 
 5 | # ignore venv
 6 | venv/
 7 | 
 8 | # ignore mac cache
 9 | .DS_Store/
10 | 
11 | # pipenv
12 | Pipfile
13 | Pipfile.lock
14 | 


--------------------------------------------------------------------------------
/twpy/config/config.py:
--------------------------------------------------------------------------------
1 | VERSION = "1.2.5"
2 | BASE_URL = "https://twitter.com/"
3 | MOBILE_URL = "https://mobile.twitter.com/"
4 | APIV1_URL = "https://help.twitter.com/api/v1/username_lookups?username="
5 | TIMELINE_WITH_TOKEN_QUERY = "i/search/timeline?vertical=default&src=unkn&include_available_features=1&include_entities=1" \
6 |                  "&max_position=%TOKEN%&reset_error_state=false&f=tweets&q="
7 | 
8 | 


--------------------------------------------------------------------------------
/twpy/serializers/__to_json.py:
--------------------------------------------------------------------------------
 1 | # to json serializer
 2 | 
 3 | import json
 4 | 
 5 | 
 6 | def to_json(objects_list: list) -> list:
 7 | 	"""
 8 | 	Get objects and convert it to json
 9 | 	:param objects_list:
10 | 	:return:
11 | 	"""
12 | 	try:
13 | 		if objects_list[0].__class__.__name__ == "FF":
14 | 			return [obj.__dict__ for obj in objects_list]
15 | 
16 | 		elif objects_list[0].__class__.__name__ == "Timeline":
17 | 			return [obj.__dict__ for obj in objects_list]
18 | 		elif objects_list[0].__class__.__name__ == "Profile":
19 | 			return [obj.__dict__ for obj in objects_list]
20 | 	except IndexError:
21 | 		return []
22 | 


--------------------------------------------------------------------------------
/twpy/exceptions/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class IndexError(Exception):
 3 |     def __init__(self, message):
 4 |         super().__init__(message)
 5 | 
 6 | 
 7 | class ConnectionTimeout(Exception):
 8 |     def __init__(self, message):
 9 |         super().__init__(message)
10 | 
11 | 
12 | class InvalidValue(Exception):
13 |     def __init__(self, message):
14 |         super().__init__(message)
15 | 
16 | 
17 | class QueryError(Exception):
18 |     def __init__(self, message):
19 |         super().__init__(message)
20 | 
21 | 
22 | class ParameterRequired(Exception):
23 |     def __init__(self, message):
24 |         super().__init__(message)
25 | 


--------------------------------------------------------------------------------
/twpy/serializers/__to_pandas.py:
--------------------------------------------------------------------------------
 1 | # to pandas serializer
 2 | 
 3 | import pandas as pd
 4 | from .__to_json import to_json
 5 | 
 6 | 
 7 | def to_pandas(objects_list: list) -> pd.DataFrame:
 8 | 	"""
 9 | 	Get objects and convert it pandas DataFrame
10 | 	:param objects_list:
11 | 	:return:
12 | 	"""
13 | 	try:
14 | 		if objects_list[0].__class__.__name__ == "FF":
15 | 			return pd.DataFrame(to_json(objects_list))
16 | 		elif objects_list[0].__class__.__name__ == "Timeline":
17 | 			return pd.DataFrame(to_json(objects_list))
18 | 		elif objects_list[0].__class__.__name__ == "Profile":
19 | 			return pd.DataFrame(to_json(objects_list))
20 | 	except IndexError:
21 | 		return pd.DataFrame()
22 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2017-2019 TWPY
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/twpy/serializers/__to_list.py:
--------------------------------------------------------------------------------
 1 | # to list serializer
 2 | def to_list(objects_list: list) -> list:
 3 | 	"""
 4 | 	Get objects and convert it to list
 5 | 	:param objects_list:
 6 | 	:return:
 7 | 	"""
 8 | 	try:
 9 | 		if objects_list[0].__class__.__name__ == "FF":
10 | 			return [[obj.username, obj.avatar, obj.fullname] for obj in objects_list]
11 | 
12 | 		elif objects_list[0].__class__.__name__ == "Timeline":
13 | 			return [[
14 | 				obj.tweet_id,
15 | 				obj.tweet_link,
16 | 				obj.conversation_id,
17 | 				obj.is_reply,
18 | 				obj.has_parent,
19 | 				obj.screen_name,
20 | 				obj.user_id,
21 | 				obj.user_mentions,
22 | 				obj.content,
23 | 				obj.reply_count,
24 | 				obj.retweet_count,
25 | 				obj.likes_count,
26 | 				obj.created_at] for obj in objects_list]
27 | 		elif objects_list[0].__class__.__name__ == "Profile":
28 | 			return [[
29 | 				obj.name,
30 | 				obj.verified,
31 | 				obj.protected,
32 | 				obj.username,
33 | 				obj.bio,
34 | 				obj.location,
35 | 				obj.url,
36 | 				obj.joined_date,
37 | 				obj.birthday,
38 | 				obj.user_id,
39 | 				obj.tweet_count,
40 | 				obj.following_count,
41 | 				obj.follower_count,
42 | 				obj.likes_count
43 | 			] for obj in objects_list]
44 | 	except IndexError:
45 | 		return []
46 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 7 | 
 8 | ## [Unreleased]
 9 | 
10 | ## [1.2.4] - 2020-02-25
11 | 
12 | ### Added
13 | - get_user_id method added .
14 | 
15 | ### Changed
16 | 
17 | - Fixed get_user problem with new style of twitter .
18 | - joined_date, birthday, like_count removed from get_user data model due to twitter changes .
19 | 
20 | 
21 | ## [1.2.2] - 2020-01-04
22 | 
23 | ### Changed
24 | 
25 | - Fixed tweet count bug .
26 | - Moved from reStructured Text to MarkDown for README .
27 | - Fixed bug in get_followers/get_followings method, which doesn't get first page tweets .
28 | 
29 | ## [1.2.1] - 2019-12-23
30 | 
31 | ### Added
32 | 
33 | - Search tweets with username and query string possible now .
34 | - Filter tweets with `since` and `until` parameters in the search method .
35 | - `__version__` property added to TwpyClient .
36 | 
37 | 
38 | ### Changed
39 | 
40 | - Fixed infinite loop while getting timeline .
41 | - Improved get_timeline method .
42 | - Fixed setup.py packages and description .
43 | 
44 | 
45 | 
46 | ### Removed
47 | 
48 | - First request controller
49 | 


--------------------------------------------------------------------------------
/twpy/core/request.py:
--------------------------------------------------------------------------------
 1 | # request handler
 2 | import requests
 3 | from ..utils import header_maker
 4 | import json
 5 | 
 6 | 
 7 | class RequestHandler:
 8 | 	"""
 9 | 	Handle all requests with specific user-agent
10 | 	"""
11 | 	def __init__(self, user_agent: str, ret: str = "text") -> None:
12 | 		self.user_agent = user_agent
13 | 		self.current_proxy = None
14 | 		self.ret = ret
15 | 
16 | 	@property
17 | 	def proxy(self):
18 | 		return self.proxy
19 | 
20 | 	@proxy.setter
21 | 	def proxy(self, new_proxy):
22 | 		self.current_proxy = new_proxy
23 | 
24 | 	def get(self, url: str):
25 | 		"""
26 | 		make request
27 | 		:param url:
28 | 		:return:
29 | 		"""
30 | 		proxies = {
31 | 			"http": f"http://{self.current_proxy}",
32 | 			"https": f"https://{self.current_proxy}"
33 | 		}
34 | 		headers = {
35 | 			"User-Agent": header_maker(self.user_agent)
36 | 		}
37 | 		try:
38 | 			s = requests.Session()
39 | 			if self.current_proxy:
40 | 				res = s.get(url, headers=headers, proxies=proxies)
41 | 			else:
42 | 				res = s.get(url, headers=headers)
43 | 			if res.status_code == 200:
44 | 				# check return mode
45 | 				if self.ret == "text":
46 | 					return res.text
47 | 				else:
48 | 					return res.json()
49 | 			else:
50 | 				return None
51 | 		except requests.exceptions.ConnectionError:
52 | 			raise requests.exceptions.ConnectionError
53 | 		except json.decoder.JSONDecodeError:
54 | 			return None
55 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import codecs
 2 | from setuptools import setup
 3 | 
 4 | TWPY_VERSION = '1.2.5'
 5 | TWPY_DOWNLOAD = ('https://github.com/0x0ptim0us/twpy/tarball/' + TWPY_VERSION)
 6 | 
 7 | 
 8 | def read_file(filename):
 9 | 	"""
10 | 	Read a utf8 encoded text file and return its contents.
11 | 	"""
12 | 	with codecs.open(filename, 'r', 'utf8') as f:
13 | 		return f.read()
14 | 
15 | 
16 | setup(
17 | 	name='twpy',
18 | 	packages=[
19 | 		'twpy',
20 | 		'twpy.config',
21 | 		'twpy.core',
22 | 		'twpy.exceptions',
23 | 		'twpy.models',
24 | 		'twpy.serializers',
25 | 		'twpy.utils'],
26 | 
27 | 	version=TWPY_VERSION,
28 | 	description='Twitter High level scraper for humans. ',
29 | 	long_description=read_file('README.md'),
30 | 	long_description_content_type='text/markdown',
31 | 	license='MIT',
32 | 	author='Fardin Allahverdinazhand',
33 | 	author_email='0x0ptim0us@gmail.com',
34 | 	url='https://github.com/0x0ptim0us/twpy',
35 | 	download_url=TWPY_DOWNLOAD,
36 | 	keywords=['python3', 'twitter', 'twitter api', 'twpy', 'twitter scraper'],
37 | 	classifiers=[
38 | 		'Intended Audience :: Developers',
39 | 		'License :: OSI Approved :: MIT License',
40 | 		'Programming Language :: Python :: 3.5',
41 | 		'Programming Language :: Python :: 3.6',
42 | 		'Programming Language :: Python :: 3.7',
43 | 		'Programming Language :: Python :: 3.8',
44 | 		'Natural Language :: English',
45 | 	],
46 | 
47 | 	install_requires=[
48 | 		'requests',
49 | 		'beautifulsoup4',
50 | 		'pandas'
51 | 	],
52 | 	setup_requires=[
53 | 		'requests',
54 | 		'beautifulsoup4',
55 | 		'pandas'
56 | 	],
57 | 
58 | )
59 | 


--------------------------------------------------------------------------------
/twpy/__init__.py:
--------------------------------------------------------------------------------
 1 | from .config.config import VERSION
 2 | from .core.grabber import follower_following, timeline, profile, search, get_user_id
 3 | 
 4 | 
 5 | class TwpyClient(object):
 6 | 
 7 | 	def __init__(self, proxy: str = "") -> None:
 8 | 		"""
 9 | 		Twpy client
10 | 		:param proxy:
11 | 		"""
12 | 		self.proxy = proxy
13 | 
14 | 	@property
15 | 	def __version__(self):
16 | 		return VERSION
17 | 
18 | 	def get_followers(self, username: str, interval: int = 0, limit: int = 0) -> list:
19 | 		"""
20 | 		get user followers
21 | 		:param username:
22 | 		:param interval:
23 | 		:param limit:
24 | 		:return:
25 | 		"""
26 | 		return follower_following(username=username, limit=limit, proxy=self.proxy, interval=interval)
27 | 
28 | 	def get_friends(self, username: str, limit: int = 0, interval: int = 0) -> list:
29 | 		"""
30 | 		get user friends
31 | 		:param username:
32 | 		:param limit:
33 | 		:param interval:
34 | 		:return:
35 | 		"""
36 | 		return follower_following(username=username, limit=limit, proxy=self.proxy, type_="followings", interval=interval)
37 | 
38 | 	def get_timeline(self, username: str, limit: int = 0, interval: int = 0) -> list:
39 | 		"""
40 | 		get user timeline
41 | 		:param username:
42 | 		:param limit:
43 | 		:param interval:
44 | 		:return:
45 | 		"""
46 | 		return timeline(username=username, limit=limit, proxy=self.proxy, interval=interval)
47 | 
48 | 	def get_user(self, username: str):
49 | 		"""
50 | 		get user profile info
51 | 		:param username:
52 | 		:return:
53 | 		"""
54 | 		return profile(username=username, proxy=self.proxy)
55 | 
56 | 	def search(self, username: str = "", since: str = "", until: str = "", query: str = "", limit: int = 0, verified: bool = False, interval: int = 0):
57 | 		"""
58 | 		search tweets by given parameters
59 | 		:param username:
60 | 		:param since:
61 | 		:param until:
62 | 		:param query:
63 | 		:param limit:
64 | 		:param verified:
65 | 		:param interval:
66 | 		:return:
67 | 		"""
68 | 		return search(username=username, since=since, until=until, query=query, limit=limit, verified=verified, proxy=self.proxy, interval=interval)
69 | 
70 | 	def get_user_id(self, username: str) -> str:
71 | 		"""
72 | 		Get user_id of user
73 | 		:param username
74 | 		:return: str
75 | 		"""
76 | 		return get_user_id(username=username, proxy=self.proxy)
77 | 


--------------------------------------------------------------------------------
/twpy/models/data_model.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | 
 4 | class FF:
 5 |     def __init__(self, username: str, avatar: str, fullname: str) -> None:
 6 |         """Data model for followers/followings
 7 |         
 8 |         Arguments:
 9 |             username {str} -- [description]
10 |             user_id {int} -- [description]
11 |             avatar {str} -- [description]
12 |             fullname {str} -- [description]
13 |         """
14 |         self.username = username
15 |         self.avatar = avatar
16 |         self.fullname = fullname
17 | 
18 | 
19 | class Timeline:
20 |     def __init__(self, tweet_id: int,
21 |                  tweet_link: str,
22 |                  conversation_id: str,
23 |                  is_reply: str,
24 |                  has_parent: str,
25 |                  screen_name: str,
26 |                  name: str,
27 |                  user_id: str,
28 |                  user_mentions: Union[str, list],
29 |                  content: str,
30 |                  reply_count: int,
31 |                  retweet_count: int,
32 |                  likes_count: int,
33 |                  created_at: str) -> None:
34 |         """
35 | 
36 |         :param tweet_id:
37 |         :param tweet_link:
38 |         :param conversation_id:
39 |         :param is_reply:
40 |         :param has_parent:
41 |         :param screen_name:
42 |         :param name:
43 |         :param user_id:
44 |         :param user_mentions:
45 |         :param content:
46 |         :param reply_count:
47 |         :param retweet_count:
48 |         :param likes_count:
49 | 
50 |         """
51 |         self.tweet_id = tweet_id
52 |         self.tweet_link = tweet_link
53 |         self.conversation_id = conversation_id
54 |         self.is_reply = is_reply
55 |         self.has_parent = has_parent
56 |         self.screen_name = screen_name
57 |         self.name = name
58 |         self.user_id = user_id
59 |         self.user_mentions = user_mentions
60 |         self.content = content
61 |         self.reply_count = reply_count
62 |         self.retweet_count = retweet_count
63 |         self.likes_count = likes_count
64 |         self.created_at = created_at
65 | 
66 | 
67 | class Profile:
68 |     def __init__(self, name: str,
69 |                  verified: str,
70 |                  protected: str,
71 |                  username: str,
72 |                  bio: str,
73 |                  location: str,
74 |                  url: str,
75 |                  tweet_count: int,
76 |                  following_count: int,
77 |                  follower_count: int) -> None:
78 |         """
79 |         User profile data model
80 |         """
81 |         self.name = name
82 |         self.verified = verified
83 |         self.protected = protected
84 |         self.username = username
85 |         self.bio = bio
86 |         self.location = location
87 |         self.url = url
88 |         self.tweet_count = tweet_count
89 |         self.following_count = following_count
90 |         self.follower_count = follower_count
91 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | Twpy
  3 | ====
  4 | 
  5 | ![image](https://pepy.tech/badge/twpy) ![image](https://pepy.tech/badge/twpy/week)
  6 | 
  7 | Twitter High level scraper for humans.
  8 | 
  9 | Features
 10 | --------
 11 | 
 12 | - NO LIMIT, NO API required
 13 | - Fast and easy to use
 14 | - Working with python 3.5+
 15 | - Integrated with pandas for data science research
 16 | 
 17 | Installation
 18 | ------------
 19 | 
 20 | Manual install via git :
 21 | 
 22 | ```bash
 23 | $ git clone https://github.com/0x0ptim0us/twpy.git
 24 | $ cd twpy
 25 | $ python setup.py install
 26 | ```
 27 | 
 28 | Install using pip:
 29 | 
 30 | ```bash
 31 | $ pip3 install twpy
 32 | # or
 33 | $ python3 -m pip install twpy
 34 | ```
 35 | 
 36 | Usage
 37 | -----
 38 | 
 39 | Create Twpy object :
 40 | 
 41 | ```python
 42 | from twpy import TwpyClient 
 43 | 
 44 | # create twpy client object
 45 | tc = TwpyClient()
 46 | ```
 47 | 
 48 | with proxy :
 49 | 
 50 | ```python
 51 | # or you can pass proxy
 52 | tc = TwpyClient(proxy="127.0.0.1:8080")
 53 | ```
 54 | 
 55 | Get twpy current version :
 56 | 
 57 | ```python
 58 | tc.__version__
 59 | # '1.2.4'
 60 | ```
 61 | 
 62 | 
 63 | Get user followers:
 64 | 
 65 | ```python
 66 | # get user followers, limited up to 50
 67 | # interval : delay between each request, default is 0 for no delay
 68 | # proxy : send traffic through proxy, default is none
 69 | followers_data = tc.get_followers(username="elonmusk", limit=50, interval=1)
 70 | ```
 71 | 
 72 | Get user timeline:
 73 | 
 74 | ```python
 75 | tweets = tc.get_timeline(username="elonmusk", limit=50)
 76 | ```
 77 | 
 78 | Get user profile:
 79 | 
 80 | ```python
 81 | user_info = tc.get_user(username="elonmusk")
 82 | ```
 83 | 
 84 | Convert result object to other data structures :
 85 | 
 86 | ```python
 87 | from twpy.serializers import to_pandas, to_json, to_list
 88 | # convert result to pandas data frame, json and list
 89 | # pandas
 90 | pandas_sample = to_pandas(followers_data)
 91 | # json
 92 | json_sample = to_json(followers_data)
 93 | # list
 94 | list_sample = to_list(followers_data)
 95 | ```
 96 | 
 97 | Search example:
 98 | 
 99 | ```python
100 | # search user tweets until 2015
101 | tweets = tc.search(username="elonmusk", until="2015")
102 | 
103 | # add limit and interval
104 | tweets = tc.search(username="elonmusk", until="2015", limit=100, interval=1)
105 | 
106 | # search tweets contains `love` word
107 | tweets = tc.search(query="love", limit=100, interval=1)
108 | 
109 | # search tweets which contains `love` word and were tweeted since 2015-01-01
110 | tweets = tc.search(query="love", since="2015-01-01", limit=10)
111 | ```
112 | 
113 |   # Supported methods
114 | | method | description |
115 | |--|--|
116 | | get_friends() | get user followings/friends |
117 | | get_timeline()  | get user timeline/tweets |
118 | | get_user() | get user profile info |
119 | | search() | search tweets with query and username |
120 | 
121 | Meta
122 | ----
123 | 
124 | Fardin Allahverdinazhand -  [\@0x0ptim0us](https://twitter.com/0x0ptim0us) - <0x0ptim0us@gmail.com>  Distributed under the MIT license. see  [LICENSE.txt](https://github.com/0x0ptim0us/twpy/blob/master/LICENSE.txt)
125 | for more information.
126 | 
127 | <https://github.com/0x0ptim0us/twpy>
128 | 


--------------------------------------------------------------------------------
/twpy/utils/__init__.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from bs4 import BeautifulSoup
  3 | from ..models import FF, Timeline, Profile
  4 | import json
  5 | 
  6 | 
  7 | def header_maker(mode: str) -> str:
  8 | 	"""
  9 | 	make header and return as dict
 10 | 	:param mode:
 11 | 	:return:
 12 | 	"""
 13 | 	user_agents = {
 14 | 		"FF": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
 15 | 		"TIMELINE": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
 16 | 		"MOBILE": "Opera/9.80 (Android 4.1.2; Linux; Opera Mobi/ADR-1305251841) Presto/2.11.355 Version/12.10"
 17 | 	}
 18 | 
 19 | 	return user_agents[mode]
 20 | 
 21 | 
 22 | def extract_cursor(html) -> str:
 23 | 	"""
 24 | 	extract token for next page
 25 | 	:param html:
 26 | 	:return:
 27 | 	"""
 28 | 	cursor = re.findall('cursor=(\d+)', html)
 29 | 	if len(cursor) > 0:
 30 | 		return cursor[0]
 31 | 	else:
 32 | 		return ""
 33 | 
 34 | 
 35 | def extract_timeline_cursor(response: json) -> tuple:
 36 | 	"""
 37 | 	Extract cursor from json
 38 | 	:param response:
 39 | 	:return:
 40 | 	"""
 41 | 	return response['min_position'], response['has_more_items']
 42 | 
 43 | 
 44 | def extract_ff(html: str) -> list:
 45 | 	"""
 46 | 	Extract followers/followings data from html
 47 | 	:param html:
 48 | 	:return:
 49 | 	"""
 50 | 	result: list = []
 51 | 	soup = BeautifulSoup(html, 'html.parser')
 52 | 	user_tables = soup.find_all('table', attrs={"class": "user-item"})
 53 | 	if user_tables:
 54 | 		for user in user_tables:
 55 | 			avatar = user.find("img", attrs={"class": "profile-image"})['src']
 56 | 			username = user.find("span", attrs={"class": "username"}).text.strip("@")
 57 | 			fullname = user.find("strong", attrs={"class": "fullname"}).text
 58 | 			# append to result list
 59 | 			result.append(FF(username=username, avatar=avatar, fullname=fullname))
 60 | 	return result
 61 | 
 62 | 
 63 | def extract_timeline(html: str) -> list:
 64 | 	"""
 65 | 	Extract tweets from timeline data
 66 | 	:param html:
 67 | 	:return:
 68 | 	"""
 69 | 	result: list = []
 70 | 	soup = BeautifulSoup(html, 'html.parser')
 71 | 
 72 | 	for li in soup.find_all('li', attrs={'class': 'js-stream-item stream-item stream-item'}):
 73 | 		# find first div
 74 | 		first_div = li.find('div')
 75 | 		# user and tweet info
 76 | 		tweet_id = first_div['data-tweet-id']
 77 | 		tweet_link = first_div['data-permalink-path']
 78 | 		conversation_id = first_div['data-conversation-id']
 79 | 		is_reply = first_div.get('data-is-reply-to', "false")
 80 | 		has_parent = first_div.get('data-has-parent-tweet', "false")
 81 | 		screen_name = first_div['data-screen-name']
 82 | 		name = first_div['data-name']
 83 | 		user_id = first_div['data-user-id']
 84 | 		user_mentions = first_div.get('data-mentions', 'false')
 85 | 		if ' ' in user_mentions:
 86 | 			user_mentions = [user for user in user_mentions.split(" ")]
 87 | 		# get content info
 88 | 		content = li.find('div', attrs={'class': 'js-tweet-text-container'}).text
 89 | 		# tweet statistics
 90 | 		reply_count = li.find('span', attrs={'class': 'ProfileTweet-action--reply u-hiddenVisually'}).text
 91 | 		reply_count = reply_count.split(" ")[0].strip()
 92 | 		retweet_count = li.find('span', attrs={'class': 'ProfileTweet-action--retweet u-hiddenVisually'}).text
 93 | 		retweet_count = retweet_count.split(" ")[0].strip()
 94 | 		likes_count = li.find('span', attrs={'class': 'ProfileTweet-action--favorite u-hiddenVisually'}).text
 95 | 		likes_count = likes_count.split(" ")[0].strip()
 96 | 		# time
 97 | 		created_at = li.find('a', attrs={'class': 'tweet-timestamp js-permalink js-nav js-tooltip'})['title']
 98 | 		#
 99 | 		# add data to result
100 | 		result.append(Timeline(
101 | 			tweet_id=tweet_id,
102 | 			tweet_link=tweet_link,
103 | 			conversation_id=conversation_id,
104 | 			is_reply=is_reply,
105 | 			has_parent=has_parent,
106 | 			screen_name=screen_name,
107 | 			name=name,
108 | 			user_id=user_id,
109 | 			user_mentions=user_mentions,
110 | 			content=content,
111 | 			reply_count=reply_count,
112 | 			retweet_count=retweet_count,
113 | 			likes_count=likes_count,
114 | 			created_at=created_at
115 | 		))
116 | 	return result
117 | 
118 | 
119 | def extract_profile(html: str) -> object:
120 | 	"""
121 | 	extract profile data
122 | 	:param html:
123 | 	:return:
124 | 	"""
125 | 	result: list = []
126 | 
127 | 	soup = BeautifulSoup(html, 'html.parser')
128 | 	# get name
129 | 	name = soup.find('div', class_='fullname').text.strip()
130 | 	# verified account
131 | 	verified = soup.find('img', attrs={'alt': 'Verified Account'})
132 | 	if verified:
133 | 		verified = "true"
134 | 	else:
135 | 		verified = "false"
136 | 	# protected account
137 | 	protected = soup.find('div', class_='protected')
138 | 	if protected:
139 | 		protected = "true"
140 | 	else:
141 | 		protected = "false"
142 | 	# screen name
143 | 	username = soup.find('span', class_='screen-name').text.strip()
144 | 	# bio
145 | 	bio = soup.find('div', class_='bio').text.strip()
146 | 	# location
147 | 	location = soup.find('div', class_='location').text.strip()
148 | 	# url
149 | 	url = soup.find('div', class_='url').text.strip()
150 | 	# find profile info
151 | 	info = soup.find_all('div', class_='statnum')
152 | 	if info:
153 | 		tweet_count = info[0].text
154 | 		following_count = info[1].text
155 | 		follower_count = info[2].text
156 | 	else:
157 | 		tweet_count = 0
158 | 		following_count = 0
159 | 		follower_count = 0
160 | 
161 | 	result.append(Profile(
162 | 		name=name,
163 | 		verified=verified,
164 | 		protected=protected,
165 | 		username=username,
166 | 		bio=bio,
167 | 		location=location,
168 | 		url=url,
169 | 		tweet_count=tweet_count,
170 | 		following_count=following_count,
171 | 		follower_count=follower_count
172 | 	))
173 | 	return result
174 | 


--------------------------------------------------------------------------------
/twpy/core/grabber.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Grabber Engine
  3 | import time
  4 | from datetime import datetime
  5 | from ..exceptions import QueryError, ParameterRequired
  6 | from .request import RequestHandler
  7 | from ..config.config import BASE_URL, MOBILE_URL, TIMELINE_WITH_TOKEN_QUERY, APIV1_URL
  8 | from ..utils import extract_cursor, extract_ff, extract_timeline_cursor, extract_timeline, extract_profile
  9 | from time import sleep
 10 | 
 11 | 
 12 | def follower_following(
 13 | 		username: str, limit: int = 0,
 14 | 		type_: str = "followers",
 15 | 		proxy: str = None,
 16 | 		interval: int = 0) -> list:
 17 | 	"""
 18 | 	Followers/Followings scraper
 19 | 	:param username:
 20 | 	:param limit:
 21 | 	:param type_:
 22 | 	:param proxy:
 23 | 	:param interval:
 24 | 	:return:
 25 | 	"""
 26 | 	result: list = []
 27 | 	cursor: str = str()
 28 | 	first_request: bool = True
 29 | 	has_more: bool = True
 30 | 	# mode = FF -> followers/followings user-agent
 31 | 	req = RequestHandler(user_agent="FF")
 32 | 	# if proxy enabled set it
 33 | 	if proxy:
 34 | 		req.proxy = proxy
 35 | 	while has_more:
 36 | 		if first_request:
 37 | 			url = MOBILE_URL + f"/{username}/{type_}/?lang=en"
 38 | 			res = req.get(url)
 39 | 			first_request = False
 40 | 		else:
 41 | 			url = MOBILE_URL + f"/{username}/{type_}/?lang=en&cursor={cursor}"
 42 | 			res = req.get(url)
 43 | 		if res:
 44 | 			# extract cursor
 45 | 			cursor = extract_cursor(res)
 46 | 			if cursor:
 47 | 				has_more = True
 48 | 			else:
 49 | 				has_more = False
 50 | 
 51 | 			# parse followers/followings
 52 | 			extracted_ff = extract_ff(res)
 53 | 			result.extend(extracted_ff)
 54 | 			# if there was limit
 55 | 			if limit > 0:
 56 | 				if len(result) > limit:
 57 | 					return result[:limit]
 58 | 			else:
 59 | 				sleep(interval)
 60 | 				continue
 61 | 
 62 | 		else:
 63 | 			return result
 64 | 		# interval
 65 | 		sleep(interval)
 66 | 
 67 | 	return result
 68 | 
 69 | 
 70 | def timeline(username: str, limit: int = 0, proxy: str = None, interval: int = 0) -> list:
 71 | 	"""
 72 | 	timeline scraper
 73 | 	:param username:
 74 | 	:param limit:
 75 | 	:param proxy:
 76 | 	:param interval:
 77 | 	:return:
 78 | 	"""
 79 | 	result: list = []
 80 | 	cursor = "-1"
 81 | 	has_more = True
 82 | 	req = RequestHandler(user_agent="TIMELINE", ret="json")
 83 | 	if proxy:
 84 | 		req.proxy = proxy
 85 | 
 86 | 	while has_more:
 87 | 
 88 | 		url = BASE_URL+TIMELINE_WITH_TOKEN_QUERY+f"+from:{username}"
 89 | 		url = url.replace("%TOKEN%", cursor)
 90 | 		res = req.get(url)
 91 | 		if res:
 92 | 			cursor, has_more = extract_timeline_cursor(response=res)
 93 | 			extracted_tweets = extract_timeline(res['items_html'])
 94 | 			result.extend(extracted_tweets)
 95 | 			# check limitation
 96 | 			if limit > 0:
 97 | 				if len(result) > limit:
 98 | 					return result[:limit]
 99 | 			else:
100 | 				sleep(interval)
101 | 				continue
102 | 		else:
103 | 			return result
104 | 
105 | 		sleep(interval)
106 | 
107 | 	return result
108 | 
109 | 
110 | def profile(username: str, proxy: str):
111 | 	"""
112 | 	get user profile
113 | 	"""
114 | 	
115 | 	req = RequestHandler(user_agent="MOBILE")
116 | 	if proxy:
117 | 		req.proxy = proxy
118 | 	url = BASE_URL+username+"/?lang=en"
119 | 	res = req.get(url=url)
120 | 	if res:
121 | 		return extract_profile(res)
122 | 	else:
123 | 		return None
124 | 
125 | 
126 | def get_user_id(username: str, proxy: str):
127 | 	"""
128 | 	get user id
129 | 	"""
130 | 
131 | 	req = RequestHandler(user_agent="TIMELINE", ret="json")
132 | 	if proxy:
133 | 		req.proxy = proxy
134 | 	url = APIV1_URL + username
135 | 	res = req.get(url=url)
136 | 	if res:
137 | 		return res.get('user_id', '')
138 | 	else:
139 | 		return ''
140 | 
141 | 
142 | def search(username: str = "", since: str = "", until: str = "", query: str = "", limit: int = 0, verified: bool = False, proxy: str = "", interval: int = 0):
143 | 	"""Advanced search engine"""
144 | 
145 | 	cursor: str = "-1"
146 | 	has_more: bool = True
147 | 	result: list = []
148 | 	req = RequestHandler(user_agent="TIMELINE", ret="json")
149 | 	if proxy:
150 | 		req.proxy = proxy
151 | 
152 | 	if since:
153 | 		since = int(time.mktime(datetime.strptime(since, "%Y-%m-%d").timetuple()))
154 | 
155 | 	if until:
156 | 		if len(until) == 4:
157 | 			until = f"{until}-01-01"
158 | 
159 | 	query_structure = {
160 | 		"from": f"+from:{username}",
161 | 		"since": f"+since:{since}",
162 | 		"verified": ":verified",
163 | 		"until": f"+until:{until}",
164 | 		"query": f"+{query}"
165 | 	}
166 | 
167 | 	if username and query:
168 | 		""" not allowed """
169 | 		raise QueryError("`username` and `query` parameter not allowed together.")
170 | 
171 | 	if since and until:
172 | 		""" not allowed """
173 | 		raise QueryError("`since` and `until` parameter not allowed together.")
174 | 
175 | 	url = BASE_URL+TIMELINE_WITH_TOKEN_QUERY
176 | 	url = url.replace("%TOKEN%", cursor)
177 | 
178 | 	# if there was username or query
179 | 	if username or query:
180 | 		if username:
181 | 			url += query_structure['from']
182 | 		else:
183 | 			url += query_structure['query']
184 | 
185 | 	# if username and query aren't set properly raise error
186 | 	else:
187 | 		raise ParameterRequired("`username` or `query` required for search.")
188 | 
189 | 	if since or until:
190 | 		if since:
191 | 			url += query_structure['since']
192 | 		elif until:
193 | 			url += query_structure['until']
194 | 
195 | 	if verified:
196 | 		url += query_structure['verified']
197 | 
198 | 	while has_more:
199 | 		res = req.get(url=url)
200 | 		if res:
201 | 			cursor, has_more = extract_timeline_cursor(response=res)
202 | 			if cursor:
203 | 				extracted_tweets = extract_timeline(res['items_html'])
204 | 				result.extend(extracted_tweets)
205 | 				url = url.replace("%TOKEN%", cursor)
206 | 				# check limitation
207 | 				if limit > 0:
208 | 					if len(result) > limit:
209 | 						return result[:limit]
210 | 				else:
211 | 					sleep(interval)
212 | 					continue
213 | 			else:
214 | 				break
215 | 			sleep(interval)
216 | 		else:
217 | 			return result
218 | 
219 | 	return result
220 | 


--------------------------------------------------------------------------------