├── get_user_comments.py ├── LICENSE ├── README.md ├── example.py └── search_hn.py /get_user_comments.py: -------------------------------------------------------------------------------- 1 | # gets all (really the first 1000) of a user's comments (nicolashahn here) 2 | # outputs the cleaned comments to a text file, one comment per line 3 | 4 | from bs4 import BeautifulSoup 5 | from search_hn import SearchHN 6 | 7 | hn = SearchHN() 8 | 9 | 10 | def clean(text): 11 | return BeautifulSoup(text.replace("
", " ")).get_text().replace("\n", " ") 12 | 13 | 14 | comments = ( 15 | hn.comments() 16 | .author("nicolashahn") 17 | .max_hits_per_page() # 1000 items per query max 18 | .get() 19 | ) 20 | 21 | with open("nicolashahn_comments.txt", "w") as file: 22 | for comment in comments: 23 | cleaned_text = clean(comment.comment_text) 24 | print(cleaned_text) 25 | file.write(cleaned_text + "\n") 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Nicolas Hahn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-search-hn 2 | Search Hacker News with Python 3 | 4 | ``` 5 | from search_hn import SearchHN 6 | hn = SearchHN() 7 | ``` 8 | 9 | 10 | String together methods to build queries 11 | ``` 12 | results = (hn 13 | .search('bitcoin') # search query = 'bitcoin' 14 | .latest() # return newest first 15 | .stories() # stories only 16 | .get() # execute search 17 | ) 18 | for story in results: 19 | print(story.title) # each JSON result becomes object w/fields as attributes 20 | author = story.get_author() # and helpers to get related items 21 | ``` 22 | 23 | 24 | Or just use the non-composable methods for quick results 25 | ``` 26 | >>> print(hn.get_latest_stories()[0]) 27 | 28 | { '_tags': ['story', 'author_smacktoward', 'story_15383441'], 29 | 'author': 'smacktoward', 30 | 'title': 'Carrier Deployment Raises Questions About Navy’s Rash of ' 31 | 'Physiological Episodes', 32 | 'url': 'https://news.usni.org/2017/10/02/recent-carrier-deployment-raises-questions-navys-rash-physiological-episodes' 33 | ... 34 | ``` 35 | 36 | Get single item (story, comment, poll, etc) by ID or username 37 | ``` 38 | hn.get_item(1234) 39 | hn.get_user('nicolashahn') 40 | ``` 41 | 42 | [Example of how to turn the items returned by a query into a plaintext file, one item per line](get_user_comments.py) 43 | 44 | Check out the [source](search_hn.py#L178) to see available methods or [example.py](example.py) for more examples - better docs soon 45 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from search_hn import SearchHN 2 | 3 | hn = SearchHN() 4 | 5 | # Get a story, comment, or poll 6 | print(hn.item(1234).get()) 7 | """ 8 | { 'author': 'Alex3917', 9 | 'children': [ { 'author': 'pg', 10 | 'children': [ { 'author': 'Alex3917', 11 | ... 12 | """ 13 | 14 | # Get a user 15 | print(hn.user("nicolashahn").get()) 16 | """ 17 | { 'about': 'Full Stack Engineer at Distribute', 18 | 'avg': 0.0, 19 | 'comment_count': 87, 20 | 'created_at': '2013-03-07T21:01:00.000Z', 21 | ... 22 | """ 23 | 24 | # String together methods to build complex queries 25 | from datetime import datetime 26 | 27 | comments = ( 28 | hn.comments() 29 | # can be timestamp in seconds or datetime 30 | .created_between(datetime(2017, 9, 1), datetime(2017, 9, 30)) 31 | .author("nicolashahn") 32 | .get() 33 | ) 34 | for comment in comments: 35 | print(comment) 36 | """ 37 | [{ '_highlightResult': { 'author': { 'matchLevel': 'none', 38 | 'matchedWords': [], 39 | 'value': 'nicolashahn'}, 40 | 'comment_text': { 'matchLevel': 'none', 41 | 'matchedWords': [], 42 | 'value': "I don't know about " 43 | 'you or the guy ' 44 | 'above, but I pretty ' 45 | 'much always use Uber ' 46 | 'Pool/Lyft Line over ' 47 | 'the single rider ' 48 | ... 49 | 50 | """ 51 | 52 | # You can inspect the SearchHN object to get query parameters 53 | latest_btc = hn.search("bitcoin").stories().latest() 54 | print(latest_btc) 55 | """ 56 | SearchHN object: 57 | { 'base_url': 'http://hn.algolia.com/api/v1/search_by_date', 58 | ... 59 | 'param_obj': {'query': 'bitcoin', 'tags': ['story']}, 60 | ... 61 | """ 62 | 63 | # Or after execution to get the final query url 64 | stories = latest_btc.get(reset=False) 65 | print(latest_btc) 66 | """ 67 | SearchHN object: 68 | { ... 69 | 'full_url': 'http://hn.algolia.com/api/v1/search_by_date?query=bitcoin&tags=story', 70 | ... 71 | """ 72 | 73 | # top-level JSON fields accessible as attributes 74 | for story in stories: 75 | print(story.title) 76 | """ 77 | Blockstream CEO wants 25,000 BTC ($100M) bet over future of Bitcoin Segwit 1X/2X 78 | What the world's financial bigwigs think about Bitcoin 79 | Why Bitcoin and Ethereum will soon be everywhere (for reals) 80 | ... 81 | """ 82 | 83 | # get all comments from the latest "Who is hiring" thread: 84 | whoishiring = hn.get_latest_whoishiring_thread().get_story_comments() 85 | python_jobs = [post for post in whoishiring if "python" in post.comment_text.lower()] 86 | 87 | print( 88 | "{} jobs available, {} python jobs available".format( 89 | len(whoishiring), len(python_jobs) 90 | ) 91 | ) 92 | """ 93 | 70 jobs available, 45 python jobs available 94 | """ 95 | -------------------------------------------------------------------------------- /search_hn.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib 3 | import inspect 4 | import pprint 5 | import datetime 6 | 7 | hn_url = "http://hn.algolia.com/api/v1/" 8 | item_url = "{}items/".format(hn_url) 9 | user_url = "{}users/".format(hn_url) 10 | search_url = "{}search".format(hn_url) 11 | date_url = "{}_by_date".format(search_url) 12 | 13 | pp = pprint.PrettyPrinter(indent=4) 14 | 15 | max_hits_per_page = 1000 16 | 17 | 18 | def attr_list(obj): 19 | members = inspect.getmembers(obj, lambda a: not (inspect.isroutine(a))) 20 | return [m[0] for m in members if not (m[0].startswith("__"))] 21 | 22 | 23 | class QueryFailed(BaseException): 24 | """ Raised when the server request fails. """ 25 | 26 | 27 | class Hit(object): 28 | """basic hit object returned by search""" 29 | 30 | def __init__(self, **fields): 31 | for key in fields.keys(): 32 | setattr(self, key, fields[key]) 33 | 34 | def __repr__(self): 35 | return pp.pformat(self.json()) 36 | 37 | def json(self): 38 | return {attr: self.__dict__[attr] for attr in attr_list(self)} 39 | 40 | @classmethod 41 | def get_type_cls_from_fields(cls, fields): 42 | type_map = { 43 | "story": Story, 44 | "poll": Poll, 45 | "pollopt": PollOption, 46 | "comment": Comment, 47 | "user": User, 48 | } 49 | if "_tags" in fields: 50 | type_key = list(type_map.keys() & fields["_tags"])[0] 51 | return type_map[type_key] 52 | if "type" in fields: 53 | return type_map[fields["type"]] 54 | if "username" in fields: 55 | return User 56 | return cls 57 | 58 | @classmethod 59 | def make(cls, fields): 60 | type_cls = cls.get_type_cls_from_fields(fields) 61 | return type_cls(**fields) 62 | 63 | # 'public' instance methods - use these in your code 64 | 65 | def get_full(self): 66 | """get by id gives more information than a search list item for some types""" 67 | if hasattr(self, "objectID"): 68 | return SearchHN().item(self.objectID).get() 69 | 70 | def get_parent_object(self): 71 | if hasattr(self, "parent_id"): 72 | if self.parent_id: 73 | return SearchHN().item(self.parent_id).get() 74 | 75 | 76 | class Story(Hit): 77 | def get_author(self): 78 | return SearchHN().user(self.author).get() 79 | 80 | def get_story_comments(self): 81 | return SearchHN().story(self.objectID).comments().max_hits_per_page().get() 82 | 83 | 84 | class Poll(Story): 85 | def get_poll_options(self): 86 | # TODO this doesn't work because pollopts don't have a parent ref 87 | # return SearchHN().story(self.objectID).poll_options().get() 88 | raise NotImplementedError 89 | 90 | 91 | class PollOption(Hit): 92 | def get_parent_poll(self): 93 | return SearchHN().item(self.parent_id).get() 94 | pass 95 | 96 | 97 | class Comment(Hit): 98 | def get_parent_story(self): 99 | return SearchHN().item(self.story_id).get() 100 | 101 | 102 | class User(Hit): 103 | def get_full(self): 104 | if hasattr(self, "username"): 105 | return SearchHN().user(self.username).get() 106 | 107 | def get_user_comments(self): 108 | return SearchHN().author(self.username).comments().get() 109 | 110 | def get_user_stories(self): 111 | return SearchHN().author(self.username).stories().get() 112 | 113 | 114 | class SearchHN(object): 115 | """creates and executes the query""" 116 | 117 | def __init__(self): 118 | self.param_obj = {} 119 | self.base_url = search_url 120 | self.single_item = False 121 | self.full_url = None 122 | 123 | def __repr__(self): 124 | return "SearchHN object:\n{}".format(pp.pformat(self._json())) 125 | 126 | def _json(self): 127 | return {attr: self.__dict__[attr] for attr in attr_list(self)} 128 | 129 | def _add_numeric_filter(self, filter): 130 | if "numericFilters" not in self.param_obj: 131 | self.param_obj["numericFilters"] = [] 132 | self.param_obj["numericFilters"].append(filter) 133 | return self 134 | 135 | def _created_at_i(self, symbol, ts): 136 | if type(ts) == datetime.datetime: 137 | ts = ts.timestamp() 138 | created = "created_at_i{}{}".format(symbol, ts) 139 | return self._add_numeric_filter(created) 140 | 141 | def _add_tag(self, tag): 142 | if "tags" not in self.param_obj: 143 | self.param_obj["tags"] = [] 144 | self.param_obj["tags"].append(tag) 145 | return self 146 | 147 | def _single(self): 148 | """ Get single item/user by id/username, not searching. """ 149 | self.single_item = True 150 | return self 151 | 152 | def _add_request_fields(self, json): 153 | """ Add request's response fields after executing a search. """ 154 | for key in [k for k in json.keys() if k != "hits"]: 155 | setattr(self, key, json[key]) 156 | 157 | def _get_field_str(self, field): 158 | """ Because requests.get() does this wrong. """ 159 | result = "" 160 | if field in self.param_obj: 161 | result = "&{}=".format(field) + ",".join(self.param_obj[field]) 162 | self.param_obj.pop(field, None) 163 | return result 164 | 165 | def _get_full_url(self): 166 | tags_str = self._get_field_str("tags") 167 | numeric_str = self._get_field_str("numericFilters") 168 | param_str_minus_tags = urllib.parse.urlencode(self.param_obj) 169 | return "{}?{}{}{}".format( 170 | self.base_url, param_str_minus_tags, numeric_str, tags_str 171 | ) 172 | 173 | def _request(self): 174 | full_url = self._get_full_url() 175 | setattr(self, "full_url", full_url) 176 | return requests.get(full_url) 177 | 178 | # "public" methods - use these 179 | # design choice: should methods that don't take an arg besides self 180 | # have @property? Currently do not, for the sake of consistency 181 | 182 | def search(self, query_str): 183 | self.param_obj["query"] = query_str 184 | return self 185 | 186 | def min_points(self, points): 187 | return self._add_numeric_filter("points>={}".format(points)) 188 | 189 | def min_comments(self, comments): 190 | return self._add_numeric_filter("num_comments>={}".format(comments)) 191 | 192 | def latest(self): 193 | self.base_url = date_url 194 | return self 195 | 196 | def created_after(self, timestamp): 197 | return self._created_at_i(">", timestamp) 198 | 199 | def created_before(self, timestamp): 200 | return self._created_at_i("<", timestamp) 201 | 202 | def created_between(self, ts1, ts2): 203 | self.created_after(ts1) 204 | return self.created_before(ts2) 205 | 206 | def stories(self): 207 | return self._add_tag("story") 208 | 209 | def comments(self): 210 | return self._add_tag("comment") 211 | 212 | def polls(self): 213 | return self._add_tag("poll") 214 | 215 | def poll_options(self): 216 | return self._add_tag("pollopt") 217 | 218 | def author(self, author): 219 | return self._add_tag("author_{}".format(author)) 220 | 221 | def whoishiring_threads(self): 222 | return self.author("whoishiring").stories().search("hiring") 223 | 224 | def whowantstobehired_threads(self): 225 | return self.author("whoishiring").stories().search("hired") 226 | 227 | def story(self, story_id): 228 | return self._add_tag("story_{}".format(story_id)) 229 | 230 | def item(self, object_id): 231 | self.base_url = item_url + str(object_id) 232 | return self._single() 233 | 234 | def user(self, username): 235 | self.base_url = user_url + username 236 | return self._single() 237 | 238 | def hits_per_page(self, num_hits): 239 | self.param_obj["hitsPerPage"] = num_hits 240 | return self 241 | 242 | def max_hits_per_page(self): 243 | return self.hits_per_page(max_hits_per_page) 244 | 245 | def page(self, page_num): 246 | self.param_obj["page"] = page_num 247 | return self 248 | 249 | def reset(self): 250 | self = self.__init__() 251 | return self 252 | 253 | def get(self, reset=True): 254 | """ 255 | `reset` as kwarg because a user may want to use same query but 256 | increment page count. 257 | """ 258 | resp = self._request() 259 | if not resp.ok: 260 | raise QueryFailed 261 | if self.single_item: 262 | result = Hit.make(resp.json()) 263 | else: 264 | result = [Hit.make(hit) for hit in resp.json()["hits"]] 265 | self._add_request_fields(resp.json()) 266 | if reset: 267 | self.reset() 268 | return result 269 | 270 | def get_first(self): 271 | return self.hits_per_page(1).get()[0] 272 | 273 | # non composable - quick and easy results 274 | 275 | def get_item(self, item): 276 | return self.item(item).get() 277 | 278 | def get_user(self, username): 279 | return self.user(username).get() 280 | 281 | def get_latest_stories(self): 282 | return self.latest().stories().get() 283 | 284 | def search_stories(self, query): 285 | return self.search(query).get() 286 | 287 | def get_latest_comments(self): 288 | return self.latest().comments().get() 289 | 290 | def search_comments(self, query): 291 | return self.search(query).comments().get() 292 | 293 | def get_latest_whoishiring_thread(self): 294 | return self.whoishiring_threads().latest().get_first() 295 | 296 | def get_latest_whowantstobehired_thread(self): 297 | return self.whowantstobehired_threads().latest().get_first() 298 | 299 | 300 | if __name__ == "__main__": 301 | # TODO make cli 302 | hn = SearchHN() 303 | print(hn.latest().stories().get_first()) 304 | --------------------------------------------------------------------------------