├── Pipfile ├── Pipfile.lock ├── articles-20200620-114853.json ├── articles-20200620-123204.json ├── articles.txt ├── celerybeat-schedule.db ├── license.md ├── readme.md ├── requirements.txt ├── scraping.py └── tasks.py /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | celery = "*" 10 | requests = "*" 11 | bs4 = "*" 12 | 13 | [requires] 14 | python_version = "3.7" 15 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "8b43361e28fd0854444d7e4eae3b9dbf28408c1a456ed6cb08a1217ea7e2f8b4" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.7" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "amqp": { 20 | "hashes": [ 21 | "sha256:70cdb10628468ff14e57ec2f751c7aa9e48e7e3651cfd62d431213c0c4e58f21", 22 | "sha256:aa7f313fb887c91f15474c1229907a04dac0b8135822d6603437803424c0aa59" 23 | ], 24 | "version": "==2.6.1" 25 | }, 26 | "beautifulsoup4": { 27 | "hashes": [ 28 | "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", 29 | "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", 30 | "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" 31 | ], 32 | "version": "==4.9.3" 33 | }, 34 | "billiard": { 35 | "hashes": [ 36 | "sha256:bff575450859a6e0fbc2f9877d9b715b0bbc07c3565bb7ed2280526a0cdf5ede", 37 | "sha256:d91725ce6425f33a97dfa72fb6bfef0e47d4652acd98a032bd1a7fbf06d5fa6a" 38 | ], 39 | "version": "==3.6.3.0" 40 | }, 41 | "bs4": { 42 | "hashes": [ 43 | "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" 44 | ], 45 | "index": "pypi", 46 | "version": "==0.0.1" 47 | }, 48 | "celery": { 49 | "hashes": [ 50 | "sha256:c3f4173f83ceb5a5c986c5fdaefb9456de3b0729a72a5776e46bd405fda7b647", 51 | "sha256:d1762d6065522879f341c3d67c2b9fe4615eb79756d59acb1434601d4aca474b" 52 | ], 53 | "index": "pypi", 54 | "version": "==4.4.5" 55 | }, 56 | "certifi": { 57 | "hashes": [ 58 | "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", 59 | "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" 60 | ], 61 | "version": "==2020.12.5" 62 | }, 63 | "chardet": { 64 | "hashes": [ 65 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 66 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 67 | ], 68 | "version": "==3.0.4" 69 | }, 70 | "future": { 71 | "hashes": [ 72 | "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" 73 | ], 74 | "version": "==0.18.2" 75 | }, 76 | "idna": { 77 | "hashes": [ 78 | "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", 79 | "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" 80 | ], 81 | "version": "==2.10" 82 | }, 83 | "importlib-metadata": { 84 | "hashes": [ 85 | "sha256:c9db46394197244adf2f0b08ec5bc3cf16757e9590b02af1fca085c16c0d600a", 86 | "sha256:d2d46ef77ffc85cbf7dac7e81dd663fde71c45326131bea8033b9bad42268ebe" 87 | ], 88 | "markers": "python_version < '3.8'", 89 | "version": "==3.10.0" 90 | }, 91 | "kombu": { 92 | "hashes": [ 93 | 
"sha256:be48cdffb54a2194d93ad6533d73f69408486483d189fe9f5990ee24255b0e0a", 94 | "sha256:ca1b45faac8c0b18493d02a8571792f3c40291cf2bcf1f55afed3d8f3aa7ba74" 95 | ], 96 | "version": "==4.6.11" 97 | }, 98 | "lxml": { 99 | "hashes": [ 100 | "sha256:079f3ae844f38982d156efce585bc540c16a926d4436712cf4baee0cce487a3d", 101 | "sha256:0fbcf5565ac01dff87cbfc0ff323515c823081c5777a9fc7703ff58388c258c3", 102 | "sha256:122fba10466c7bd4178b07dba427aa516286b846b2cbd6f6169141917283aae2", 103 | "sha256:1b7584d421d254ab86d4f0b13ec662a9014397678a7c4265a02a6d7c2b18a75f", 104 | "sha256:26e761ab5b07adf5f555ee82fb4bfc35bf93750499c6c7614bd64d12aaa67927", 105 | "sha256:289e9ca1a9287f08daaf796d96e06cb2bc2958891d7911ac7cae1c5f9e1e0ee3", 106 | "sha256:2a9d50e69aac3ebee695424f7dbd7b8c6d6eb7de2a2eb6b0f6c7db6aa41e02b7", 107 | "sha256:33bb934a044cf32157c12bfcfbb6649807da20aa92c062ef51903415c704704f", 108 | "sha256:3439c71103ef0e904ea0a1901611863e51f50b5cd5e8654a151740fde5e1cade", 109 | "sha256:39b78571b3b30645ac77b95f7c69d1bffc4cf8c3b157c435a34da72e78c82468", 110 | "sha256:4289728b5e2000a4ad4ab8da6e1db2e093c63c08bdc0414799ee776a3f78da4b", 111 | "sha256:4bff24dfeea62f2e56f5bab929b4428ae6caba2d1eea0c2d6eb618e30a71e6d4", 112 | "sha256:542d454665a3e277f76954418124d67516c5f88e51a900365ed54a9806122b83", 113 | "sha256:5a0a14e264069c03e46f926be0d8919f4105c1623d620e7ec0e612a2e9bf1c04", 114 | "sha256:66e575c62792c3f9ca47cb8b6fab9e35bab91360c783d1606f758761810c9791", 115 | "sha256:74f7d8d439b18fa4c385f3f5dfd11144bb87c1da034a466c5b5577d23a1d9b51", 116 | "sha256:7610b8c31688f0b1be0ef882889817939490a36d0ee880ea562a4e1399c447a1", 117 | "sha256:76fa7b1362d19f8fbd3e75fe2fb7c79359b0af8747e6f7141c338f0bee2f871a", 118 | "sha256:7728e05c35412ba36d3e9795ae8995e3c86958179c9770e65558ec3fdfd3724f", 119 | "sha256:8157dadbb09a34a6bd95a50690595e1fa0af1a99445e2744110e3dca7831c4ee", 120 | "sha256:820628b7b3135403540202e60551e741f9b6d3304371712521be939470b454ec", 121 | "sha256:884ab9b29feaca361f7f88d811b1eea9bfca36cf3da27768d28ad45c3ee6f969", 122 | "sha256:89b8b22a5ff72d89d48d0e62abb14340d9e99fd637d046c27b8b257a01ffbe28", 123 | "sha256:92e821e43ad382332eade6812e298dc9701c75fe289f2a2d39c7960b43d1e92a", 124 | "sha256:b007cbb845b28db4fb8b6a5cdcbf65bacb16a8bd328b53cbc0698688a68e1caa", 125 | "sha256:bc4313cbeb0e7a416a488d72f9680fffffc645f8a838bd2193809881c67dd106", 126 | "sha256:bccbfc27563652de7dc9bdc595cb25e90b59c5f8e23e806ed0fd623755b6565d", 127 | "sha256:c4f05c5a7c49d2fb70223d0d5bcfbe474cf928310ac9fa6a7c6dddc831d0b1d4", 128 | "sha256:ce256aaa50f6cc9a649c51be3cd4ff142d67295bfc4f490c9134d0f9f6d58ef0", 129 | "sha256:d2e35d7bf1c1ac8c538f88d26b396e73dd81440d59c1ef8522e1ea77b345ede4", 130 | "sha256:df7c53783a46febb0e70f6b05df2ba104610f2fb0d27023409734a3ecbb78fb2", 131 | "sha256:efac139c3f0bf4f0939f9375af4b02c5ad83a622de52d6dfa8e438e8e01d0eb0", 132 | "sha256:efd7a09678fd8b53117f6bae4fa3825e0a22b03ef0a932e070c0bdbb3a35e654", 133 | "sha256:f2380a6376dfa090227b663f9678150ef27543483055cc327555fb592c5967e2", 134 | "sha256:f8380c03e45cf09f8557bdaa41e1fa7c81f3ae22828e1db470ab2a6c96d8bc23", 135 | "sha256:f90ba11136bfdd25cae3951af8da2e95121c9b9b93727b1b896e3fa105b2f586" 136 | ], 137 | "index": "pypi", 138 | "version": "==4.6.3" 139 | }, 140 | "pytz": { 141 | "hashes": [ 142 | "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da", 143 | "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798" 144 | ], 145 | "version": "==2021.1" 146 | }, 147 | "requests": { 148 | "hashes": [ 149 | 
"sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b", 150 | "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898" 151 | ], 152 | "index": "pypi", 153 | "version": "==2.24.0" 154 | }, 155 | "soupsieve": { 156 | "hashes": [ 157 | "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc", 158 | "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b" 159 | ], 160 | "markers": "python_version >= '3.0'", 161 | "version": "==2.2.1" 162 | }, 163 | "typing-extensions": { 164 | "hashes": [ 165 | "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918", 166 | "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c", 167 | "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f" 168 | ], 169 | "markers": "python_version < '3.8'", 170 | "version": "==3.7.4.3" 171 | }, 172 | "urllib3": { 173 | "hashes": [ 174 | "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2", 175 | "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e" 176 | ], 177 | "version": "==1.25.11" 178 | }, 179 | "vine": { 180 | "hashes": [ 181 | "sha256:133ee6d7a9016f177ddeaf191c1f58421a1dcc6ee9a42c58b34bed40e1d2cd87", 182 | "sha256:ea4947cc56d1fd6f2095c8d543ee25dad966f78692528e68b4fada11ba3f98af" 183 | ], 184 | "version": "==1.3.0" 185 | }, 186 | "zipp": { 187 | "hashes": [ 188 | "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76", 189 | "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098" 190 | ], 191 | "version": "==3.4.1" 192 | } 193 | }, 194 | "develop": {} 195 | } 196 | -------------------------------------------------------------------------------- /articles-20200620-114853.json: -------------------------------------------------------------------------------- 1 | [{"title": "Trump Expected to Suspend H-1B, Other Visas Until End of Year", "link": "https://www.npr.org/2020/06/20/881245867/trump-expected-to-suspend-h-1b-other-visas-until-end-of-year", "published": "Sat, 20 Jun 2020 18:17:24 +0000", "created_at": "2020-06-20 11:48:53.060362", "source": "HackerNews RSS"}, {"title": "Why Figma Wins", "link": "https://kwokchain.com/2020/06/19/why-figma-wins/", "published": "Sat, 20 Jun 2020 16:54:51 +0000", "created_at": "2020-06-20 11:48:53.060461", "source": "HackerNews RSS"}, {"title": "Hypercritical: The Art of the Possible", "link": "https://hypercritical.co/2020/06/20/the-art-of-the-possible", "published": "Sat, 20 Jun 2020 16:01:19 +0000", "created_at": "2020-06-20 11:48:53.060534", "source": "HackerNews RSS"}, {"title": "NimConf 2020 \u2013 Live now until 20:45 UTC [video]", "link": "https://conf.nim-lang.org/index.html?ref=hn", "published": "Sat, 20 Jun 2020 17:05:18 +0000", "created_at": "2020-06-20 11:48:53.060605", "source": "HackerNews RSS"}, {"title": "RSS Box \u2013 RSS for websites that do not support RSS", "link": "https://rssbox.herokuapp.com/", "published": "Sat, 20 Jun 2020 12:41:50 +0000", "created_at": "2020-06-20 11:48:53.060748", "source": "HackerNews RSS"}, {"title": "A Nvidia Engineer Wrote a Vulkan Driver That Works on Older Raspberry Pi", "link": "https://www.phoronix.com/scan.php?page=news_item&px=RPi-VK-Driver", "published": "Sat, 20 Jun 2020 15:19:56 +0000", "created_at": "2020-06-20 11:48:53.060837", "source": "HackerNews RSS"}, {"title": "\u201cUpon re-evaluation, we found that your app is not in compliance\u201d", "link": "https://twitter.com/keleftheriou/status/1274356729224892416", "published": 
"Sat, 20 Jun 2020 18:27:11 +0000", "created_at": "2020-06-20 11:48:53.060909", "source": "HackerNews RSS"}, {"title": "Discovering Dennis Ritchie\u2019s Lost Dissertation", "link": "https://computerhistory.org/blog/discovering-dennis-ritchies-lost-dissertation/", "published": "Sat, 20 Jun 2020 05:59:03 +0000", "created_at": "2020-06-20 11:48:53.061057", "source": "HackerNews RSS"}, {"title": "An Intro to Compilers", "link": "https://nicoleorchard.com/blog/compilers", "published": "Sat, 20 Jun 2020 06:48:29 +0000", "created_at": "2020-06-20 11:48:53.061129", "source": "HackerNews RSS"}, {"title": "Representing Graphs by Knuth Trees (1974) [pdf]", "link": "https://www.cs.virginia.edu/~jlp/75.knuth.trees.pdf", "published": "Sat, 20 Jun 2020 12:23:27 +0000", "created_at": "2020-06-20 11:48:53.061201", "source": "HackerNews RSS"}, {"title": "A Technology Preview of Nginx Support for QUIC and HTTP/3", "link": "https://www.nginx.com/blog/introducing-technology-preview-nginx-support-for-quic-http-3/", "published": "Sat, 20 Jun 2020 07:35:42 +0000", "created_at": "2020-06-20 11:48:53.061273", "source": "HackerNews RSS"}, {"title": "Estimating pitch with SPICE and Tensorflow Hub", "link": "https://blog.tensorflow.org/2020/06/estimating-pitch-with-spice-and-tensorflow-hub.html", "published": "Thu, 18 Jun 2020 10:45:47 +0000", "created_at": "2020-06-20 11:48:53.061344", "source": "HackerNews RSS"}, {"title": "Rediscovering the beauty of text on the internet", "link": "https://cheapskatesguide.org/articles/beauty-of-text.html", "published": "Fri, 19 Jun 2020 22:01:54 +0000", "created_at": "2020-06-20 11:48:53.061416", "source": "HackerNews RSS"}, {"title": "Anyone Can Build This Open Source, DRM-Free Kindle Alternative", "link": "https://www.vice.com/en_us/article/7x5kpb/anyone-can-build-this-open-source-drm-free-kindle-alternative", "published": "Sat, 20 Jun 2020 16:43:45 +0000", "created_at": "2020-06-20 11:48:53.061488", "source": "HackerNews RSS"}, {"title": "Building an online community around learning from incidents (2019)", "link": "https://www.learningfromincidents.io/blog/learning-from-incidents-in-software", "published": "Sat, 20 Jun 2020 14:21:09 +0000", "created_at": "2020-06-20 11:48:53.061561", "source": "HackerNews RSS"}, {"title": "Better Python 59 Ways", "link": "https://github.com/SigmaQuan/Better-Python-59-Ways", "published": "Sat, 20 Jun 2020 15:53:27 +0000", "created_at": "2020-06-20 11:48:53.061674", "source": "HackerNews RSS"}, {"title": "Why Apple ditched PowerPC, and what it says about Apple ditching Intel", "link": "https://tedium.co/2020/06/16/apple-powerpc-intel-transition-history/", "published": "Fri, 19 Jun 2020 11:14:57 +0000", "created_at": "2020-06-20 11:48:53.061747", "source": "HackerNews RSS"}, {"title": "CityLab has been relaunched under the Bloomberg umbrella", "link": "https://www.niemanlab.org/2020/06/citylab-has-been-relaunched-under-the-bloomberg-umbrella/", "published": "Sat, 20 Jun 2020 12:01:12 +0000", "created_at": "2020-06-20 11:48:53.061819", "source": "HackerNews RSS"}, {"title": "Osint Amateur Hour", "link": "https://www.secjuice.com/geolocation-osint-amateur-hour/", "published": "Fri, 19 Jun 2020 07:12:36 +0000", "created_at": "2020-06-20 11:48:53.061894", "source": "HackerNews RSS"}, {"title": "Ask HN: Google won't remove my site URL from random business using it on Maps", "link": "https://news.ycombinator.com/item?id=23582602", "published": "Sat, 20 Jun 2020 08:22:47 +0000", "created_at": "2020-06-20 11:48:53.061968", "source": "HackerNews RSS"}, 
{"title": "Show HN: Tragopan \u2013 Minimal, dependency-free pan/zoom JavaScript library", "link": "https://github.com/team-video/tragopan", "published": "Fri, 19 Jun 2020 21:36:46 +0000", "created_at": "2020-06-20 11:48:53.062040", "source": "HackerNews RSS"}, {"title": "Making an iPad case for blind to \u201cSee\u201d with touch", "link": "https://youtu.be/8Au47gnXs0w", "published": "Sat, 20 Jun 2020 16:15:40 +0000", "created_at": "2020-06-20 11:48:53.062112", "source": "HackerNews RSS"}, {"title": "Gameloft took down Marvel's SpiderMan trailer (11M views) with a copyright claim", "link": "https://www.reddit.com/r/PS5/comments/hcj9hx/gameloft_has_taken_down_marvels_spiderman_miles/", "published": "Sat, 20 Jun 2020 15:42:42 +0000", "created_at": "2020-06-20 11:48:53.062183", "source": "HackerNews RSS"}, {"title": "Freud and Faith (2007)", "link": "https://www.nytimes.com/2007/09/09/magazine/09wwln-lede-t.html", "published": "Sat, 20 Jun 2020 14:33:54 +0000", "created_at": "2020-06-20 11:48:53.062255", "source": "HackerNews RSS"}, {"title": "Mozart\u2019s Infinite Riches", "link": "https://standpointmag.co.uk/issues/may-june-2020/mozarts-infinite-riches/", "published": "Fri, 19 Jun 2020 22:04:14 +0000", "created_at": "2020-06-20 11:48:53.062327", "source": "HackerNews RSS"}, {"title": "Alex: A ML-enhanced range index", "link": "https://github.com/microsoft/ALEX", "published": "Fri, 19 Jun 2020 06:48:31 +0000", "created_at": "2020-06-20 11:48:53.062399", "source": "HackerNews RSS"}, {"title": "Written communication is remote work super power", "link": "https://snir.dev/blog/remote-async-communication/", "published": "Fri, 19 Jun 2020 18:08:45 +0000", "created_at": "2020-06-20 11:48:53.062472", "source": "HackerNews RSS"}, {"title": "Show HN: VOTEism \u2013 Secure political opinion poll app", "link": "https://docs.voteism.org/", "published": "Sat, 20 Jun 2020 16:03:21 +0000", "created_at": "2020-06-20 11:48:53.062544", "source": "HackerNews RSS"}, {"title": "When God asks you to build an OS", "link": "https://en.wikipedia.org/wiki/TempleOS", "published": "Sat, 20 Jun 2020 18:28:35 +0000", "created_at": "2020-06-20 11:48:53.062698", "source": "HackerNews RSS"}, {"title": "BBS and Usenet era Textfiles", "link": "http://textfiles.com/", "published": "Sat, 20 Jun 2020 08:40:28 +0000", "created_at": "2020-06-20 11:48:53.062773", "source": "HackerNews RSS"}] -------------------------------------------------------------------------------- /articles-20200620-123204.json: -------------------------------------------------------------------------------- 1 | [{"title": "Why Figma Wins", "link": "https://kwokchain.com/2020/06/19/why-figma-wins/", "published": "Sat, 20 Jun 2020 16:54:51 +0000", "created_at": "2020-06-20 12:32:04.518125", "source": "HackerNews RSS"}, {"title": "Homelessness: The Problem That Even Silicon Valley Can\u2019t Seem to Solve", "link": "https://computerhistory.org/blog/homelessness-the-problem-that-even-silicon-valley-cant-seem-to-solve/", "published": "Sat, 20 Jun 2020 18:17:55 +0000", "created_at": "2020-06-20 12:32:04.518209", "source": "HackerNews RSS"}, {"title": "Hypercritical: The Art of the Possible", "link": "https://hypercritical.co/2020/06/20/the-art-of-the-possible", "published": "Sat, 20 Jun 2020 16:01:19 +0000", "created_at": "2020-06-20 12:32:04.518284", "source": "HackerNews RSS"}, {"title": "NimConf 2020 \u2013 Live now until 20:45 UTC [video]", "link": "https://conf.nim-lang.org/index.html?ref=hn", "published": "Sat, 20 Jun 2020 17:05:18 +0000", "created_at": 
"2020-06-20 12:32:04.518357", "source": "HackerNews RSS"}, {"title": "RSS Box \u2013 RSS for websites that do not support RSS", "link": "https://rssbox.herokuapp.com/", "published": "Sat, 20 Jun 2020 12:41:50 +0000", "created_at": "2020-06-20 12:32:04.518513", "source": "HackerNews RSS"}, {"title": "\u201cUpon re-evaluation, we found that your app is not in compliance\u201d", "link": "https://twitter.com/keleftheriou/status/1274356729224892416", "published": "Sat, 20 Jun 2020 18:27:11 +0000", "created_at": "2020-06-20 12:32:04.518603", "source": "HackerNews RSS"}, {"title": "A Nvidia Engineer Wrote a Vulkan Driver That Works on Older Raspberry Pi", "link": "https://www.phoronix.com/scan.php?page=news_item&px=RPi-VK-Driver", "published": "Sat, 20 Jun 2020 15:19:56 +0000", "created_at": "2020-06-20 12:32:04.518766", "source": "HackerNews RSS"}, {"title": "The Most Famous Loop", "link": "https://alexdanco.com/2020/06/19/the-most-famous-loop/", "published": "Sat, 20 Jun 2020 11:13:22 +0000", "created_at": "2020-06-20 12:32:04.518839", "source": "HackerNews RSS"}, {"title": "Trump Expected to Suspend H-1B, Other Visas Until End of Year", "link": "https://www.npr.org/2020/06/20/881245867/trump-expected-to-suspend-h-1b-other-visas-until-end-of-year", "published": "Sat, 20 Jun 2020 18:17:24 +0000", "created_at": "2020-06-20 12:32:04.518910", "source": "HackerNews RSS"}, {"title": "Discovering Dennis Ritchie\u2019s Lost Dissertation", "link": "https://computerhistory.org/blog/discovering-dennis-ritchies-lost-dissertation/", "published": "Sat, 20 Jun 2020 05:59:03 +0000", "created_at": "2020-06-20 12:32:04.518982", "source": "HackerNews RSS"}, {"title": "An Intro to Compilers", "link": "https://nicoleorchard.com/blog/compilers", "published": "Sat, 20 Jun 2020 06:48:29 +0000", "created_at": "2020-06-20 12:32:04.519054", "source": "HackerNews RSS"}, {"title": "A Technology Preview of Nginx Support for QUIC and HTTP/3", "link": "https://www.nginx.com/blog/introducing-technology-preview-nginx-support-for-quic-http-3/", "published": "Sat, 20 Jun 2020 07:35:42 +0000", "created_at": "2020-06-20 12:32:04.519126", "source": "HackerNews RSS"}, {"title": "Representing Graphs by Knuth Trees (1974) [pdf]", "link": "https://www.cs.virginia.edu/~jlp/75.knuth.trees.pdf", "published": "Sat, 20 Jun 2020 12:23:27 +0000", "created_at": "2020-06-20 12:32:04.519198", "source": "HackerNews RSS"}, {"title": "Building an online community around learning from incidents (2019)", "link": "https://www.learningfromincidents.io/blog/learning-from-incidents-in-software", "published": "Sat, 20 Jun 2020 14:21:09 +0000", "created_at": "2020-06-20 12:32:04.519270", "source": "HackerNews RSS"}, {"title": "From Head to Toe in the Ancient Maya World", "link": "https://www.archaeology.org/issues/386-2007/features/8757-maya-clothing-jewelry-body-modification", "published": "Fri, 19 Jun 2020 22:48:26 +0000", "created_at": "2020-06-20 12:32:04.519370", "source": "HackerNews RSS"}, {"title": "Anyone Can Build This Open Source, DRM-Free Kindle Alternative", "link": "https://www.vice.com/en_us/article/7x5kpb/anyone-can-build-this-open-source-drm-free-kindle-alternative", "published": "Sat, 20 Jun 2020 16:43:45 +0000", "created_at": "2020-06-20 12:32:04.519519", "source": "HackerNews RSS"}, {"title": "Estimating pitch with SPICE and Tensorflow Hub", "link": "https://blog.tensorflow.org/2020/06/estimating-pitch-with-spice-and-tensorflow-hub.html", "published": "Thu, 18 Jun 2020 10:45:47 +0000", "created_at": "2020-06-20 12:32:04.519611", 
"source": "HackerNews RSS"}, {"title": "Rediscovering the beauty of text on the internet", "link": "https://cheapskatesguide.org/articles/beauty-of-text.html", "published": "Fri, 19 Jun 2020 22:01:54 +0000", "created_at": "2020-06-20 12:32:04.519683", "source": "HackerNews RSS"}, {"title": "Better Python 59 Ways", "link": "https://github.com/SigmaQuan/Better-Python-59-Ways", "published": "Sat, 20 Jun 2020 15:53:27 +0000", "created_at": "2020-06-20 12:32:04.519755", "source": "HackerNews RSS"}, {"title": "Freud and Faith (2007)", "link": "https://www.nytimes.com/2007/09/09/magazine/09wwln-lede-t.html", "published": "Sat, 20 Jun 2020 14:33:54 +0000", "created_at": "2020-06-20 12:32:04.519827", "source": "HackerNews RSS"}, {"title": "Why Apple ditched PowerPC, and what it says about Apple ditching Intel", "link": "https://tedium.co/2020/06/16/apple-powerpc-intel-transition-history/", "published": "Fri, 19 Jun 2020 11:14:57 +0000", "created_at": "2020-06-20 12:32:04.519899", "source": "HackerNews RSS"}, {"title": "Osint Amateur Hour", "link": "https://www.secjuice.com/geolocation-osint-amateur-hour/", "published": "Fri, 19 Jun 2020 07:12:36 +0000", "created_at": "2020-06-20 12:32:04.520001", "source": "HackerNews RSS"}, {"title": "Gameloft took down Marvel's SpiderMan trailer (11M views) with a copyright claim", "link": "https://www.reddit.com/r/PS5/comments/hcj9hx/gameloft_has_taken_down_marvels_spiderman_miles/", "published": "Sat, 20 Jun 2020 15:42:42 +0000", "created_at": "2020-06-20 12:32:04.520074", "source": "HackerNews RSS"}, {"title": "Show HN: Tragopan \u2013 Minimal, dependency-free pan/zoom JavaScript library", "link": "https://github.com/team-video/tragopan", "published": "Fri, 19 Jun 2020 21:36:46 +0000", "created_at": "2020-06-20 12:32:04.520147", "source": "HackerNews RSS"}, {"title": "Ask HN: Google won't remove my site URL from random business using it on Maps", "link": "https://news.ycombinator.com/item?id=23582602", "published": "Sat, 20 Jun 2020 08:22:47 +0000", "created_at": "2020-06-20 12:32:04.520219", "source": "HackerNews RSS"}, {"title": "Show HN: VOTEism \u2013 Secure political opinion poll app", "link": "https://docs.voteism.org/", "published": "Sat, 20 Jun 2020 16:03:21 +0000", "created_at": "2020-06-20 12:32:04.520291", "source": "HackerNews RSS"}, {"title": "Why you hate contemporary architecture (2017)", "link": "https://www.currentaffairs.org/2017/10/why-you-hate-contemporary-architecture", "published": "Sat, 20 Jun 2020 09:53:21 +0000", "created_at": "2020-06-20 12:32:04.520467", "source": "HackerNews RSS"}, {"title": "CityLab has been relaunched under the Bloomberg umbrella", "link": "https://www.niemanlab.org/2020/06/citylab-has-been-relaunched-under-the-bloomberg-umbrella/", "published": "Sat, 20 Jun 2020 12:01:12 +0000", "created_at": "2020-06-20 12:32:04.520583", "source": "HackerNews RSS"}, {"title": "Written communication is remote work super power", "link": "https://snir.dev/blog/remote-async-communication/", "published": "Fri, 19 Jun 2020 18:08:45 +0000", "created_at": "2020-06-20 12:32:04.520657", "source": "HackerNews RSS"}, {"title": "Making an iPad case for blind to \u201cSee\u201d with touch", "link": "https://youtu.be/8Au47gnXs0w", "published": "Sat, 20 Jun 2020 16:15:40 +0000", "created_at": "2020-06-20 12:32:04.520732", "source": "HackerNews RSS"}] -------------------------------------------------------------------------------- /articles.txt: -------------------------------------------------------------------------------- 1 | [{"title": "Wil 
Shipley: Every year I fill out this survey from Apple, for Apple developers", "link": "https://medianatives.blogspot.com/2020/06/wil-shipley-every-year-i-fill-out-this.html", "published": "Fri, 12 Jun 2020 21:47:02 +0000"}, {"title": "Biohacking Lite", "link": "https://karpathy.github.io/2020/06/11/biohacking-lite/", "published": "Fri, 12 Jun 2020 16:44:09 +0000"}, {"title": "The relationship between mindset and age", "link": "http://aging.nautil.us/feature/218/why-you-cant-help-but-act-your-age", "published": "Fri, 12 Jun 2020 21:27:36 +0000"}, {"title": "Jepsen: PostgreSQL 12.3", "link": "http://jepsen.io/analyses/postgresql-12.3", "published": "Fri, 12 Jun 2020 13:03:30 +0000"}, {"title": "The Early History of F# [pdf]", "link": "https://dl.acm.org/doi/pdf/10.1145/3386325", "published": "Sat, 13 Jun 2020 00:14:14 +0000"}, {"title": "Twilio Super Sim \u2013 Public Beta", "link": "https://www.twilio.com/docs/iot/supersim", "published": "Fri, 12 Jun 2020 16:23:36 +0000"}, {"title": "Show HN: Code Notes \u2013 A Gatsby theme for publishing code-related notes", "link": "https://zander.wtf/blog/code-notes-release", "published": "Sat, 13 Jun 2020 00:22:40 +0000"}, {"title": "Facebook fires employee who protested its inaction on Trump tweets", "link": "https://www.reuters.com/article/us-facebook-protests-firing/facebook-fires-employee-who-protested-its-inaction-on-trump-tweets-idUSKBN23J35Y", "published": "Fri, 12 Jun 2020 22:14:24 +0000"}, {"title": "The blissfully escapist comic novels of PG Wodehouse", "link": "https://www.bbc.com/culture/article/20200602-the-man-who-wrote-the-most-perfect-sentences-ever-written", "published": "Fri, 12 Jun 2020 00:11:29 +0000"}, {"title": "Forth implemented in Rust trait system", "link": "https://github.com/Ashymad/fortraith", "published": "Fri, 12 Jun 2020 17:21:12 +0000"}, {"title": "Crux: Open-source document database with bi-temporal graph queries", "link": "https://opencrux.com/", "published": "Thu, 11 Jun 2020 20:50:44 +0000"}, {"title": "Flexport is hiring people who want to help companies navigate Brexit in London", "link": "https://www.flexport.com/careers", "published": "Sat, 13 Jun 2020 00:21:47 +0000"}, {"title": "The Book of Shaders", "link": "https://thebookofshaders.com/", "published": "Fri, 12 Jun 2020 10:46:42 +0000"}, {"title": "Best practices for managing and storing secrets like API keys and credentials", "link": "https://blog.gitguardian.com/secrets-api-management/", "published": "Fri, 12 Jun 2020 15:54:37 +0000"}, {"title": "Officer of China\u2019s People\u2019s Liberation Army Arrested at Lax", "link": "https://www.justice.gov/usao-ndca/pr/officer-china-s-people-s-liberation-army-arrested-los-angeles-international-airport", "published": "Sat, 13 Jun 2020 00:33:53 +0000"}, {"title": "Venmo and Paypal Are Stalling Urgent Efforts to Bail People Out of Jail", "link": "https://www.vice.com/en_us/article/k7qbnz/venmo-paypal-freeze-transfer-limits-bail-funds", "published": "Fri, 12 Jun 2020 21:42:19 +0000"}, {"title": "How to Build a Universe That Doesn't Fall Apart Two Days Later, Philip K. 
Dick", "link": "https://web.archive.org/web/20080125030037/http://deoxy.org/pkd_how2build.htm", "published": "Fri, 12 Jun 2020 15:55:10 +0000"}, {"title": "Pgsodium: Modern cryptography for PostgreSQL using libsodium", "link": "https://github.com/michelp/pgsodium", "published": "Wed, 10 Jun 2020 12:17:39 +0000"}, {"title": "Bicycles from Sketches (2016)", "link": "http://www.gianlucagimini.it/prototypes/velocipedia.html", "published": "Thu, 11 Jun 2020 10:12:30 +0000"}, {"title": "The Map Is Not the Territory (2015)", "link": "https://fs.blog/2015/11/map-and-territory/", "published": "Thu, 11 Jun 2020 10:27:30 +0000"}, {"title": "Expresso: A simple expressions language with polymorphic extensible row types", "link": "https://github.com/willtim/Expresso", "published": "Fri, 12 Jun 2020 18:11:39 +0000"}, {"title": "Play Counter-Strike 1.6 in your browser", "link": "http://cs-online.club", "published": "Fri, 12 Jun 2020 08:30:56 +0000"}, {"title": "Show HN: Koyeb \u2013 Simple serverless processing workflows, on any cloud", "link": "https://www.koyeb.com/", "published": "Thu, 11 Jun 2020 13:55:52 +0000"}, {"title": "Brilliant Hardware in the Valley of the Software Slump", "link": "https://craigmod.com/essays/software_slump/", "published": "Fri, 12 Jun 2020 10:34:21 +0000"}, {"title": "EUA Authorized Serology Test Performance", "link": "https://www.fda.gov/medical-devices/emergency-situations-medical-devices/eua-authorized-serology-test-performance", "published": "Thu, 11 Jun 2020 23:29:33 +0000"}, {"title": "Show HN: ProsperStack \u2013 Use Stripe to automatically prevent cancellations", "link": "https://prosperstack.com/", "published": "Fri, 12 Jun 2020 17:13:07 +0000"}, {"title": "Disney almost bought Twitter in 2016", "link": "https://daringfireball.net/linked/2020/06/12/disney-twitter-bob-iger", "published": "Fri, 12 Jun 2020 20:36:00 +0000"}, {"title": "Extracting Structured Data from Templatic Documents", "link": "http://ai.googleblog.com/2020/06/extracting-structured-data-from.html", "published": "Fri, 12 Jun 2020 18:24:59 +0000"}, {"title": "Redefining the Scale of Social Problems [pdf]", "link": "https://homepages.se.edu/cvonbergen/files/2013/01/Small-Wins_Redefining-the-Scale-of-Social-Problems.pdf", "published": "Thu, 11 Jun 2020 10:20:06 +0000"}, {"title": "I Replaced My MacBook Pro with a Raspberry Pi 4 8GB for a Day", "link": "https://www.jeffgeerling.com/blog/2020/i-replaced-my-macbook-pro-raspberry-pi-4-8gb-day", "published": "Fri, 12 Jun 2020 19:59:30 +0000"}] -------------------------------------------------------------------------------- /celerybeat-schedule.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mattdood/web_scraping_example/735437133e23519c5d0f3fedece8af625c6c5617/celerybeat-schedule.db -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2020] [Matthew Wimberly] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above 
copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Building an RSS feed scraper with Python 2 | This project illustrates the basics of web scraping by pulling information from the [HackerNews RSS feed](https://news.ycombinator.com/rss). It starts with a simple, standalone web scraper in `scraping.py` (run directly with `python scraping.py`) and builds it into an automated scraping tool in `tasks.py`. 3 | 4 | ## Articles 5 | 6 | 1. Building an RSS feed scraper with Python is available [here](https://codeburst.io/building-an-rss-feed-scraper-with-python-73715ca06e1f). 7 | 8 | 2. Automated web scraping with Python and Celery is available [here](https://codeburst.io/automated-web-scraping-with-python-and-celery-ac02a4a9ce51). 9 | 10 | ## Automated scraping commands 11 | The following commands start the scheduled scraping defined in `tasks.py`. 12 | 13 | Start the RabbitMQ message broker (terminal #1): 14 | ``` 15 | rabbitmq-server 16 | ``` 17 | 18 | Start the Celery worker with the embedded beat scheduler (terminal #2): 19 | ``` 20 | celery -A tasks worker -B -l INFO 21 | ``` 22 | 23 | MIT License. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | beautifulsoup4 3 | celery 4 | lxml  # XML parser needed for BeautifulSoup's features='xml' -------------------------------------------------------------------------------- /scraping.py: -------------------------------------------------------------------------------- 1 | import requests # pulling data 2 | from bs4 import BeautifulSoup # xml parsing 3 | import json # exporting to files 4 | 5 | # save function 6 | def save_function(article_list): 7 | with open('articles.txt', 'w') as outfile: 8 | json.dump(article_list, outfile) 9 | 10 | # scraping function 11 | def hackernews_rss(): 12 | article_list = [] 13 | 14 | try: 15 | # execute my request, parse the data using XML 16 | # parser in BS4 17 | r = requests.get('https://news.ycombinator.com/rss') 18 | soup = BeautifulSoup(r.content, features='xml') 19 | 20 | # select only the "items" I want from the data 21 | articles = soup.findAll('item') 22 | 23 | # for each "item" I want, parse it into a list 24 | for a in articles: 25 | title = a.find('title').text 26 | link = a.find('link').text 27 | published = a.find('pubDate').text 28 | 29 | # create an "article" object with the data 30 | # from each "item" 31 | article = { 32 | 'title': title, 33 | 'link': link, 34 | 'published': published 35 | } 36 | 37 | # append my "article_list" with each "article" object 38 | article_list.append(article) 39 | 40 | # after the loop, dump my saved objects into a .txt file 41 | return save_function(article_list) 42 | except Exception as e: 43 | print('The scraping job failed. 
See exception:') 44 | print(e) 45 | 46 | print('Starting scraping') 47 | hackernews_rss() 48 | print('Finished scraping') 49 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | from celery.schedules import crontab # scheduler 3 | 4 | import requests # pulling data 5 | from bs4 import BeautifulSoup # xml parsing 6 | import json # exporting to files 7 | 8 | from datetime import datetime # for timestamps 9 | 10 | app = Celery('tasks') 11 | 12 | app.conf.timezone = 'UTC' 13 | 14 | app.conf.beat_schedule = { 15 | # executes every minute 16 | 'scraping-task-one-min': { 17 | 'task': 'tasks.hackernews_rss', 18 | 'schedule': crontab(), 19 | }, 20 | # # executes every 15 minutes 21 | # 'scraping-task-fifteen-min': { 22 | # 'task': 'tasks.hackernews_rss', 23 | # 'schedule': crontab(minute='*/15') 24 | # }, 25 | # # executes daily at midnight 26 | # 'scraping-task-midnight-daily': { 27 | # 'task': 'tasks.hackernews_rss', 28 | # 'schedule': crontab(minute=0, hour=0) 29 | # } 30 | } 31 | 32 | # save function 33 | @app.task 34 | def save_function(article_list): 35 | 36 | # timestamp and filename 37 | timestamp = datetime.now().strftime('%Y%m%d-%H%M%S') 38 | 39 | filename = 'articles-{}.json'.format(timestamp) 40 | 41 | # creating our articles file with timestamp 42 | with open(filename, 'w') as outfile: 43 | json.dump(article_list, outfile) 44 | 45 | # scraping function 46 | @app.task 47 | def hackernews_rss(): 48 | article_list = [] 49 | 50 | try: 51 | print('Starting the scraping tool') 52 | # execute my request, parse the data using XML 53 | # parser in BS4 54 | r = requests.get('https://news.ycombinator.com/rss') 55 | soup = BeautifulSoup(r.content, features='xml') 56 | 57 | # select only the "items" I want from the data 58 | articles = soup.findAll('item') 59 | 60 | # for each "item" I want, parse it into a list 61 | for a in articles: 62 | title = a.find('title').text 63 | link = a.find('link').text 64 | published = a.find('pubDate').text 65 | 66 | # create an "article" object with the data 67 | # from each "item" 68 | article = { 69 | 'title': title, 70 | 'link': link, 71 | 'published': published, 72 | 'created_at': str(datetime.now()), 73 | 'source': 'HackerNews RSS' 74 | } 75 | 76 | # append my "article_list" with each "article" object 77 | article_list.append(article) 78 | 79 | print('Finished scraping the articles') 80 | # after the loop, hand the list to the save task, which writes a timestamped .json file 81 | return save_function(article_list) 82 | except Exception as e: 83 | print('The scraping job failed. See exception:') 84 | print(e) 85 | --------------------------------------------------------------------------------
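One usage note that is not part of the repository itself: because both `scraping.py` and `tasks.py` write plain JSON, the scraped results can be inspected with the standard library alone. The sketch below is a hypothetical helper (the filename `inspect_articles.py`, the `load_articles` function, and the glob pattern are assumptions, not files from this project); it collects the timestamped `articles-*.json` files that `tasks.save_function` produces and prints each title and link, using the field names built in `tasks.hackernews_rss`.

```python
# inspect_articles.py -- illustrative sketch, not part of the original repository
import glob
import json


def load_articles(pattern='articles-*.json'):
    """Load every timestamped JSON file written by tasks.save_function."""
    articles = []
    for path in sorted(glob.glob(pattern)):
        with open(path) as infile:
            # each file holds a list of dicts with the keys
            # title, link, published, created_at, source
            articles.extend(json.load(infile))
    return articles


if __name__ == '__main__':
    for article in load_articles():
        print('{} -> {}'.format(article['title'], article['link']))
```

Run in the project directory after a few beat cycles, this lists every scraped headline once per scrape; deduplication (for example, keying on `link`) is deliberately left out to keep the sketch minimal.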