├── .gitignore ├── LICENSE ├── README.md ├── Vagrantfile ├── java-api-backend ├── build.gradle └── src │ └── main │ ├── java │ └── scraper │ │ └── api │ │ ├── ScraperApiApplication.java │ │ ├── amqp │ │ ├── RabbitMqConfiguration.java │ │ ├── ScrapingResultConsumerConfiguration.java │ │ ├── ScrapingResultHandler.java │ │ ├── ScrapingResultMessage.java │ │ ├── TaskMessage.java │ │ ├── TaskProducer.java │ │ └── TaskProducerConfiguration.java │ │ ├── domain │ │ ├── Bookmark.java │ │ ├── BookmarkEventHandler.java │ │ └── BookmarkRepository.java │ │ └── filter │ │ └── CORSFilter.java │ └── resources │ └── application.properties ├── knockout-frontend ├── README.md ├── css │ └── style.css ├── index.html ├── js │ └── bookmark.js └── runserver.sh ├── python-scraping-service ├── requirements.txt ├── scraper.py └── worker.py └── vagrant-scripts ├── postgres.sh └── rabbitmq.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .vagrant 2 | .idea* 3 | .python* 4 | dataSources 5 | *.iml 6 | *.pyc 7 | *.class 8 | *.ids 9 | *.ipr 10 | *.iws 11 | python-scraping-service/venv 12 | java-api-backend/build 13 | java-api-backend/.gradle 14 | java-api-backend/java-api-backend.iws 15 | *.log 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scraping-microservice-java-python-rabbitmq 2 | 3 | A sample web scraping service demonstrating how to build a message-driven application using RabbitMQ. The application consists of three parts: a front end developed in Knockout.js, a Spring Boot Java API that the front end communicates with, and a Python microservice to which the API offloads scraping tasks. 4 | 5 | ![Architecture](http://www.bernhardwenzel.com/images/posts/scraper-microservice/scraper-microservice.jpg) 6 | 7 | Original blog post: -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # All Vagrant configuration is done below. The "2" in Vagrant.configure 5 | # configures the configuration version (we support older styles for 6 | # backwards compatibility). 
Please don't change it unless you know what 7 | # you're doing. 8 | Vagrant.configure("2") do |config| 9 | # The most common configuration options are documented and commented below. 10 | # For a complete reference, please see the online documentation at 11 | # https://docs.vagrantup.com. 12 | 13 | # Every Vagrant development environment requires a box. You can search for 14 | # boxes at https://atlas.hashicorp.com/search. 15 | config.vm.box = "ubuntu/trusty64" 16 | 17 | # Disable automatic box update checking. If you disable this, then 18 | # boxes will only be checked for updates when the user runs 19 | # `vagrant box outdated`. This is not recommended. 20 | # config.vm.box_check_update = false 21 | 22 | # Create a forwarded port mapping which allows access to a specific port 23 | # within the machine from a port on the host machine. In the example below, 24 | # accessing "localhost:8080" will access port 80 on the guest machine. 25 | config.vm.network "forwarded_port", guest: 80, host: 8080 26 | config.vm.network :forwarded_port, guest: 5432, host: 5432 27 | config.vm.network :forwarded_port, guest: 5672, host: 5672 28 | config.vm.network :forwarded_port, guest: 15672, host: 15672 29 | 30 | # Create a private network, which allows host-only access to the machine 31 | # using a specific IP. 32 | # config.vm.network "private_network", ip: "192.168.33.10" 33 | 34 | # Create a public network, which generally matched to bridged network. 35 | # Bridged networks make the machine appear as another physical device on 36 | # your network. 37 | # config.vm.network "public_network" 38 | 39 | # Share an additional folder to the guest VM. The first argument is 40 | # the path on the host to the actual folder. The second argument is 41 | # the path on the guest to mount the folder. And the optional third 42 | # argument is a set of non-required options. 
43 | # config.vm.synced_folder "../data", "/vagrant_data" 44 | 45 | # Provider-specific configuration so you can fine-tune various 46 | # backing providers for Vagrant. These expose provider-specific options. 47 | # Example for VirtualBox: 48 | # 49 | # config.vm.provider "virtualbox" do |vb| 50 | # # Display the VirtualBox GUI when booting the machine 51 | # vb.gui = true 52 | # 53 | # # Customize the amount of memory on the VM: 54 | # vb.memory = "1024" 55 | # end 56 | # 57 | # View the documentation for the provider you are using for more 58 | # information on available options. 59 | 60 | # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies 61 | # such as FTP and Heroku are also available. See the documentation at 62 | # https://docs.vagrantup.com/v2/push/atlas.html for more information. 63 | # config.push.define "atlas" do |push| 64 | # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" 65 | # end 66 | 67 | # Enable provisioning with a shell script. Additional provisioners such as 68 | # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the 69 | # documentation for more information about their specific syntax and use. 
70 | # config.vm.provision "shell", inline: <<-SHELL 71 | # apt-get update 72 | # apt-get install -y apache2 73 | # SHELL 74 | 75 | # Provision RabbitMq 76 | config.vm.provision "shell", path: "vagrant-scripts/rabbitmq.sh", args: ["user", "password"] 77 | 78 | 79 | # Provision PostgreSQL 80 | config.vm.provision "shell", path: "vagrant-scripts/postgres.sh", args: ["root", "root", "scrapingservicedb"] 81 | 82 | end 83 | -------------------------------------------------------------------------------- /java-api-backend/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'org.springframework.boot' version '1.5.1.RELEASE' 3 | } 4 | 5 | apply plugin: 'java' 6 | apply plugin: 'eclipse' 7 | apply plugin: 'idea' 8 | apply plugin: 'application' 9 | 10 | mainClassName = "scraper.api.ScraperApiApplication" 11 | 12 | applicationDefaultJvmArgs = [ 13 | "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005" 14 | ] 15 | 16 | jar { 17 | baseName = 'demo' 18 | version = '0.0.1-SNAPSHOT' 19 | } 20 | 21 | repositories { 22 | mavenCentral() 23 | } 24 | 25 | 26 | dependencies { 27 | compile("org.springframework.boot:spring-boot-starter-amqp") 28 | compile("org.springframework.boot:spring-boot-starter-data-jpa") 29 | compile("org.springframework.boot:spring-boot-starter-data-rest") 30 | compile("org.springframework.boot:spring-boot-starter-logging") 31 | compile("postgresql:postgresql:9.1-901-1.jdbc4") 32 | compile("com.fasterxml.jackson.core:jackson-databind:2.6.0") 33 | testCompile("org.springframework.boot:spring-boot-starter-test") 34 | } -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/ScraperApiApplication.java: -------------------------------------------------------------------------------- 1 | package scraper.api; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import 
org.springframework.boot.autoconfigure.SpringBootApplication; 5 | import org.springframework.boot.web.servlet.FilterRegistrationBean; 6 | import org.springframework.context.annotation.Bean; 7 | import scraper.api.domain.BookmarkEventHandler; 8 | import scraper.api.filter.CORSFilter; 9 | /** Spring Boot entry point of the scraper API; also declares the BookmarkEventHandler bean and registers CORSFilter as a servlet filter. */ 10 | @SpringBootApplication 11 | public class ScraperApiApplication 12 | { 13 | /** Starts the Spring application using this class as the primary source. */ 14 | public static void main(String[] args) 15 | { 16 | SpringApplication.run(ScraperApiApplication.class, args); 17 | } 18 | /** Exposes a BookmarkEventHandler instance as a bean. */ 19 | @Bean 20 | BookmarkEventHandler bookmarkEventHandler() 21 | { 22 | return new BookmarkEventHandler(); 23 | } 24 | /** Registers CORSFilter through a FilterRegistrationBean. NOTE(review): the method (and therefore bean) name mentions request logging although only the CORS filter is installed; it is left unchanged because renaming would change the bean name. */ 25 | @Bean 26 | public FilterRegistrationBean commonsRequestLoggingFilter() 27 | { 28 | final FilterRegistrationBean registrationBean = new FilterRegistrationBean(); 29 | registrationBean.setFilter(new CORSFilter()); 30 | return registrationBean; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/amqp/RabbitMqConfiguration.java: -------------------------------------------------------------------------------- 1 | package scraper.api.amqp; 2 | 3 | import org.springframework.amqp.core.AmqpAdmin; 4 | import org.springframework.amqp.rabbit.connection.CachingConnectionFactory; 5 | import org.springframework.amqp.rabbit.connection.ConnectionFactory; 6 | import org.springframework.amqp.rabbit.core.RabbitAdmin; 7 | import org.springframework.amqp.support.converter.DefaultClassMapper; 8 | import org.springframework.amqp.support.converter.Jackson2JsonMessageConverter; 9 | import org.springframework.amqp.support.converter.MessageConverter; 10 | import org.springframework.context.annotation.Bean; 11 | import org.springframework.context.annotation.Configuration; 12 | /** Declares the RabbitMQ connection factory, admin, JSON message converter and class mapper beans. */ 13 | @Configuration 14 | public class RabbitMqConfiguration 15 | { 16 | @Bean 17 | public ConnectionFactory connectionFactory() 18 | { 19 | CachingConnectionFactory connectionFactory = new 
CachingConnectionFactory("localhost"); /* host, credentials and port are hard-coded; user/password match the arguments the Vagrantfile passes to vagrant-scripts/rabbitmq.sh, and 5672 is one of its forwarded ports */ 20 | connectionFactory.setUsername("user"); 21 | connectionFactory.setPassword("password"); 22 | connectionFactory.setPort(5672); 23 | return connectionFactory; 24 | } 25 | /** RabbitAdmin built on connectionFactory(). */ 26 | @Bean 27 | public AmqpAdmin amqpAdmin() 28 | { 29 | return new RabbitAdmin(connectionFactory()); 30 | } 31 | 32 | /** Jackson-based JSON message converter that resolves target types through classMapper(). */ 33 | @Bean 34 | public MessageConverter jsonMessageConverter() 35 | { 36 | final Jackson2JsonMessageConverter converter = new Jackson2JsonMessageConverter(); 37 | converter.setClassMapper(classMapper()); 38 | return converter; 39 | } 40 | /** Class mapper whose default type is ScrapingResultMessage; the explicit id-to-class mapping below is disabled. */ 41 | @Bean 42 | public DefaultClassMapper classMapper() 43 | { 44 | DefaultClassMapper typeMapper = new DefaultClassMapper(); 45 | typeMapper.setDefaultType(ScrapingResultMessage.class); 46 | // Map<String, Class<?>> idClassMapping = new HashMap<String, Class<?>>(); 47 | // idClassMapping.put("scrapingResult", UnitAmqpMessage.class); 48 | // typeMapper.setIdClassMapping(idClassMapping); 49 | return typeMapper; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/amqp/ScrapingResultConsumerConfiguration.java: -------------------------------------------------------------------------------- 1 | package scraper.api.amqp; 2 | 3 | import org.springframework.amqp.core.Queue; 4 | import org.springframework.amqp.rabbit.core.RabbitTemplate; 5 | import org.springframework.amqp.rabbit.listener.SimpleMessageListenerContainer; 6 | import org.springframework.amqp.rabbit.listener.adapter.MessageListenerAdapter; 7 | import org.springframework.amqp.support.converter.DefaultClassMapper; 8 | import org.springframework.beans.factory.annotation.Autowired; 9 | import org.springframework.context.annotation.Bean; 10 | import org.springframework.context.annotation.Configuration; 11 | /** Consumer-side wiring for the "scrapingresult.queue" queue: template, queue bean, listener container and listener adapter. */ 12 | @Configuration 13 | public class ScrapingResultConsumerConfiguration extends RabbitMqConfiguration 14 | { 15 | protected final String scrapingResultQueue = "scrapingresult.queue"; 16 | 17 | 
@Autowired 18 | private ScrapingResultHandler scrapingResultHandler; 19 | 20 | @Bean 21 | public RabbitTemplate rabbitTemplate() { 22 | RabbitTemplate template = new RabbitTemplate(connectionFactory()); 23 | template.setRoutingKey(this.scrapingResultQueue); 24 | template.setQueue(this.scrapingResultQueue); 25 | template.setMessageConverter(jsonMessageConverter()); 26 | return template; 27 | } 28 | 29 | @Bean 30 | public Queue scrapingResultQueue() { 31 | return new Queue(this.scrapingResultQueue); 32 | } 33 | 34 | @Bean 35 | public SimpleMessageListenerContainer listenerContainer() { 36 | SimpleMessageListenerContainer container = new SimpleMessageListenerContainer(); 37 | container.setConnectionFactory(connectionFactory()); 38 | container.setQueueNames(this.scrapingResultQueue); 39 | container.setMessageListener(messageListenerAdapter()); 40 | 41 | return container; 42 | } 43 | 44 | @Bean 45 | public MessageListenerAdapter messageListenerAdapter() { 46 | return new MessageListenerAdapter(scrapingResultHandler, jsonMessageConverter()); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/amqp/ScrapingResultHandler.java: -------------------------------------------------------------------------------- 1 | package scraper.api.amqp; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.stereotype.Component; 5 | import scraper.api.domain.Bookmark; 6 | import scraper.api.domain.BookmarkRepository; 7 | 8 | import java.util.List; 9 | 10 | @Component 11 | public class ScrapingResultHandler 12 | { 13 | @Autowired 14 | private BookmarkRepository bookmarkRepository; 15 | 16 | public void handleMessage(ScrapingResultMessage scrapingResultMessage) 17 | { 18 | System.out.println("Received summary: " + scrapingResultMessage.getSummary()); 19 | final String url = scrapingResultMessage.getUrl(); 20 | final List bookmarks = 
bookmarkRepository.findByUrl(url); 21 | if (bookmarks.size() == 0) 22 | { 23 | System.out.println("No bookmark of url: " + url + " found."); 24 | } 25 | else 26 | { 27 | for (Bookmark bookmark : bookmarks) 28 | { 29 | bookmark.setSummary(scrapingResultMessage.getSummary()); 30 | bookmarkRepository.save(bookmarks); 31 | System.out.println("updated bookmark: " + url); 32 | } 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/amqp/ScrapingResultMessage.java: -------------------------------------------------------------------------------- 1 | package scraper.api.amqp; 2 | 3 | public class ScrapingResultMessage 4 | { 5 | private String url; 6 | private String summary; 7 | private String codeSnippets; 8 | 9 | public String getUrl() 10 | { 11 | return url; 12 | } 13 | 14 | public void setUrl(String url) 15 | { 16 | this.url = url; 17 | } 18 | 19 | public String getSummary() 20 | { 21 | return summary; 22 | } 23 | 24 | public void setSummary(String summary) 25 | { 26 | this.summary = summary; 27 | } 28 | 29 | public String getCodeSnippets() 30 | { 31 | return codeSnippets; 32 | } 33 | 34 | public void setCodeSnippets(String codeSnippets) 35 | { 36 | this.codeSnippets = codeSnippets; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/amqp/TaskMessage.java: -------------------------------------------------------------------------------- 1 | package scraper.api.amqp; 2 | 3 | public class TaskMessage 4 | { 5 | private String url; 6 | 7 | public String getUrl() 8 | { 9 | return url; 10 | } 11 | 12 | public void setUrl(String url) 13 | { 14 | this.url = url; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/amqp/TaskProducer.java: 
--------------------------------------------------------------------------------
package scraper.api.amqp;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

/**
 * Publishes scraping tasks through the template defined in {@link TaskProducerConfiguration}.
 */
@Component
public class TaskProducer
{
    @Autowired
    private TaskProducerConfiguration taskProducerConfiguration;

    /**
     * Converts the given task and sends it with the tasks queue name as routing key.
     *
     * @param taskMessage the task to publish
     */
    public void sendNewTask(TaskMessage taskMessage)
    {
        taskProducerConfiguration
                .rabbitTemplate()
                .convertAndSend(taskProducerConfiguration.tasksQueue, taskMessage);
    }

}

--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/TaskProducerConfiguration.java:
--------------------------------------------------------------------------------
package scraper.api.amqp;

import org.springframework.amqp.core.Queue;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Producer-side AMQP beans for the tasks queue.
 */
@Configuration
public class TaskProducerConfiguration extends RabbitMqConfiguration
{
    /** Name of the queue that scraping tasks are sent to. */
    protected final String tasksQueue = "tasks.queue";

    /**
     * Template bound to the tasks queue that uses the shared JSON converter.
     * NOTE(review): ScrapingResultConsumerConfiguration declares a bean method
     * with the same name; confirm the two definitions do not override each other.
     */
    @Bean
    public RabbitTemplate rabbitTemplate()
    {
        final RabbitTemplate tasksTemplate = new RabbitTemplate(connectionFactory());
        tasksTemplate.setMessageConverter(jsonMessageConverter());
        tasksTemplate.setQueue(this.tasksQueue);
        tasksTemplate.setRoutingKey(this.tasksQueue);
        return tasksTemplate;
    }

    /** Declares the queue named by {@code tasksQueue}. */
    @Bean
    public Queue tasksQueue()
    {
        final Queue queue = new Queue(this.tasksQueue);
        return queue;
    }
}

--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/domain/Bookmark.java:
--------------------------------------------------------------------------------
package scraper.api.domain;

import javax.persistence.*;
import java.util.Date;

6 | @Entity 7 | public class Bookmark 8 | { 9 | @Id 10 | @GeneratedValue(strategy = GenerationType.AUTO) 11 | private long id; 12 | 13 | @Column(nullable=false) 14 | private String url; 15 | 16 | @Column(columnDefinition = "Text") 17 | private String summary; 18 | 19 | @Column(nullable=false) 20 | private Date created; 21 | 22 | private String note; 23 | 24 | public Date getCreated() 25 | { 26 | return created; 27 | } 28 | 29 | public void setCreated(Date created) 30 | { 31 | this.created = created; 32 | } 33 | 34 | public String getUrl() 35 | { 36 | return url; 37 | } 38 | 39 | public void setUrl(String url) 40 | { 41 | this.url = url; 42 | } 43 | 44 | public String getSummary() 45 | { 46 | return summary; 47 | } 48 | 49 | public void setSummary(String summary) 50 | { 51 | this.summary = summary; 52 | } 53 | 54 | public String getNote() 55 | { 56 | return note; 57 | } 58 | 59 | public void setNote(String note) 60 | { 61 | this.note = note; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/domain/BookmarkEventHandler.java: -------------------------------------------------------------------------------- 1 | package scraper.api.domain; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.data.rest.core.annotation.HandleAfterCreate; 5 | import org.springframework.data.rest.core.annotation.HandleBeforeCreate; 6 | import org.springframework.data.rest.core.annotation.RepositoryEventHandler; 7 | import scraper.api.amqp.TaskMessage; 8 | import scraper.api.amqp.TaskProducer; 9 | 10 | import java.util.Date; 11 | 12 | @RepositoryEventHandler(Bookmark.class) 13 | public class BookmarkEventHandler 14 | { 15 | @Autowired 16 | private TaskProducer taskProducer; 17 | 18 | @HandleBeforeCreate 19 | public void handleBookmarkCreate(Bookmark bookmark) 20 | { 21 | bookmark.setCreated(new Date()); 22 | bookmark.setUrl(bookmark.getUrl().trim()); 
23 | 24 | } 25 | 26 | @HandleAfterCreate 27 | public void handleAfterBookmarkCreate(Bookmark bookmark) 28 | { 29 | final TaskMessage taskMessage = new TaskMessage(); 30 | taskMessage.setUrl(bookmark.getUrl()); 31 | taskProducer.sendNewTask(taskMessage); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/domain/BookmarkRepository.java: -------------------------------------------------------------------------------- 1 | package scraper.api.domain; 2 | 3 | import org.springframework.data.repository.CrudRepository; 4 | import org.springframework.data.repository.query.Param; 5 | import org.springframework.data.rest.core.annotation.RepositoryRestResource; 6 | import org.springframework.data.rest.core.annotation.RestResource; 7 | 8 | import java.util.List; 9 | 10 | @RepositoryRestResource 11 | public interface BookmarkRepository extends CrudRepository 12 | { 13 | @RestResource(path="url") 14 | List findByUrl(@Param("text") String url); 15 | } 16 | -------------------------------------------------------------------------------- /java-api-backend/src/main/java/scraper/api/filter/CORSFilter.java: -------------------------------------------------------------------------------- 1 | package scraper.api.filter; 2 | 3 | import org.springframework.stereotype.Component; 4 | 5 | import javax.servlet.*; 6 | import javax.servlet.http.HttpServletResponse; 7 | import java.io.IOException; 8 | 9 | @Component 10 | public class CORSFilter implements Filter 11 | { 12 | public void doFilter(ServletRequest req, ServletResponse res, FilterChain chain) throws IOException, ServletException 13 | { 14 | HttpServletResponse response = (HttpServletResponse) res; 15 | response.setHeader("Access-Control-Allow-Origin", "*"); 16 | response.setHeader("Access-Control-Allow-Methods", "POST, PUT, PATCH, GET, OPTIONS, DELETE"); 17 | response.setHeader("Access-Control-Max-Age", "3600"); 18 | 
response.setHeader("Access-Control-Allow-Headers", "x-requested-with"); 19 | response.setHeader("Access-Control-Allow-Headers", "Content-Type"); 20 | chain.doFilter(req, res); 21 | } 22 | 23 | public void init(FilterConfig filterConfig) 24 | { 25 | } 26 | 27 | public void destroy() 28 | { 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /java-api-backend/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | spring.datasource.driverClassName=org.postgresql.Driver 2 | spring.datasource.url=jdbc:postgresql://localhost:5432/scrapingservicedb 3 | spring.datasource.username=root 4 | spring.datasource.password=root 5 | 6 | spring.jpa.generate-ddl=true 7 | spring.jpa.show-sql=true 8 | spring.jpa.database-platform=org.hibernate.dialect.PostgreSQLDialect 9 | # Note: change this line to 10 | # spring.jpa.hibernate.ddl-auto=validate 11 | # to avoid data loss between restarts. 12 | spring.jpa.hibernate.ddl-auto=create -------------------------------------------------------------------------------- /knockout-frontend/README.md: -------------------------------------------------------------------------------- 1 | Serve `index.html` with 2 | 3 | knockout-frontend$ python -m SimpleHTTPServer 8090 -------------------------------------------------------------------------------- /knockout-frontend/css/style.css: -------------------------------------------------------------------------------- 1 | textarea { 2 | border: 1px solid #eee; 3 | font-size: 0.8em; 4 | color: #848484; 5 | } 6 | 7 | .summary { 8 | color: #7b7b7b; 9 | line-height: 1.3em; 10 | } -------------------------------------------------------------------------------- /knockout-frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Bookmarks frontend 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |

My bookmarks

15 | 16 |
17 |
18 |
19 | 20 | 21 |
22 | 23 |
24 |
25 |
26 | 27 | 28 |
29 | 31 |
32 |
33 |
34 |
35 | 36 |
37 |
38 |
39 |
40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 56 | 57 | 58 | 61 | 64 | 65 | 66 |
UrlNoteSummaryActions
54 | 55 |
59 | 60 | 62 | 63 |
67 | 68 |
69 | 70 | 71 | 72 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /knockout-frontend/js/bookmark.js: -------------------------------------------------------------------------------- 1 | 2 | // The bookmark model 3 | function Bookmark(selfHref, url, created, note, summary) { 4 | var self = this; 5 | self.selfHref = selfHref; 6 | self.url = ko.observable(url); 7 | self.created = created; 8 | self.note = ko.observable(note); 9 | self.summary = summary; 10 | } 11 | 12 | // The bookmark view model 13 | function BookmarkViewModel() { 14 | var self = this; 15 | 16 | self.newUrl = ko.observable(); 17 | self.newNote = ko.observable(); 18 | self.bookmarks = ko.observableArray([]); 19 | 20 | // add bookmark: send POST to bookmarks resource 21 | self.addBookmark = function () { 22 | // a little bit of pre-processing of user entered url and note 23 | var newUrl = self.newUrl(); 24 | if (typeof newUrl == "undefined") { 25 | alert("Url required"); 26 | return; 27 | } 28 | 29 | // prefix with http:// if not added by user 30 | if (newUrl.search(/^http[s]?\:\/\//) == -1) { 31 | newUrl = 'http://' + newUrl; 32 | } 33 | 34 | var newNote = self.newNote(); 35 | if (typeof newNote == "undefined") { 36 | newNote = ""; 37 | } 38 | 39 | // make POST request 40 | $.ajax("http://localhost:8080/bookmarks", { 41 | data: '{"url": "' + newUrl + ' ", "note": "' + newNote + '"}', 42 | type: "post", 43 | contentType: "application/json", 44 | success: function (allData) { 45 | self.loadBookmarks(); 46 | self.newUrl(""); 47 | self.newNote(""); 48 | } 49 | }); 50 | }; 51 | 52 | // update bookmark: send PUT to existing bookmarks resource 53 | self.updateBookmark = function (bookmark) { 54 | 55 | // same as in "addBookmark" a little bit of parameter checking. 
Some code duplication here 56 | // but we leave it for demonstration purposes 57 | var newUrl = bookmark.url(); 58 | if (typeof newUrl == "undefined") { 59 | alert("Url required"); 60 | return; 61 | } 62 | 63 | // prefix with http:// if not added by user 64 | if (newUrl.search(/^http[s]?\:\/\//) == -1) { 65 | newUrl = 'http://' + newUrl; 66 | } 67 | 68 | var newNote = bookmark.note(); 69 | if (typeof newNote == "undefined") { 70 | newNote = ""; 71 | } 72 | 73 | // make PUT request (or send PATCH then we don't need to include the created date) 74 | $.ajax(bookmark.selfHref, { 75 | data: '{"url": "' + newUrl + ' ", "note": "' + newNote + '", "created": "' + bookmark.created +'"}', 76 | type: "patch", 77 | contentType: "application/json", 78 | success: function (allData) { 79 | self.loadBookmarks(); 80 | } 81 | }); 82 | }; 83 | 84 | 85 | // delete bookmark: send DELETE to bookmarks resource 86 | self.deleteBookmark = function (bookmark) { 87 | $.ajax(bookmark.selfHref, { 88 | type: "delete", 89 | success: function (allData) { 90 | self.loadBookmarks(); 91 | } 92 | }); 93 | }; 94 | 95 | // load bookmarks from server: GET on bookmarks resource 96 | self.loadBookmarks = function () { 97 | $.ajax("http://localhost:8080/bookmarks", { 98 | type: "get", 99 | success: function (allData) { 100 | var json = ko.toJSON(allData); 101 | var parsed = JSON.parse(json); 102 | if (parsed._embedded) { 103 | var parsedBookmarks = parsed._embedded.bookmarks; 104 | var mappedBookmarks = $.map(parsedBookmarks, function (bookmark) { 105 | return new Bookmark(bookmark._links.self.href, bookmark.url, bookmark.created, bookmark.note, bookmark.summary) 106 | }); 107 | self.bookmarks(mappedBookmarks); 108 | } else { 109 | self.bookmarks([]); 110 | } 111 | 112 | } 113 | }); 114 | }; 115 | 116 | // Load initial data 117 | self.loadBookmarks(); 118 | } 119 | 120 | 121 | // Activates knockout.js 122 | ko.applyBindings(new BookmarkViewModel()); 
-------------------------------------------------------------------------------- /knockout-frontend/runserver.sh: -------------------------------------------------------------------------------- 1 | python -m SimpleHTTPServer 8090 2 | -------------------------------------------------------------------------------- /python-scraping-service/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.0 2 | breadability==0.1.20 3 | chardet==2.3.0 4 | docopt==0.6.2 5 | lxml==3.7.2 6 | numpy==1.12.0 7 | packaging==16.8 8 | pika==0.10.0 9 | pyparsing==2.1.10 10 | requests==2.13.0 11 | six==1.10.0 12 | sumy==0.5.1 13 | -------------------------------------------------------------------------------- /python-scraping-service/scraper.py: -------------------------------------------------------------------------------- 1 | from sumy.parsers.html import HtmlParser 2 | from sumy.nlp.tokenizers import Tokenizer 3 | from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer 4 | from sumy.nlp.stemmers import Stemmer 5 | from sumy.utils import get_stop_words 6 | 7 | class ScrapingResult: 8 | def __init__(self): 9 | self.url = None 10 | self.summary = None 11 | 12 | 13 | LANGUAGE = "english" 14 | SENTENCES_COUNT = 2 15 | 16 | 17 | class Scraper: 18 | 19 | def scrape(self, url): 20 | complete_url = url 21 | try: 22 | # get summary 23 | print "Retrieving page summary of %s... " % url 24 | 25 | parser = HtmlParser.from_url(complete_url, Tokenizer(LANGUAGE)) 26 | stemmer = Stemmer(LANGUAGE) 27 | 28 | summarizer = Summarizer(stemmer) 29 | summarizer.stop_words = get_stop_words(LANGUAGE) 30 | 31 | url_summary = ''.join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT)) 32 | 33 | except Exception, e: 34 | url_summary = "Could not scrape summary. 
Reason: %s" % e.message 35 | 36 | print "Done: %s = %s" % (url, url_summary) 37 | 38 | # create scraping result 39 | scraping_result = ScrapingResult() 40 | 41 | scraping_result.summary = url_summary 42 | scraping_result.url = url 43 | 44 | return scraping_result 45 | -------------------------------------------------------------------------------- /python-scraping-service/worker.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | from scraper import Scraper 4 | 5 | credentials = pika.PlainCredentials("user", "password") 6 | parameters = pika.ConnectionParameters(host='localhost', credentials=credentials) 7 | 8 | connection = pika.BlockingConnection(parameters) 9 | channel = connection.channel() 10 | tasks_queue = channel.queue_declare(queue='tasks.queue', durable=True) 11 | scraping_result_queue = channel.queue_declare(queue='scrapingresult.queue', durable=True) 12 | 13 | print ' [*] Waiting for tasks. To exit press CTRL+C' 14 | 15 | def publish_result(scraping_result): 16 | j = json.dumps(scraping_result.__dict__) 17 | properties = pika.BasicProperties(content_type="application/json") 18 | channel.basic_publish(exchange='', routing_key='scrapingresult.queue', body=j, properties=properties) 19 | 20 | def callback(ch, method, properties, body): 21 | url = json.loads(body)['url'] 22 | scraper = Scraper() 23 | result = scraper.scrape(url.strip()) 24 | publish_result(result) 25 | 26 | channel.basic_consume(callback, queue='tasks.queue', no_ack=True) 27 | channel.start_consuming() 28 | -------------------------------------------------------------------------------- /vagrant-scripts/postgres.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | # Thanks to https://github.com/jackdb/pg-app-dev-vm 4 | 5 | # Edit the following to change the name of the database user that will be created: 6 | APP_DB_USER=$1 7 | APP_DB_PASS=$2 8 | 9 | # Edit the 
# following to change the name of the database that is created (defaults to the user name)
# The fallback to the user name is applied here when no third argument is given.
APP_DB_NAME=${3:-$APP_DB_USER}

# Edit the following to change the version of PostgreSQL that is installed
PG_VERSION=9.4

###########################################################
# Changes below this line are probably not necessary
###########################################################

# Prints connection details and example commands for the provisioned database.
print_db_usage () {
  echo "Your PostgreSQL database has been setup and can be accessed on your local machine on the forwarded port (default: 15432)"
  echo " Host: localhost"
  echo " Port: 15432"
  echo " Database: $APP_DB_NAME"
  echo " Username: $APP_DB_USER"
  echo " Password: $APP_DB_PASS"
  echo ""
  echo "Admin access to postgres user via VM:"
  echo " vagrant ssh"
  echo " sudo su - postgres"
  echo ""
  echo "psql access to app database user via VM:"
  echo " vagrant ssh"
  echo " sudo su - postgres"
  echo " PGUSER=$APP_DB_USER PGPASSWORD=$APP_DB_PASS psql -h localhost $APP_DB_NAME"
  echo ""
  echo "Env variable for application development:"
  echo " DATABASE_URL=postgresql://$APP_DB_USER:$APP_DB_PASS@localhost:15432/$APP_DB_NAME"
  echo ""
  echo "Local command to access the database via psql:"
  echo " PGUSER=$APP_DB_USER PGPASSWORD=$APP_DB_PASS psql -h localhost -p 15432 $APP_DB_NAME"
}

export DEBIAN_FRONTEND=noninteractive

# The timestamp file marks a VM that was already provisioned; only print usage then.
PROVISIONED_ON=/etc/vm_provision_on_timestamp
if [ -f "$PROVISIONED_ON" ]
then
  echo "VM was already provisioned at: $(cat "$PROVISIONED_ON")"
  echo "To run system updates manually login via 'vagrant ssh' and run 'apt-get update && apt-get upgrade'"
  echo ""
  print_db_usage
  exit
fi

PG_REPO_APT_SOURCE=/etc/apt/sources.list.d/pgdg.list
if [ ! -f "$PG_REPO_APT_SOURCE" ]
then
  # Add PG apt repo:
  echo "deb http://apt.postgresql.org/pub/repos/apt/ trusty-pgdg main" > "$PG_REPO_APT_SOURCE"

  # Add PGDG repo key:
  wget --quiet -O - https://apt.postgresql.org/pub/repos/apt/ACCC4CF8.asc | apt-key add -
fi

# Update package list and upgrade all packages
apt-get update
apt-get -y upgrade

apt-get -y install "postgresql-$PG_VERSION" "postgresql-contrib-$PG_VERSION"

PG_CONF="/etc/postgresql/$PG_VERSION/main/postgresql.conf"
PG_HBA="/etc/postgresql/$PG_VERSION/main/pg_hba.conf"
PG_DIR="/var/lib/postgresql/$PG_VERSION/main"

# Edit postgresql.conf to change listen address to '*':
sed -i "s/#listen_addresses = 'localhost'/listen_addresses = '*'/" "$PG_CONF"

# Append to pg_hba.conf to add password auth:
echo "host all all all md5" >> "$PG_HBA"

# Explicitly set default client_encoding
echo "client_encoding = utf8" >> "$PG_CONF"

# Restart so that all new config is loaded:
service postgresql restart

cat << EOF | su - postgres -c psql
-- Create the database user:
CREATE USER $APP_DB_USER WITH PASSWORD '$APP_DB_PASS';

-- Create the database:
CREATE DATABASE $APP_DB_NAME WITH OWNER=$APP_DB_USER
 LC_COLLATE='en_US.utf8'
 LC_CTYPE='en_US.utf8'
 ENCODING='UTF8'
 TEMPLATE=template0;
EOF

# Tag the provision time:
date > "$PROVISIONED_ON"

echo "Successfully created PostgreSQL dev virtual machine."
echo ""
print_db_usage
--------------------------------------------------------------------------------
/vagrant-scripts/rabbitmq.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Installs RabbitMQ and creates a user.
# $1 = user name, $2 = password

echo ">>> Installing RabbitMQ"

apt-get -y install erlang-nox
wget https://www.rabbitmq.com/rabbitmq-release-signing-key.asc
apt-key add rabbitmq-release-signing-key.asc
echo "deb http://www.rabbitmq.com/debian/ testing main" > /etc/apt/sources.list.d/rabbitmq.list
apt-get update
apt-get install -y rabbitmq-server

# Quote the arguments so names or passwords with spaces or glob characters stay intact.
rabbitmqctl add_user "$1" "$2"
rabbitmqctl set_permissions -p / "$1" ".*" ".*" ".*"
--------------------------------------------------------------------------------