├── .gitignore
├── LICENSE
├── README.md
├── Vagrantfile
├── java-api-backend
├── build.gradle
└── src
│ └── main
│ ├── java
│ └── scraper
│ │ └── api
│ │ ├── ScraperApiApplication.java
│ │ ├── amqp
│ │ ├── RabbitMqConfiguration.java
│ │ ├── ScrapingResultConsumerConfiguration.java
│ │ ├── ScrapingResultHandler.java
│ │ ├── ScrapingResultMessage.java
│ │ ├── TaskMessage.java
│ │ ├── TaskProducer.java
│ │ └── TaskProducerConfiguration.java
│ │ ├── domain
│ │ ├── Bookmark.java
│ │ ├── BookmarkEventHandler.java
│ │ └── BookmarkRepository.java
│ │ └── filter
│ │ └── CORSFilter.java
│ └── resources
│ └── application.properties
├── knockout-frontend
├── README.md
├── css
│ └── style.css
├── index.html
├── js
│ └── bookmark.js
└── runserver.sh
├── python-scraping-service
├── requirements.txt
├── scraper.py
└── worker.py
└── vagrant-scripts
├── postgres.sh
└── rabbitmq.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .vagrant
2 | .idea*
3 | .python*
4 | dataSources
5 | *.iml
6 | *.pyc
7 | *.class
8 | *.ids
9 | *.ipr
10 | *.iws
11 | python-scraping-service/venv
12 | java-api-backend/build
13 | java-api-backend/.gradle
14 | java-api-backend/java-api-backend.iws
15 | *.log
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # scraping-microservice-java-python-rabbitmq
2 |
3 | A sample web scraping service demonstrating how to build a message-driven application using RabbitMQ. The application consists of three parts: a front-end written in Knockout.js, which communicates with a Spring Boot Java API, which in turn offloads scraping tasks to a Python microservice.
4 |
5 | 
6 |
7 | Original blog post:
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 |
4 | # All Vagrant configuration is done below. The "2" in Vagrant.configure
5 | # configures the configuration version (we support older styles for
6 | # backwards compatibility). Please don't change it unless you know what
7 | # you're doing.
Vagrant.configure("2") do |config|
  # The most common configuration options are documented and commented below.
  # For a complete reference, please see the online documentation at
  # https://docs.vagrantup.com.

  # Every Vagrant development environment requires a box. You can search for
  # boxes at https://atlas.hashicorp.com/search.
  config.vm.box = "ubuntu/trusty64"

  # Disable automatic box update checking. If you disable this, then
  # boxes will only be checked for updates when the user runs
  # `vagrant box outdated`. This is not recommended.
  # config.vm.box_check_update = false

  # Forwarded port mappings: each one exposes a guest port on the host.
  # Guest port 80 is reachable on the host as "localhost:8080".
  config.vm.network "forwarded_port", guest: 80, host: 8080

  # The remaining ports are forwarded one-to-one. The Java backend connects
  # to localhost:5432 (PostgreSQL) and localhost:5672 (RabbitMQ).
  # NOTE(review): 15672 is presumably the RabbitMQ management UI - confirm.
  [5432, 5672, 15672].each do |port|
    config.vm.network :forwarded_port, guest: port, host: port
  end

  # Create a private network, which allows host-only access to the machine
  # using a specific IP.
  # config.vm.network "private_network", ip: "192.168.33.10"

  # Create a public network, which generally matches a bridged network.
  # Bridged networks make the machine appear as another physical device on
  # your network.
  # config.vm.network "public_network"

  # Share an additional folder to the guest VM. The first argument is
  # the path on the host to the actual folder. The second argument is
  # the path on the guest to mount the folder. And the optional third
  # argument is a set of non-required options.
  # config.vm.synced_folder "../data", "/vagrant_data"

  # Provider-specific configuration so you can fine-tune various
  # backing providers for Vagrant. These expose provider-specific options.
  # Example for VirtualBox:
  #
  # config.vm.provider "virtualbox" do |vb|
  #   # Display the VirtualBox GUI when booting the machine
  #   vb.gui = true
  #
  #   # Customize the amount of memory on the VM:
  #   vb.memory = "1024"
  # end
  #
  # View the documentation for the provider you are using for more
  # information on available options.

  # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
  # such as FTP and Heroku are also available. See the documentation at
  # https://docs.vagrantup.com/v2/push/atlas.html for more information.
  # config.push.define "atlas" do |push|
  #   push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
  # end

  # Enable provisioning with a shell script. Additional provisioners such as
  # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
  # documentation for more information about their specific syntax and use.
  # config.vm.provision "shell", inline: <<-SHELL
  #   apt-get update
  #   apt-get install -y apache2
  # SHELL

  # Provision RabbitMQ.
  # NOTE(review): the two arguments are presumably the broker user name and
  # password (they match the credentials used by the Java backend) - confirm
  # against vagrant-scripts/rabbitmq.sh.
  config.vm.provision "shell",
                      path: "vagrant-scripts/rabbitmq.sh",
                      args: %w[user password]

  # Provision PostgreSQL.
  # NOTE(review): the arguments are presumably user, password and database
  # name (they match application.properties) - confirm against
  # vagrant-scripts/postgres.sh.
  config.vm.provision "shell",
                      path: "vagrant-scripts/postgres.sh",
                      args: %w[root root scrapingservicedb]
end
83 |
--------------------------------------------------------------------------------
/java-api-backend/build.gradle:
--------------------------------------------------------------------------------
// Gradle build for the Spring Boot API backend.
plugins {
    id 'org.springframework.boot' version '1.5.1.RELEASE'
    id 'java'
    id 'eclipse'
    id 'idea'
    id 'application'
}

// Entry point used by the 'application' plugin.
mainClassName = 'scraper.api.ScraperApiApplication'

// Starts the JVM with a JDWP debug agent listening on port 5005
// (server mode, does not suspend on startup).
applicationDefaultJvmArgs = [
        '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005'
]

jar {
    baseName = 'demo'
    version = '0.0.1-SNAPSHOT'
}

repositories {
    mavenCentral()
}

dependencies {
    // Spring Boot starters carry no explicit version here.
    compile 'org.springframework.boot:spring-boot-starter-amqp'
    compile 'org.springframework.boot:spring-boot-starter-data-jpa'
    compile 'org.springframework.boot:spring-boot-starter-data-rest'
    compile 'org.springframework.boot:spring-boot-starter-logging'

    // PostgreSQL JDBC driver.
    compile 'postgresql:postgresql:9.1-901-1.jdbc4'

    // NOTE(review): explicitly pinned Jackson version, unlike the starters
    // above - confirm the pin is intentional.
    compile 'com.fasterxml.jackson.core:jackson-databind:2.6.0'

    testCompile 'org.springframework.boot:spring-boot-starter-test'
}
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/ScraperApiApplication.java:
--------------------------------------------------------------------------------
1 | package scraper.api;
2 |
3 | import org.springframework.boot.SpringApplication;
4 | import org.springframework.boot.autoconfigure.SpringBootApplication;
5 | import org.springframework.boot.web.servlet.FilterRegistrationBean;
6 | import org.springframework.context.annotation.Bean;
7 | import scraper.api.domain.BookmarkEventHandler;
8 | import scraper.api.filter.CORSFilter;
9 |
10 | @SpringBootApplication
11 | public class ScraperApiApplication
12 | {
13 |
14 | public static void main(String[] args)
15 | {
16 | SpringApplication.run(ScraperApiApplication.class, args);
17 | }
18 |
19 | @Bean
20 | BookmarkEventHandler bookmarkEventHandler()
21 | {
22 | return new BookmarkEventHandler();
23 | }
24 |
25 | @Bean
26 | public FilterRegistrationBean commonsRequestLoggingFilter()
27 | {
28 | final FilterRegistrationBean registrationBean = new FilterRegistrationBean();
29 | registrationBean.setFilter(new CORSFilter());
30 | return registrationBean;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/RabbitMqConfiguration.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
3 | import org.springframework.amqp.core.AmqpAdmin;
4 | import org.springframework.amqp.rabbit.connection.CachingConnectionFactory;
5 | import org.springframework.amqp.rabbit.connection.ConnectionFactory;
6 | import org.springframework.amqp.rabbit.core.RabbitAdmin;
7 | import org.springframework.amqp.support.converter.DefaultClassMapper;
8 | import org.springframework.amqp.support.converter.Jackson2JsonMessageConverter;
9 | import org.springframework.amqp.support.converter.MessageConverter;
10 | import org.springframework.context.annotation.Bean;
11 | import org.springframework.context.annotation.Configuration;
12 |
13 | @Configuration
14 | public class RabbitMqConfiguration
15 | {
16 | @Bean
17 | public ConnectionFactory connectionFactory()
18 | {
19 | CachingConnectionFactory connectionFactory = new CachingConnectionFactory("localhost");
20 | connectionFactory.setUsername("user");
21 | connectionFactory.setPassword("password");
22 | connectionFactory.setPort(5672);
23 | return connectionFactory;
24 | }
25 |
26 | @Bean
27 | public AmqpAdmin amqpAdmin()
28 | {
29 | return new RabbitAdmin(connectionFactory());
30 | }
31 |
32 |
33 | @Bean
34 | public MessageConverter jsonMessageConverter()
35 | {
36 | final Jackson2JsonMessageConverter converter = new Jackson2JsonMessageConverter();
37 | converter.setClassMapper(classMapper());
38 | return converter;
39 | }
40 |
41 | @Bean
42 | public DefaultClassMapper classMapper()
43 | {
44 | DefaultClassMapper typeMapper = new DefaultClassMapper();
45 | typeMapper.setDefaultType(ScrapingResultMessage.class);
46 | // Map> idClassMapping = new HashMap>();
47 | // idClassMapping.put("scrapingResult", UnitAmqpMessage.class);
48 | // typeMapper.setIdClassMapping(idClassMapping);
49 | return typeMapper;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/ScrapingResultConsumerConfiguration.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
3 | import org.springframework.amqp.core.Queue;
4 | import org.springframework.amqp.rabbit.core.RabbitTemplate;
5 | import org.springframework.amqp.rabbit.listener.SimpleMessageListenerContainer;
6 | import org.springframework.amqp.rabbit.listener.adapter.MessageListenerAdapter;
7 | import org.springframework.amqp.support.converter.DefaultClassMapper;
8 | import org.springframework.beans.factory.annotation.Autowired;
9 | import org.springframework.context.annotation.Bean;
10 | import org.springframework.context.annotation.Configuration;
11 |
12 | @Configuration
13 | public class ScrapingResultConsumerConfiguration extends RabbitMqConfiguration
14 | {
15 | protected final String scrapingResultQueue = "scrapingresult.queue";
16 |
17 | @Autowired
18 | private ScrapingResultHandler scrapingResultHandler;
19 |
20 | @Bean
21 | public RabbitTemplate rabbitTemplate() {
22 | RabbitTemplate template = new RabbitTemplate(connectionFactory());
23 | template.setRoutingKey(this.scrapingResultQueue);
24 | template.setQueue(this.scrapingResultQueue);
25 | template.setMessageConverter(jsonMessageConverter());
26 | return template;
27 | }
28 |
29 | @Bean
30 | public Queue scrapingResultQueue() {
31 | return new Queue(this.scrapingResultQueue);
32 | }
33 |
34 | @Bean
35 | public SimpleMessageListenerContainer listenerContainer() {
36 | SimpleMessageListenerContainer container = new SimpleMessageListenerContainer();
37 | container.setConnectionFactory(connectionFactory());
38 | container.setQueueNames(this.scrapingResultQueue);
39 | container.setMessageListener(messageListenerAdapter());
40 |
41 | return container;
42 | }
43 |
44 | @Bean
45 | public MessageListenerAdapter messageListenerAdapter() {
46 | return new MessageListenerAdapter(scrapingResultHandler, jsonMessageConverter());
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/ScrapingResultHandler.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
3 | import org.springframework.beans.factory.annotation.Autowired;
4 | import org.springframework.stereotype.Component;
5 | import scraper.api.domain.Bookmark;
6 | import scraper.api.domain.BookmarkRepository;
7 |
8 | import java.util.List;
9 |
10 | @Component
11 | public class ScrapingResultHandler
12 | {
13 | @Autowired
14 | private BookmarkRepository bookmarkRepository;
15 |
16 | public void handleMessage(ScrapingResultMessage scrapingResultMessage)
17 | {
18 | System.out.println("Received summary: " + scrapingResultMessage.getSummary());
19 | final String url = scrapingResultMessage.getUrl();
20 | final List bookmarks = bookmarkRepository.findByUrl(url);
21 | if (bookmarks.size() == 0)
22 | {
23 | System.out.println("No bookmark of url: " + url + " found.");
24 | }
25 | else
26 | {
27 | for (Bookmark bookmark : bookmarks)
28 | {
29 | bookmark.setSummary(scrapingResultMessage.getSummary());
30 | bookmarkRepository.save(bookmarks);
31 | System.out.println("updated bookmark: " + url);
32 | }
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/ScrapingResultMessage.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
/**
 * Message payload describing the result of scraping one URL.
 *
 * Plain bean with three string properties: the scraped URL, a summary and
 * the code snippets. It is the default type of the class mapper configured
 * in RabbitMqConfiguration.
 */
public class ScrapingResultMessage {
    private String url;
    private String summary;
    private String codeSnippets;

    /** @return the URL this result belongs to */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL this result belongs to */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the summary text */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary text */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the code snippets */
    public String getCodeSnippets() {
        return this.codeSnippets;
    }

    /** @param codeSnippets the code snippets */
    public void setCodeSnippets(String codeSnippets) {
        this.codeSnippets = codeSnippets;
    }
}
39 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/TaskMessage.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
/**
 * Message payload describing a scraping task: carries only the URL.
 */
public class TaskMessage {
    private String url;

    /** @return the URL of the task */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL of the task */
    public void setUrl(String url) {
        this.url = url;
    }
}
17 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/TaskProducer.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
3 | import org.springframework.beans.factory.annotation.Autowired;
4 | import org.springframework.stereotype.Component;
5 |
6 | @Component
7 | public class TaskProducer
8 | {
9 | @Autowired
10 | private TaskProducerConfiguration taskProducerConfiguration;
11 |
12 | public void sendNewTask(TaskMessage taskMessage)
13 | {
14 | taskProducerConfiguration.rabbitTemplate().convertAndSend(taskProducerConfiguration.tasksQueue, taskMessage);
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/amqp/TaskProducerConfiguration.java:
--------------------------------------------------------------------------------
1 | package scraper.api.amqp;
2 |
3 | import org.springframework.amqp.core.Queue;
4 | import org.springframework.amqp.rabbit.core.RabbitTemplate;
5 | import org.springframework.context.annotation.Bean;
6 | import org.springframework.context.annotation.Configuration;
7 |
8 | @Configuration
9 | public class TaskProducerConfiguration extends RabbitMqConfiguration
10 | {
11 | protected final String tasksQueue = "tasks.queue";
12 |
13 | @Bean
14 | public RabbitTemplate rabbitTemplate()
15 | {
16 | RabbitTemplate template = new RabbitTemplate(connectionFactory());
17 | template.setRoutingKey(this.tasksQueue);
18 | template.setQueue(this.tasksQueue);
19 | template.setMessageConverter(jsonMessageConverter());
20 | return template;
21 | }
22 |
23 | @Bean
24 | public Queue tasksQueue()
25 | {
26 | return new Queue(this.tasksQueue);
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/domain/Bookmark.java:
--------------------------------------------------------------------------------
1 | package scraper.api.domain;
2 |
3 | import javax.persistence.*;
4 | import java.util.Date;
5 |
6 | @Entity
7 | public class Bookmark
8 | {
9 | @Id
10 | @GeneratedValue(strategy = GenerationType.AUTO)
11 | private long id;
12 |
13 | @Column(nullable=false)
14 | private String url;
15 |
16 | @Column(columnDefinition = "Text")
17 | private String summary;
18 |
19 | @Column(nullable=false)
20 | private Date created;
21 |
22 | private String note;
23 |
24 | public Date getCreated()
25 | {
26 | return created;
27 | }
28 |
29 | public void setCreated(Date created)
30 | {
31 | this.created = created;
32 | }
33 |
34 | public String getUrl()
35 | {
36 | return url;
37 | }
38 |
39 | public void setUrl(String url)
40 | {
41 | this.url = url;
42 | }
43 |
44 | public String getSummary()
45 | {
46 | return summary;
47 | }
48 |
49 | public void setSummary(String summary)
50 | {
51 | this.summary = summary;
52 | }
53 |
54 | public String getNote()
55 | {
56 | return note;
57 | }
58 |
59 | public void setNote(String note)
60 | {
61 | this.note = note;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/domain/BookmarkEventHandler.java:
--------------------------------------------------------------------------------
1 | package scraper.api.domain;
2 |
3 | import org.springframework.beans.factory.annotation.Autowired;
4 | import org.springframework.data.rest.core.annotation.HandleAfterCreate;
5 | import org.springframework.data.rest.core.annotation.HandleBeforeCreate;
6 | import org.springframework.data.rest.core.annotation.RepositoryEventHandler;
7 | import scraper.api.amqp.TaskMessage;
8 | import scraper.api.amqp.TaskProducer;
9 |
10 | import java.util.Date;
11 |
12 | @RepositoryEventHandler(Bookmark.class)
13 | public class BookmarkEventHandler
14 | {
15 | @Autowired
16 | private TaskProducer taskProducer;
17 |
18 | @HandleBeforeCreate
19 | public void handleBookmarkCreate(Bookmark bookmark)
20 | {
21 | bookmark.setCreated(new Date());
22 | bookmark.setUrl(bookmark.getUrl().trim());
23 |
24 | }
25 |
26 | @HandleAfterCreate
27 | public void handleAfterBookmarkCreate(Bookmark bookmark)
28 | {
29 | final TaskMessage taskMessage = new TaskMessage();
30 | taskMessage.setUrl(bookmark.getUrl());
31 | taskProducer.sendNewTask(taskMessage);
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/domain/BookmarkRepository.java:
--------------------------------------------------------------------------------
1 | package scraper.api.domain;
2 |
3 | import org.springframework.data.repository.CrudRepository;
4 | import org.springframework.data.repository.query.Param;
5 | import org.springframework.data.rest.core.annotation.RepositoryRestResource;
6 | import org.springframework.data.rest.core.annotation.RestResource;
7 |
8 | import java.util.List;
9 |
10 | @RepositoryRestResource
11 | public interface BookmarkRepository extends CrudRepository
12 | {
13 | @RestResource(path="url")
14 | List findByUrl(@Param("text") String url);
15 | }
16 |
--------------------------------------------------------------------------------
/java-api-backend/src/main/java/scraper/api/filter/CORSFilter.java:
--------------------------------------------------------------------------------
1 | package scraper.api.filter;
2 |
3 | import org.springframework.stereotype.Component;
4 |
5 | import javax.servlet.*;
6 | import javax.servlet.http.HttpServletResponse;
7 | import java.io.IOException;
8 |
9 | @Component
10 | public class CORSFilter implements Filter
11 | {
12 | public void doFilter(ServletRequest req, ServletResponse res, FilterChain chain) throws IOException, ServletException
13 | {
14 | HttpServletResponse response = (HttpServletResponse) res;
15 | response.setHeader("Access-Control-Allow-Origin", "*");
16 | response.setHeader("Access-Control-Allow-Methods", "POST, PUT, PATCH, GET, OPTIONS, DELETE");
17 | response.setHeader("Access-Control-Max-Age", "3600");
18 | response.setHeader("Access-Control-Allow-Headers", "x-requested-with");
19 | response.setHeader("Access-Control-Allow-Headers", "Content-Type");
20 | chain.doFilter(req, res);
21 | }
22 |
23 | public void init(FilterConfig filterConfig)
24 | {
25 | }
26 |
27 | public void destroy()
28 | {
29 | }
30 |
31 | }
--------------------------------------------------------------------------------
/java-api-backend/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | spring.datasource.driverClassName=org.postgresql.Driver
2 | spring.datasource.url=jdbc:postgresql://localhost:5432/scrapingservicedb
3 | spring.datasource.username=root
4 | spring.datasource.password=root
5 |
6 | spring.jpa.generate-ddl=true
7 | spring.jpa.show-sql=true
8 | spring.jpa.database-platform=org.hibernate.dialect.PostgreSQLDialect
9 | # Note: "create" drops and re-creates the schema on every startup. Change the line below to
10 | # spring.jpa.hibernate.ddl-auto=validate
11 | # to avoid data loss between restarts.
12 | spring.jpa.hibernate.ddl-auto=create
--------------------------------------------------------------------------------
/knockout-frontend/README.md:
--------------------------------------------------------------------------------
1 | Serve `index.html` with
2 |
3 | knockout-frontend$ python -m SimpleHTTPServer 8090
--------------------------------------------------------------------------------
/knockout-frontend/css/style.css:
--------------------------------------------------------------------------------
1 | textarea {
2 | border: 1px solid #eee;
3 | font-size: 0.8em;
4 | color: #848484;
5 | }
6 |
7 | .summary {
8 | color: #7b7b7b;
9 | line-height: 1.3em;
10 | }
--------------------------------------------------------------------------------
/knockout-frontend/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Bookmarks frontend
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
My bookmarks
15 |
16 |
17 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
Url
46 |
Note
47 |
Summary
48 |
Actions
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
/knockout-frontend/js/bookmark.js:
--------------------------------------------------------------------------------
1 |
2 | // The bookmark model
/**
 * Client-side bookmark model.
 *
 * `url` and `note` are wrapped in knockout observables; `selfHref`, `created`
 * and `summary` are stored as plain values.
 */
function Bookmark(selfHref, url, created, note, summary) {
    this.selfHref = selfHref;
    this.url = ko.observable(url);
    this.created = created;
    this.note = ko.observable(note);
    this.summary = summary;
}
11 |
12 | // The bookmark view model
/**
 * Knockout view model for the bookmark list.
 *
 * Exposes the observables `newUrl`, `newNote` and `bookmarks` plus the actions
 * `addBookmark`, `updateBookmark`, `deleteBookmark` and `loadBookmarks`, each of
 * which talks to the bookmarks REST resource via jQuery's `$.ajax`.
 */
function BookmarkViewModel() {
    var self = this;
    var BOOKMARKS_URL = "http://localhost:8080/bookmarks";

    self.newUrl = ko.observable();
    self.newNote = ko.observable();
    self.bookmarks = ko.observableArray([]);

    // Trims a user-entered url and prefixes it with http:// when no scheme was
    // given. Returns null when nothing usable was entered.
    function normalizeUrl(rawUrl) {
        var url = (rawUrl === undefined || rawUrl === null) ? "" : String(rawUrl).trim();
        if (url === "") {
            return null;
        }
        if (url.search(/^http[s]?\:\/\//) == -1) {
            url = 'http://' + url;
        }
        return url;
    }

    // Maps a missing note to the empty string.
    function normalizeNote(rawNote) {
        return (rawNote === undefined || rawNote === null) ? "" : rawNote;
    }

    // add bookmark: send POST to bookmarks resource
    self.addBookmark = function () {
        var newUrl = normalizeUrl(self.newUrl());
        if (newUrl === null) {
            alert("Url required");
            return;
        }

        // make POST request; JSON.stringify escapes quotes, backslashes and
        // newlines in the user's input, which string concatenation did not
        $.ajax(BOOKMARKS_URL, {
            data: JSON.stringify({url: newUrl, note: normalizeNote(self.newNote())}),
            type: "post",
            contentType: "application/json",
            success: function (allData) {
                self.loadBookmarks();
                self.newUrl("");
                self.newNote("");
            }
        });
    };

    // update bookmark: send PATCH to the existing bookmark resource
    self.updateBookmark = function (bookmark) {
        var newUrl = normalizeUrl(bookmark.url());
        if (newUrl === null) {
            alert("Url required");
            return;
        }

        // make PATCH request; the created date is sent along as a string,
        // exactly as received from the server
        $.ajax(bookmark.selfHref, {
            data: JSON.stringify({
                url: newUrl,
                note: normalizeNote(bookmark.note()),
                created: String(bookmark.created)
            }),
            type: "patch",
            contentType: "application/json",
            success: function (allData) {
                self.loadBookmarks();
            }
        });
    };


    // delete bookmark: send DELETE to bookmarks resource
    self.deleteBookmark = function (bookmark) {
        $.ajax(bookmark.selfHref, {
            type: "delete",
            success: function (allData) {
                self.loadBookmarks();
            }
        });
    };

    // load bookmarks from server: GET on bookmarks resource
    self.loadBookmarks = function () {
        $.ajax(BOOKMARKS_URL, {
            type: "get",
            success: function (allData) {
                var json = ko.toJSON(allData);
                var parsed = JSON.parse(json);
                if (parsed._embedded) {
                    var parsedBookmarks = parsed._embedded.bookmarks;
                    var mappedBookmarks = $.map(parsedBookmarks, function (bookmark) {
                        return new Bookmark(bookmark._links.self.href, bookmark.url, bookmark.created, bookmark.note, bookmark.summary);
                    });
                    self.bookmarks(mappedBookmarks);
                } else {
                    // no "_embedded" section in the response: show an empty list
                    self.bookmarks([]);
                }

            }
        });
    };

    // Load initial data
    self.loadBookmarks();
}
119 |
120 |
121 | // Activates knockout.js
122 | ko.applyBindings(new BookmarkViewModel());
--------------------------------------------------------------------------------
/knockout-frontend/runserver.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Serves the knockout frontend (index.html and its assets) on port 8090.
set -e

# Serve the directory this script lives in, regardless of where it is called from.
cd "$(dirname "$0")"

# SimpleHTTPServer only exists in Python 2; under Python 3 the same server
# is provided by the http.server module.
if command -v python3 >/dev/null 2>&1; then
    exec python3 -m http.server 8090
else
    exec python -m SimpleHTTPServer 8090
fi
2 |
--------------------------------------------------------------------------------
/python-scraping-service/requirements.txt:
--------------------------------------------------------------------------------
1 | appdirs==1.4.0
2 | breadability==0.1.20
3 | chardet==2.3.0
4 | docopt==0.6.2
5 | lxml==3.7.2
6 | numpy==1.12.0
7 | packaging==16.8
8 | pika==0.10.0
9 | pyparsing==2.1.10
10 | requests==2.13.0
11 | six==1.10.0
12 | sumy==0.5.1
13 |
--------------------------------------------------------------------------------
/python-scraping-service/scraper.py:
--------------------------------------------------------------------------------
1 | from sumy.parsers.html import HtmlParser
2 | from sumy.nlp.tokenizers import Tokenizer
3 | from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
4 | from sumy.nlp.stemmers import Stemmer
5 | from sumy.utils import get_stop_words
6 |
class ScrapingResult:
    """Result of scraping one page: the page's url and its text summary.

    Both values are stored as plain instance attributes, so the instance's
    ``__dict__`` contains exactly the keys ``url`` and ``summary``.
    """

    def __init__(self, url=None, summary=None):
        """Create a result; both fields default to ``None`` and may be set later."""
        self.url = url
        self.summary = summary
11 |
12 |
13 | LANGUAGE = "english"
14 | SENTENCES_COUNT = 2
15 |
16 |
class Scraper:
    """Fetches a web page and summarizes it in SENTENCES_COUNT sentences."""

    def scrape(self, url):
        """Summarize the page at ``url``.

        Any exception raised while fetching or summarizing is caught and
        reported in the summary text instead of being propagated, so a
        ``ScrapingResult`` is always returned.

        :param url: address of the page to summarize
        :return: a ``ScrapingResult`` with ``url`` and ``summary`` set
        """
        try:
            # get summary
            print("Retrieving page summary of %s... " % url)

            parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
            summarizer = Summarizer(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)

            sentences = summarizer(parser.document, SENTENCES_COUNT)
            # separate the sentences with a space so they do not run together
            url_summary = ' '.join(str(sentence) for sentence in sentences)

        except Exception as e:
            # format the exception itself: e.message is deprecated and is empty
            # for exceptions constructed with more than one argument
            url_summary = "Could not scrape summary. Reason: %s" % (e,)

        print("Done: %s = %s" % (url, url_summary))

        # create scraping result
        scraping_result = ScrapingResult()

        scraping_result.summary = url_summary
        scraping_result.url = url

        return scraping_result
45 |
--------------------------------------------------------------------------------
/python-scraping-service/worker.py:
--------------------------------------------------------------------------------
1 | import pika
2 | import json
3 | from scraper import Scraper
4 |
# Broker connection settings.
# NOTE(review): credentials and host are hard-coded; consider reading them from
# the environment before using this outside a local development setup.
credentials = pika.PlainCredentials("user", "password")
parameters = pika.ConnectionParameters(host='localhost', credentials=credentials)

connection = pika.BlockingConnection(parameters)
channel = connection.channel()
tasks_queue = channel.queue_declare(queue='tasks.queue', durable=True)
scraping_result_queue = channel.queue_declare(queue='scrapingresult.queue', durable=True)

# With manual acknowledgements, take only one unacknowledged task at a time.
channel.basic_qos(prefetch_count=1)

print(' [*] Waiting for tasks. To exit press CTRL+C')


def publish_result(scraping_result):
    """Publish a scraping result as a JSON message to 'scrapingresult.queue'."""
    j = json.dumps(scraping_result.__dict__)
    # delivery_mode=2 marks the message persistent, matching the durable queue
    properties = pika.BasicProperties(content_type="application/json", delivery_mode=2)
    channel.basic_publish(exchange='', routing_key='scrapingresult.queue', body=j, properties=properties)


def callback(ch, method, properties, body):
    """Handle one task message: scrape its url, publish the result, then ack.

    The task is acknowledged only after the result has been published, so a
    task is redelivered if the worker dies while processing it. Messages that
    are not valid JSON or carry no usable 'url' are rejected without requeue.
    """
    try:
        url = json.loads(body)['url'].strip()
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        print(" [!] Discarding malformed task %r: %s" % (body, e))
        ch.basic_reject(delivery_tag=method.delivery_tag, requeue=False)
        return

    scraper = Scraper()
    result = scraper.scrape(url)
    publish_result(result)
    ch.basic_ack(delivery_tag=method.delivery_tag)


channel.basic_consume(callback, queue='tasks.queue', no_ack=False)
try:
    channel.start_consuming()
except KeyboardInterrupt:
    # CTRL+C is the documented way to stop the worker: shut down cleanly
    channel.stop_consuming()
finally:
    if connection.is_open:
        connection.close()
28 |
--------------------------------------------------------------------------------
/vagrant-scripts/postgres.sh:
--------------------------------------------------------------------------------
#!/bin/sh -e

# Thanks to https://github.com/jackdb/pg-app-dev-vm
#
# Usage: postgres.sh <db-user> <db-password> [<db-name>]

# Name and password of the database user that will be created (arguments 1 and 2):
APP_DB_USER=$1
APP_DB_PASS=$2

# Name of the database that is created (argument 3; defaults to the user name):
APP_DB_NAME=${3:-$APP_DB_USER}

# Edit the following to change the version of PostgreSQL that is installed
PG_VERSION=9.4
14 |
15 | ###########################################################
16 | # Changes below this line are probably not necessary
17 | ###########################################################
# Prints how to reach the database: connection details for the forwarded port,
# admin and psql access from inside the VM, and a ready-made DATABASE_URL.
# Reads APP_DB_NAME, APP_DB_USER and APP_DB_PASS.
print_db_usage () {
  cat << USAGE
Your PostgreSQL database has been setup and can be accessed on your local machine on the forwarded port (default: 15432)
 Host: localhost
 Port: 15432
 Database: $APP_DB_NAME
 Username: $APP_DB_USER
 Password: $APP_DB_PASS

Admin access to postgres user via VM:
 vagrant ssh
 sudo su - postgres

psql access to app database user via VM:
 vagrant ssh
 sudo su - postgres
 PGUSER=$APP_DB_USER PGPASSWORD=$APP_DB_PASS psql -h localhost $APP_DB_NAME

Env variable for application development:
 DATABASE_URL=postgresql://$APP_DB_USER:$APP_DB_PASS@localhost:15432/$APP_DB_NAME

Local command to access the database via psql:
 PGUSER=$APP_DB_USER PGPASSWORD=$APP_DB_PASS psql -h localhost -p 15432 $APP_DB_NAME
USAGE
}
41 |
export DEBIAN_FRONTEND=noninteractive

# A timestamp file marks a completed provision; if it exists, only print the
# connection details and stop.
PROVISIONED_ON=/etc/vm_provision_on_timestamp
if [ -f "$PROVISIONED_ON" ]
then
  echo "VM was already provisioned at: $(cat "$PROVISIONED_ON")"
  echo "To run system updates manually login via 'vagrant ssh' and run 'apt-get update && apt-get upgrade'"
  echo ""
  print_db_usage
  exit
fi

PG_REPO_APT_SOURCE=/etc/apt/sources.list.d/pgdg.list
if [ ! -f "$PG_REPO_APT_SOURCE" ]
then
  # Add PG apt repo:
  echo "deb http://apt.postgresql.org/pub/repos/apt/ trusty-pgdg main" > "$PG_REPO_APT_SOURCE"

  # Add PGDG repo key:
  wget --quiet -O - https://apt.postgresql.org/pub/repos/apt/ACCC4CF8.asc | apt-key add -
fi

# Update package list and upgrade all packages
apt-get update
apt-get -y upgrade

apt-get -y install "postgresql-$PG_VERSION" "postgresql-contrib-$PG_VERSION"

PG_CONF="/etc/postgresql/$PG_VERSION/main/postgresql.conf"
PG_HBA="/etc/postgresql/$PG_VERSION/main/pg_hba.conf"
PG_DIR="/var/lib/postgresql/$PG_VERSION/main"

# Edit postgresql.conf to change listen address to '*':
sed -i "s/#listen_addresses = 'localhost'/listen_addresses = '*'/" "$PG_CONF"

# Append to pg_hba.conf to add password auth. Only append when the line is not
# there yet, so a re-run after a failed provision does not duplicate it:
PG_HBA_RULE="host all all all md5"
grep -qxF "$PG_HBA_RULE" "$PG_HBA" || echo "$PG_HBA_RULE" >> "$PG_HBA"

# Explicitly set default client_encoding (same append-once rule as above):
PG_ENCODING_SETTING="client_encoding = utf8"
grep -qxF "$PG_ENCODING_SETTING" "$PG_CONF" || echo "$PG_ENCODING_SETTING" >> "$PG_CONF"

# Restart so that all new config is loaded:
service postgresql restart

# ON_ERROR_STOP makes psql exit non-zero when a statement fails, so this
# script (run with -e) aborts instead of reporting success after failed SQL.
su - postgres -c "psql -v ON_ERROR_STOP=1" << EOF
-- Create the database user:
CREATE USER $APP_DB_USER WITH PASSWORD '$APP_DB_PASS';

-- Create the database:
CREATE DATABASE $APP_DB_NAME WITH OWNER=$APP_DB_USER
 LC_COLLATE='en_US.utf8'
 LC_CTYPE='en_US.utf8'
 ENCODING='UTF8'
 TEMPLATE=template0;
EOF

# Tag the provision time:
date > "$PROVISIONED_ON"

echo "Successfully created PostgreSQL dev virtual machine."
echo ""
print_db_usage
--------------------------------------------------------------------------------
/vagrant-scripts/rabbitmq.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Installs RabbitMQ and creates a user with full permissions on the "/" vhost.
# Usage: rabbitmq.sh <user> <password>

# Stop at the first failing command instead of continuing with a broken install.
set -e

RABBITMQ_USER=${1:?usage: rabbitmq.sh <user> <password>}
RABBITMQ_PASS=${2:?usage: rabbitmq.sh <user> <password>}

echo ">>> Installing RabbitMQ"

apt-get -y install erlang-nox
# Pipe the signing key straight into apt-key so no key file is left behind
# in the current directory.
wget --quiet -O - https://www.rabbitmq.com/rabbitmq-release-signing-key.asc | apt-key add -
echo "deb http://www.rabbitmq.com/debian/ testing main" > /etc/apt/sources.list.d/rabbitmq.list
apt-get update
apt-get install -y rabbitmq-server

# add_user fails when the user already exists, so only create it when missing;
# this keeps a re-provision from aborting here.
if rabbitmqctl list_users | awk '{print $1}' | grep -qxF -- "$RABBITMQ_USER"; then
    echo ">>> RabbitMQ user '$RABBITMQ_USER' already exists, skipping creation"
else
    rabbitmqctl add_user "$RABBITMQ_USER" "$RABBITMQ_PASS"
fi
rabbitmqctl set_permissions -p / "$RABBITMQ_USER" ".*" ".*" ".*"
--------------------------------------------------------------------------------