├── .gitattributes
├── .gitignore
├── README.md
├── _commons-java
├── .gitignore
├── README.md
├── pom.xml
└── src
│ └── main
│ └── java
│ └── ro
│ └── code4
│ └── czl
│ └── scrape
│ ├── client
│ ├── ApiClient.java
│ ├── ApiInvoker.java
│ ├── AuthenticationStrategy.java
│ ├── BaseRequest.java
│ ├── BaseRequestBuilder.java
│ ├── Credential.java
│ ├── CzlApiUploadPipeline.java
│ ├── CzlApiV1.java
│ ├── CzlClient.java
│ ├── CzlClientConfig.java
│ ├── Request.java
│ ├── RequestBuilder.java
│ ├── Response.java
│ ├── authentication
│ │ └── TokenAuthenticationStrategy.java
│ ├── core
│ │ ├── CloseIdleConnectionsTask.java
│ │ ├── IdleConnectionMonitor.java
│ │ ├── JaxRsJacksonConfigurator.java
│ │ ├── JaxRsResponse.java
│ │ ├── JaxRsResponseDeserializationStrategy.java
│ │ ├── JerseyClientApiInvoker.java
│ │ └── LoggingFilter.java
│ ├── model
│ │ └── CreatePublicationRequest.java
│ ├── representation
│ │ ├── ContactRepresentation.java
│ │ ├── DocumentRepresentation.java
│ │ └── PublicationRepresentation.java
│ └── samples
│ │ └── CzlClientSample.java
│ └── text
│ ├── ProposalType.java
│ └── RomanianMonth.java
├── _config.yml
├── afaceri
├── README.md
├── package.json
└── server
│ ├── boot
│ ├── authentication.js
│ └── root.js
│ ├── component-config.json
│ ├── config.json
│ ├── config
│ └── keywords.js
│ ├── controllers
│ └── contentParser.js
│ ├── datasources.json
│ ├── middleware.development.json
│ ├── middleware.json
│ ├── model-config.json
│ └── server.js
├── agricultura
├── .gitignore
├── README.md
├── index.js
└── package.json
├── aparare
├── README.md
└── mapn_plugin.php
├── apepaduri
└── README.md
├── cdep
├── README.md
├── requirements.in
├── requirements.txt
└── scraper.py
├── cercetare
├── .editorconfig
├── .gitignore
├── README.md
├── app.js
├── package.json
├── parseProject.js
└── secrets.json.txt
├── dezvoltare
├── .gitignore
├── README.md
├── crawl_dezvoltare
│ ├── crawl_dezvoltare
│ │ ├── __init__.py
│ │ ├── exporters.py
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ ├── mdrap.py
│ │ │ └── testing.py
│ └── scrapy.cfg
└── requirements.txt
├── economie
├── .editorconfig
├── .gitignore
├── README.md
├── app.js
├── package.json
├── parseProject.js
├── secrets.json.txt
└── yarn.lock
├── educatie
├── README.md
├── config.js
├── index.js
└── package.json
├── energie
├── .gitignore
├── README.md
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── Main.java
│ └── resources
│ └── logback.xml
├── externe
├── README.md
├── __init__.py
├── eusebiu.py
├── scraper
│ ├── __init__.py
│ ├── article.py
│ ├── article_serializer.py
│ └── extractor.py
├── setup.py
└── utils
│ ├── __init__.py
│ ├── api_client.py
│ ├── lang.py
│ └── settings.py
├── finantepub
├── .gitignore
├── README.md
├── index.js
└── package.json
├── interne
├── .editorconfig
├── .gitignore
├── README.md
├── app.js
├── package.json
├── parseProject.js
├── secrets.json.txt
└── yarn.lock
├── justitie
├── .gitignore
├── README.md
├── doc
│ └── scraping.md
├── just
│ ├── __init__.py
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ │ ├── __init__.py
│ │ └── publications.py
├── requirements.txt
└── scrapy.cfg
├── license
├── mediu
├── .gitignore
├── README.md
├── crawl_mediu
│ ├── crawl_mediu
│ │ ├── __init__.py
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ └── mmediu.py
│ └── scrapy.cfg
└── requirements.txt
├── pom.xml
├── presedinte
└── README.md
├── pretutindeni
├── .gitignore
├── README.md
├── app.js
├── package.json
├── parseProject.example
└── yarn.lock
├── relparlament
├── README.md
├── index.js
└── package.json
├── sanatate
├── .gitignore
├── README.md
├── credentials.json
├── requirements.txt
├── scrapy.cfg
└── scrapy_proj
│ ├── __init__.py
│ ├── helpers
│ ├── __init__.py
│ ├── legal.py
│ ├── romanian.py
│ └── text.py
│ ├── items
│ ├── __init__.py
│ ├── act.py
│ └── contact.py
│ ├── loaders
│ ├── __init__.py
│ ├── act.py
│ └── contact.py
│ ├── pipelines
│ ├── __init__.py
│ ├── extrameta.py
│ └── post.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ └── sanatate.py
├── scrapy
├── .gitignore
├── Readme.md
├── czlscrape
│ ├── __init__.py
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── spiders
│ │ ├── __init__.py
│ │ ├── afaceri.py
│ │ ├── comunicatii.py
│ │ ├── cultura.py
│ │ ├── dialog.py
│ │ ├── munca.py
│ │ └── senat.py
│ └── utils.py
├── requirements.in
├── requirements.txt
├── scrapy.cfg
└── testsuite
│ ├── conftest.py
│ └── test_validator.py
├── sgg
├── README.md
├── requirements.txt
└── sgg
│ ├── run.py
│ ├── scrapy.cfg
│ └── sgg
│ ├── __init__.py
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ └── sgg_spider.py
├── tineret
├── .gitignore
├── README.md
├── requirements.txt
├── scrapy.cfg
└── tineret
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ └── tineret.py
├── transport
├── README.md
├── config.js
├── index.js
└── package.json
└── turism
├── README.md
├── out
├── production
│ └── scraper
│ │ └── com
│ │ └── company
│ │ ├── Main.class
│ │ └── Scraper.class
└── scraper.jar
├── out_files
├── Anexe
│ ├── Anexa1.1.1.pdf
│ ├── Anexa1.1.pdf
│ ├── Anexa1.2.pdf
│ ├── Anexa1.3.pdf
│ ├── Anexa1.4.pdf
│ ├── Anexa1.5.1.pdf
│ ├── Anexa1.5.pdf
│ ├── Anexa1.6.pdf
│ ├── Anexa1.7.pdf
│ ├── Anexa1.8.pdf
│ ├── Anexa1.pdf
│ ├── Anexa10.pdf
│ ├── Anexa11.pdf
│ ├── Anexa12.pdf
│ ├── Anexa13.pdf
│ ├── Anexa14.pdf
│ ├── Anexa15.pdf
│ ├── Anexa2.pdf
│ ├── Anexa3.pdf
│ ├── Anexa4.pdf
│ ├── Anexa5.pdf
│ ├── Anexa6.pdf
│ ├── Anexa7.pdf
│ ├── Anexa8.pdf
│ ├── Anexa9.2.pdf
│ ├── Anexa9.pdf
│ ├── AnexaAP.pdf
│ ├── Anexabrevet.pdf
│ └── Anexacazare.pdf
└── Proiecte
│ ├── Ordin-criterii-participare-targuri-externe.pdf
│ ├── Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf
│ └── Proiect-ordin-modificare-Ordin-65.pdf
└── src
└── com
└── company
└── Main.java
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # OS files
2 | .DS_Store
3 |
4 | # Java files
5 | *.class
6 |
7 | # Log files
8 | *.log
9 | logs
10 |
11 | # Maven
12 | target
13 | pom.xml.versionsBackup
14 |
15 | # Mobile Tools for Java (J2ME)
16 | .mtj.tmp/
17 |
18 | # Package Files
19 | *.jar
20 | *.war
21 | *.ear
22 |
23 | # IntelliJ IDEA
24 | *.iml
25 | .idea
26 |
27 | # Eclipse
28 | .project
29 | .settings
30 | .classpath
31 | test-output
32 |
33 | # Vim
34 | *.swp
35 |
36 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
37 | hs_err_pid*
38 |
39 | # Misc
40 | *git.properties
41 |
42 | # Python
43 | *.pyc
44 |
45 | # pyenv
46 | .python-version
47 |
48 | # Node
49 | node_modules/
50 |
--------------------------------------------------------------------------------
/_commons-java/.gitignore:
--------------------------------------------------------------------------------
1 | # OS files
2 | .DS_Store
3 |
4 | # Java files
5 | *.class
6 |
7 | # Log files
8 | *.log
9 | logs
10 |
11 | # Maven
12 | target
13 | pom.xml.versionsBackup
14 |
15 | # Dropwizard
16 | dependency-reduced-pom.xml
17 |
18 | # Mobile Tools for Java (J2ME)
19 | .mtj.tmp/
20 |
21 | # Package Files
22 | *.jar
23 | *.war
24 | *.ear
25 |
26 | # IntelliJ IDEA
27 | *.iml
28 | .idea
29 |
30 | # Eclipse
31 | .project
32 | .settings
33 | .classpath
34 | test-output
35 |
36 | # Vim
37 | *.swp
38 |
39 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
40 | hs_err_pid*
41 |
42 | # Misc
43 | *git.properties
44 |
45 | # Asciidoc
46 | .asciidoctor
47 | diag-*.png
48 |
--------------------------------------------------------------------------------
/_commons-java/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <parent>
6 |     <groupId>ro.code4.czl</groupId>
7 |     <artifactId>czl-scrape</artifactId>
8 |     <version>0.0.1-SNAPSHOT</version>
9 |     <relativePath>../</relativePath>
10 |   </parent>
11 |
12 |   <artifactId>czl-scrape-commons</artifactId>
13 |   <packaging>jar</packaging>
14 |
15 |   <name>Ce Zice Legea :: Scraper :: Common Libraries</name>
16 |
17 |   <dependencies>
18 |
19 |     <dependency>
20 |       <groupId>us.codecraft</groupId>
21 |       <artifactId>webmagic-core</artifactId>
22 |       <exclusions>
23 |         <exclusion>
24 |           <groupId>org.slf4j</groupId>
25 |           <artifactId>slf4j-log4j12</artifactId>
26 |         </exclusion>
27 |       </exclusions>
28 |     </dependency>
29 |
30 |
31 |     <dependency>
32 |       <groupId>org.slf4j</groupId>
33 |       <artifactId>slf4j-api</artifactId>
34 |     </dependency>
35 |
36 |
37 |     <dependency>
38 |       <groupId>org.glassfish.jersey.core</groupId>
39 |       <artifactId>jersey-client</artifactId>
40 |     </dependency>
41 |     <dependency>
42 |       <groupId>org.glassfish.jersey.connectors</groupId>
43 |       <artifactId>jersey-apache-connector</artifactId>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.glassfish.jersey.media</groupId>
47 |       <artifactId>jersey-media-json-jackson</artifactId>
48 |     </dependency>
49 |
50 |     <dependency>
51 |       <groupId>org.apache.commons</groupId>
52 |       <artifactId>commons-lang3</artifactId>
53 |     </dependency>
54 |   </dependencies>
55 | </project>
56 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/ApiClient.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | import ro.code4.czl.scrape.client.core.JerseyClientApiInvoker;
23 |
24 | /**
25 | * {@link ApiClient} instances are heavyweight objects that should be created sparingly. An {@link ApiClient} object is
26 | * thread-safe and should be reused when targeting the same service endpoint.
27 | *
28 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
29 | */
30 | public abstract class ApiClient implements AutoCloseable {
31 |
32 | protected final ApiInvoker apiInvoker;
33 |
34 | /**
35 | * Creates a new client instance using all the settings specified by the given configuration object.
36 | *
37 | * @param config a client configuration object
38 | */
39 | protected ApiClient(CzlClientConfig config) {
40 | this(config, new JerseyClientApiInvoker(config));
41 | }
42 |
43 | /**
44 | * Creates a new client instance using all the settings specified by the given configuration object and a custom {@link ApiInvoker} instance.
45 | *
46 | * @param config a client configuration object
47 | * @param apiInvoker a custom API invoker object
48 | */
49 | private ApiClient(CzlClientConfig config, ApiInvoker apiInvoker) {
50 | this.apiInvoker = apiInvoker;
51 | }
52 |
53 | /**
54 | * Retrieves the API invoker object used by this client.
55 | *
56 | * @return a {@link ApiInvoker} instance
57 | */
58 | public ApiInvoker getApiInvoker() {
59 | return apiInvoker;
60 | }
61 |
62 | @Override
63 | public void close() throws Exception {
64 | this.shutdown();
65 | }
66 |
67 | /**
68 | * Shuts down the connection manager used by this client and releases allocated resources. This includes closing all connections, whether they are
69 | * currently used or not.
70 | */
71 | private void shutdown() {
72 | apiInvoker.shutdown();
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/ApiInvoker.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | /**
23 | * Basic API invoker contract.
24 | *
25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
26 | */
27 | public interface ApiInvoker extends AutoCloseable {
28 |
29 | /**
30 | * Configures a request header that should be added to every request made via this API invoker.
31 | *
32 | * @param key request header name
33 | * @param value request header value
34 | */
35 | void addDefaultHeader(String key, String value);
36 |
37 | /**
38 | * Executes a request.
39 | *
40 | * @param request the request to execute
41 | * @param <T> the type that the response should be deserialized into
42 | * @return a {@link Response} instance containing the response body deserialized into the desired type
43 | */
44 | <T> Response<T> invokeAPI(Request<T> request);
45 |
46 | /**
47 | * Shuts down the connection manager used by this API invoker and releases allocated resources. This includes closing all connections, whether they
48 | * are currently used or not.
49 | */
50 | void shutdown();
51 | }
52 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/AuthenticationStrategy.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | /**
23 | * Contract for an authentication strategy.
24 | *
25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
26 | */
27 | public interface AuthenticationStrategy {
28 |
29 | /**
30 | * Processes the request with the goal of applying the authentication strategy. This is called before the request is executed.
31 | *
32 | * @param request the request.
33 | * @param <T> the expected type of the response body
34 | */
35 | <T> void process(Request<T> request);
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/BaseRequestBuilder.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | import java.util.HashMap;
23 | import java.util.Map;
24 |
25 | /**
26 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
27 | */
28 | public abstract class BaseRequestBuilder<T extends BaseRequest<U>, U> implements RequestBuilder<T, U> {
29 |
30 | String ifNoneMatch;
31 | String ifMatch;
32 | Map<String, String> customHeaders = new HashMap<>();
33 | boolean head;
34 | Boolean followRedirects;
35 | Credential credential;
36 |
37 | @Override
38 | public RequestBuilder<T, U> ifNoneMatch(String ifNoneMatch) {
39 | this.ifNoneMatch = ifNoneMatch;
40 | return this;
41 | }
42 |
43 | @Override
44 | public RequestBuilder<T, U> ifMatch(String ifMatch) {
45 | this.ifMatch = ifMatch;
46 | return this;
47 | }
48 |
49 | @Override
50 | public RequestBuilder<T, U> headersOnly() {
51 | this.head = true;
52 | return this;
53 | }
54 |
55 | @Override
56 | public RequestBuilder<T, U> followRedirects(boolean followRedirects) {
57 | this.followRedirects = followRedirects;
58 | return this;
59 | }
60 |
61 | @Override
62 | public RequestBuilder<T, U> credential(Credential credential) {
63 | this.credential = credential;
64 | return this;
65 | }
66 |
67 | @Override
68 | public RequestBuilder<T, U> header(String headerName, String headerValue) {
69 | this.customHeaders.put(headerName, headerValue);
70 | return this;
71 | }
72 |
73 | @Override
74 | public Response<U> execute() {
75 | return build().execute();
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/Credential.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | /**
23 | * Marker interface for credential used during authentication. Used by {@linkplain AuthenticationStrategy} implementations.
24 | *
25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
26 | */
27 | public interface Credential {
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlApiUploadPipeline.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.client;
2 |
3 | import static ro.code4.czl.scrape.client.representation.PublicationRepresentation.PublicationRepresentationBuilder.aPublicationRepresentation;
4 |
5 | import ro.code4.czl.scrape.client.representation.DocumentRepresentation;
6 | import us.codecraft.webmagic.ResultItems;
7 | import us.codecraft.webmagic.Task;
8 | import us.codecraft.webmagic.pipeline.Pipeline;
9 |
10 | import java.util.List;
11 | import java.util.Map;
12 |
13 | /**
14 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
15 | */
16 | public class CzlApiUploadPipeline implements Pipeline {
17 |
18 | private final CzlClient czlClient;
19 |
20 | public CzlApiUploadPipeline(CzlClient czlClient) {
21 | this.czlClient = czlClient;
22 | }
23 |
24 | @SuppressWarnings("unchecked")
25 | @Override
26 | public void process(ResultItems resultItems, Task task) {
27 | Map<String, Object> extractedFields = resultItems.getAll();
28 |
29 | czlClient.apiV1()
30 | .createPublication(aPublicationRepresentation()
31 | .withDate((String) extractedFields.get("date"))
32 | .withInstitution((String) extractedFields.get("institution"))
33 | .withIdentifier((String) extractedFields.get("identifier"))
34 | .withDescription((String) extractedFields.get("description"))
35 | .withDocuments((List<DocumentRepresentation>) extractedFields.get("documents"))
36 | .withTitle((String) extractedFields.get("title"))
37 | .withType((String) extractedFields.get("type"))
38 | //.withFeedback_days((int) extractedFields.get("feedbackDays"))
39 | .withContact((Map) extractedFields.get("contact"))
40 | .build())
41 | .execute();
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlApiV1.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | import ro.code4.czl.scrape.client.model.CreatePublicationRequest;
23 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation;
24 |
25 | /**
26 | * A class describing the API for Ce Zice Legea. Uses a fluent builder interface to create requests.
27 | *
28 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com)
29 | */
30 | public class CzlApiV1 {
31 |
32 | private final ApiInvoker apiInvoker;
33 |
34 | /**
35 | * Creates a new request builder.
36 | *
37 | * @param apiInvoker the {@linkplain ApiInvoker} implementation to use for every request built via this class.
38 | * @see ApiInvoker
39 | */
40 | CzlApiV1(ApiInvoker apiInvoker) {
41 | this.apiInvoker = apiInvoker;
42 | }
43 |
44 | /**
45 | * Starts preparing a new request for creating a publication.
46 | *
47 | * @param publicationRepresentation the representation of the publication to create.
48 | * @return a request builder.
49 | */
50 | public CreatePublicationRequest.Builder createPublication(PublicationRepresentation publicationRepresentation) {
51 | return CreatePublicationRequest.builder(publicationRepresentation, apiInvoker);
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlClient.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | /**
23 | * A REST client object. {@link CzlClient} instances are heavyweight objects that should be created sparingly. A {@link CzlClient} object is
24 | * thread-safe and should be reused when targeting the same service endpoint.
25 | *
26 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
27 | */
28 | public class CzlClient extends ApiClient {
29 |
30 | /**
31 | * Build a new client instance using all the settings specified by the given configuration object. {@link CzlClient} instances are heavyweight objects
32 | * that should be created sparingly. A {@link CzlClient} object is thread-safe and should be reused when targeting the same service endpoint.
33 | *
34 | * @param czlClientConfig a client configuration object
35 | * @return a new SDK client instance
36 | */
37 | public static CzlClient newClient(CzlClientConfig czlClientConfig) {
38 | return new CzlClient(czlClientConfig);
39 | }
40 |
41 | private CzlClient(CzlClientConfig czlClientConfig) {
42 | super(czlClientConfig);
43 | }
44 |
45 |
46 | /**
47 | * Access the API.
48 | *
49 | * @return an object describing the API.
50 | */
51 | public CzlApiV1 apiV1() {
52 | return new CzlApiV1(apiInvoker);
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/Request.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | import java.util.Map;
23 |
24 | /**
25 | * Contract for a request made by the client.
26 | *
27 | * @param <T> the expected type of the response body
28 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com)
29 | */
30 | public interface Request<T> {
31 |
32 | /**
33 | * Executes the request and returns the response.
34 | *
35 | * @return the result of the execution. If the response contains a body, it will be automatically deserialized and ready for use.
36 | */
37 | Response<T> execute();
38 |
39 | /**
40 | * Returns the type of response body, if any; <code>null</code> otherwise.
41 | *
42 | * @return the response type
43 | */
44 | Class<T> getResponseType();
45 |
46 | /**
47 | * Returns the absolute path of the target of this request.
48 | *
49 | * @return the absolute path.
50 | */
51 | String getPath();
52 |
53 | /**
54 | * Returns the HTTP method used by this request.
55 | *
56 | * @return the method.
57 | */
58 | String getMethod();
59 |
60 | /**
61 | * Returns the path parameters used by this request.
62 | *
63 | * @return the path parameters.
64 | */
65 | Map<String, String> getPathParams();
66 |
67 | /**
68 | * Returns the query parameters used by this request.
69 | *
70 | * @return the query parameters.
71 | */
72 | Map<String, String> getQueryParams();
73 |
74 | /**
75 | * Returns the matrix parameters used by this request.
76 | *
77 | * @return the matrix parameters.
78 | */
79 | Map<String, String> getMatrixParams();
80 |
81 | /**
82 | * Returns the header parameters used by this request.
83 | *
84 | * @return the header parameters.
85 | */
86 | Map<String, String> getHeaderParams();
87 |
88 | /**
89 | * Returns the body used by this request, if any.
90 | *
91 | * @return the body if one has been specified, <code>null</code> otherwise.
92 | */
93 | Object getBody();
94 |
95 | /**
96 | * Returns the value of the <code>Accept</code> header used by this request.
97 | *
98 | * @return the value of the <code>Accept</code> header.
99 | */
100 | String getAcceptHeader();
101 |
102 | /**
103 | * Indicates whether this request is supposed to follow redirects or not.
104 | *
105 | * @return <code>true</code> if the request is supposed to follow redirects, <code>false</code> otherwise.
106 | */
107 | Boolean isFollowRedirectsEnabled();
108 |
109 | /**
110 | * Returns the value of the credential used by this request.
111 | *
112 | * @return the credential, if any.
113 | */
114 | Credential getCredential();
115 | }
116 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/RequestBuilder.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | /**
23 | * Contract for builders of {@linkplain Request} instances.
24 | *
25 | * @param <T> the request type
26 | * @param <U> the expected type of the response body
27 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com)
28 | */
29 | public interface RequestBuilder<T extends Request<U>, U> {
30 |
31 | /**
32 | * Sets the <code>If-None-Match</code> header to the given value. Useful when making conditional requests.
33 | *
34 | * @param ifNoneMatch the value of the header.
35 | * @return the request builder.
36 | */
37 | RequestBuilder<T, U> ifNoneMatch(String ifNoneMatch);
38 |
39 | /**
40 | * Sets the <code>If-Match</code> header to the given value. Useful when making conditional requests.
41 | *
42 | * @param ifMatch the value of the header.
43 | * @return the request builder.
44 | */
45 | RequestBuilder<T, U> ifMatch(String ifMatch);
46 |
47 | /**
48 | * Make the request to only ask for headers. Only applies when the original request is using <code>GET</code>.
49 | *
50 | * @return the request builder.
51 | */
52 | RequestBuilder<T, U> headersOnly();
53 |
54 | /**
55 | * Enables or disables following redirects.
56 | *
57 | * @param followRedirects set to <code>true</code> to enable following redirects, otherwise to <code>false</code>.
58 | * @return the request builder.
59 | */
60 | RequestBuilder<T, U> followRedirects(boolean followRedirects);
61 |
62 | /**
63 | * Use the given credential for this request.
64 | *
65 | * @param credential the credential to use for this request.
66 | * @return the request builder.
67 | */
68 | RequestBuilder<T, U> credential(Credential credential);
69 |
70 | /**
71 | * Adds a custom header to this request.
72 | *
73 | * @param headerName the header name for this request.
74 | * @param headerValue the header value for this request.
75 | * @return the request builder.
76 | */
77 | RequestBuilder<T, U> header(String headerName, String headerValue);
78 |
79 | /**
80 | * Build the request. Does not execute it.
81 | *
82 | * @return the request.
83 | */
84 | T build();
85 |
86 | /**
87 | * Builds and executes the request.
88 | *
89 | * @return the result of the execution of the request.
90 | */
91 | Response<U> execute();
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/Response.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client;
21 |
22 | import java.util.Date;
23 | import java.util.Map;
24 |
25 | /**
26 | * Contract for a response to a request made by the client.
27 | *
28 | * @param <T> the expected type of the response body
29 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com)
30 | */
31 | public interface Response<T> {
32 |
33 | /**
34 | * Returns the status code of the response.
35 | *
36 | * @return the status code.
37 | */
38 | int getStatusCode();
39 |
40 | /**
41 | * Returns the entity in the response.
42 | *
43 | * @return the entity.
44 | */
45 | T getEntity();
46 |
47 | /**
48 | * Returns the content type of the response.
49 | *
50 | * @return the content type.
51 | */
52 | String getContentType();
53 |
54 | /**
55 | * Returns the content length of the response.
56 | *
57 | * @return the content length.
58 | */
59 | long getContentLength();
60 |
61 | /**
62 | * Returns the <code>ETag</code> header value, if any.
63 | *
64 | * @return the <code>ETag</code> header value.
65 | */
66 | String getETag();
67 |
68 | /**
69 | * Returns the date of the response.
70 | *
71 | * @return the date.
72 | */
73 | Date getDate();
74 |
75 | /**
76 | * Returns the value of a given response header.
77 | *
78 | * @param headerName the header name.
79 | * @return the header value.
80 | */
81 | String getHeaderString(String headerName);
82 |
83 | /**
84 | * Returns all the response headers.
85 | *
86 | * @return the response headers.
87 | */
88 | Map<String, String> getHeaders();
89 | }
90 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/authentication/TokenAuthenticationStrategy.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.client.authentication;
2 |
3 | import ro.code4.czl.scrape.client.AuthenticationStrategy;
4 | import ro.code4.czl.scrape.client.Request;
5 |
6 | /**
7 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
8 | */
9 | public class TokenAuthenticationStrategy implements AuthenticationStrategy {
10 |
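// The token value is read once from the JVM system property "czl.scrape.token",
// e.g. by launching the scraper with -Dczl.scrape.token=<token>.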
11 | private final String tokenValue = System.getProperty("czl.scrape.token");
12 |
13 | @Override
14 | public <T> void process(Request<T> request) {
15 | request.getHeaderParams().put("Authorization", "Token " + tokenValue);
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/core/CloseIdleConnectionsTask.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client.core;
21 |
22 | import org.apache.http.conn.HttpClientConnectionManager;
23 | import org.slf4j.Logger;
24 | import org.slf4j.LoggerFactory;
25 |
26 | import java.util.concurrent.TimeUnit;
27 |
28 | /**
29 | * Closes idle or expired connections created by a specific connection manager.
30 | *
31 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
32 | */
33 | class CloseIdleConnectionsTask implements Runnable {
34 |
35 | private static final Logger logger = LoggerFactory.getLogger(CloseIdleConnectionsTask.class);
36 |
37 | private final HttpClientConnectionManager connectionManager;
38 | private final int idleTime;
39 |
40 | /**
41 | * Creates a new task.
42 | *
43 | * @param connectionManager the connection manager that will be periodically checked
44 | * @param idleTime the inactivity time in milliseconds after which connections are considered to be idle
45 | */
46 | CloseIdleConnectionsTask(HttpClientConnectionManager connectionManager, int idleTime) {
47 | this.connectionManager = connectionManager;
48 | this.idleTime = idleTime;
49 | }
50 |
51 | @Override
52 | public void run() {
53 | try {
54 | connectionManager.closeExpiredConnections();
55 | connectionManager.closeIdleConnections(idleTime, TimeUnit.MILLISECONDS);
56 | } catch (Exception t) {
57 | logger.warn("Unable to close idle connections", t);
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsJacksonConfigurator.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client.core;
21 |
22 | import com.fasterxml.jackson.databind.DeserializationFeature;
23 | import com.fasterxml.jackson.databind.ObjectMapper;
24 |
25 | import javax.ws.rs.ext.ContextResolver;
26 | import javax.ws.rs.ext.Provider;
27 |
28 | /**
29 | * Provides custom configuration for Jackson.
30 | *
31 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
32 | */
33 | @Provider
34 | public class JaxRsJacksonConfigurator implements ContextResolver<ObjectMapper> {
35 |
36 | private final ObjectMapper mapper;
37 |
38 | public JaxRsJacksonConfigurator() {
39 | mapper = new ObjectMapper();
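// Ignore unknown JSON properties instead of failing deserialization.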
40 | mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
41 | }
42 |
43 | @Override
44 | public ObjectMapper getContext(Class<?> type) {
45 | return mapper;
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsResponse.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client.core;
21 |
22 | import jersey.repackaged.com.google.common.collect.Maps;
23 | import ro.code4.czl.scrape.client.Response;
24 |
25 | import java.util.Collections;
26 | import java.util.Date;
27 | import java.util.List;
28 | import java.util.Map;
29 |
30 | /**
31 | * Wrapper over {@linkplain javax.ws.rs.core.Response} that provides a safe body deserialization mechanism along with some syntactic sugar.
32 | *
33 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
34 | */
35 | class JaxRsResponse<T> implements Response<T> {
36 |
37 | private final javax.ws.rs.core.Response originalResponse;
38 | private final Map<String, String> simplifiedHeaders;
39 | private final T entity;
40 |
41 | JaxRsResponse(javax.ws.rs.core.Response originalResponse, Class<T> expectedType) {
42 | this.originalResponse = originalResponse;
43 | this.entity = new JaxRsResponseDeserializationStrategy().read(originalResponse, expectedType);
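// Multi-valued response headers are flattened to their first value only (see StringListToStringEntryTransformer below).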
44 | this.simplifiedHeaders = Collections.unmodifiableMap(
45 | Maps.transformEntries(originalResponse.getStringHeaders(), new StringListToStringEntryTransformer()));
46 | }
47 |
48 | @Override
49 | public int getStatusCode() {
50 | return originalResponse.getStatus();
51 | }
52 |
53 | @Override
54 | public T getEntity() {
55 | return entity;
56 | }
57 |
58 | @Override
59 | public String getContentType() {
60 | return originalResponse.getMediaType().toString();
61 | }
62 |
63 | @Override
64 | public long getContentLength() {
65 | return originalResponse.getLength();
66 | }
67 |
68 | @Override
69 | public String getETag() {
70 | return originalResponse.getEntityTag().getValue();
71 | }
72 |
73 | @Override
74 | public Date getDate() {
75 | return originalResponse.getDate();
76 | }
77 |
78 | @Override
79 | public String getHeaderString(String headerName) {
80 | return originalResponse.getHeaderString(headerName);
81 | }
82 |
83 | @Override
84 | public Map<String, String> getHeaders() {
85 | return simplifiedHeaders;
86 | }
87 |
88 | private static class StringListToStringEntryTransformer implements Maps.EntryTransformer<String, List<String>, String> {
89 |
90 | @Override
91 | public String transformEntry(String s, List<String> strings) {
92 | if (strings == null || strings.isEmpty()) {
93 | return null;
94 | }
95 | return strings.get(0);
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsResponseDeserializationStrategy.java:
--------------------------------------------------------------------------------
1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 | *
3 | * ADOBE CONFIDENTIAL
4 | * ___________________
5 | *
6 | * Copyright 2016 Adobe Systems Incorporated
7 | * All Rights Reserved.
8 | *
9 | * NOTICE: All information contained herein is, and remains
10 | * the property of Adobe Systems Incorporated and its suppliers,
11 | * if any. The intellectual and technical concepts contained
12 | * herein are proprietary to Adobe Systems Incorporated and its
13 | * suppliers and are protected by all applicable intellectual property
14 | * laws, including trade secret and copyright laws.
15 | * Dissemination of this information or reproduction of this material
16 | * is strictly forbidden unless prior written permission is obtained
17 | * from Adobe Systems Incorporated.
18 | *
19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/
20 | package ro.code4.czl.scrape.client.core;
21 |
22 | import java.io.InputStream;
23 |
24 | import javax.ws.rs.core.Response;
25 |
26 | /**
27 | * Deserialization strategy that ensures the response body is safely deserialized and that the input stream is properly closed.
28 | *
29 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
30 | */
31 | class JaxRsResponseDeserializationStrategy {
32 |
33 | @SuppressWarnings("unchecked")
34 | <T> T read(Response response, Class<T> expectedType) {
35 | if (!response.hasEntity()) {
36 | response.close();
37 | return null;
38 | }
39 |
40 | if (InputStream.class.isAssignableFrom(expectedType)) {
41 | return (T) response.getEntity();
42 | } else {
43 | if (response.getStatusInfo().getFamily() == Response.Status.Family.SUCCESSFUL) {
44 | try {
45 | return response.readEntity(expectedType);
46 | } finally {
47 | response.close();
48 | }
49 | }
50 | }
51 |
52 | response.close();
53 | return null;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/model/CreatePublicationRequest.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.client.model;
2 |
3 | import ro.code4.czl.scrape.client.ApiInvoker;
4 | import ro.code4.czl.scrape.client.BaseRequest;
5 | import ro.code4.czl.scrape.client.BaseRequestBuilder;
6 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation;
7 |
8 | import javax.ws.rs.HttpMethod;
9 | import javax.ws.rs.core.MediaType;
10 |
11 | /**
12 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
13 | */
14 | public class CreatePublicationRequest extends BaseRequest<PublicationRepresentation> {
15 |
16 | private CreatePublicationRequest(CreatePublicationRequest.Builder builder) {
17 | super(builder, "publications/", HttpMethod.POST, MediaType.APPLICATION_JSON, builder.apiInvoker);
18 |
19 | setBody(builder.spaceRepresentation);
20 | }
21 |
22 | public static CreatePublicationRequest.Builder builder(PublicationRepresentation spaceRepresentation, ApiInvoker apiInvoker) {
23 | return new CreatePublicationRequest.Builder(spaceRepresentation, apiInvoker);
24 | }
25 |
26 | @Override
27 | public Class<PublicationRepresentation> getResponseType() {
28 | return PublicationRepresentation.class;
29 | }
30 |
31 | public static class Builder extends BaseRequestBuilder<CreatePublicationRequest, PublicationRepresentation> {
32 |
33 | private final ApiInvoker apiInvoker;
34 | private final PublicationRepresentation spaceRepresentation;
35 |
36 | Builder(PublicationRepresentation spaceRepresentation, ApiInvoker apiInvoker) {
37 | this.apiInvoker = apiInvoker;
38 | this.spaceRepresentation = spaceRepresentation;
39 | }
40 |
41 | @Override
42 | public CreatePublicationRequest build() {
43 | return new CreatePublicationRequest(this);
44 | }
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/representation/ContactRepresentation.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.client.representation;
2 |
3 | import com.fasterxml.jackson.annotation.JsonInclude;
4 | import com.fasterxml.jackson.annotation.JsonInclude.Include;
5 |
6 | /**
7 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
8 | */
9 | @JsonInclude(Include.NON_NULL)
10 | public class ContactRepresentation {
11 |
12 | private String tel;
13 | private String email;
14 |
15 | public ContactRepresentation() {
16 | }
17 |
18 | public ContactRepresentation(String tel, String email) {
19 | this.tel = tel;
20 | this.email = email;
21 | }
22 |
23 | public String getTel() {
24 | return tel;
25 | }
26 |
27 | public void setTel(String tel) {
28 | this.tel = tel;
29 | }
30 |
31 | public String getEmail() {
32 | return email;
33 | }
34 |
35 | public void setEmail(String email) {
36 | this.email = email;
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/representation/DocumentRepresentation.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.client.representation;
2 |
3 | import com.fasterxml.jackson.annotation.JsonInclude;
4 | import com.fasterxml.jackson.annotation.JsonInclude.Include;
5 |
6 | import org.apache.commons.lang3.builder.ToStringBuilder;
7 | import org.apache.commons.lang3.builder.ToStringStyle;
8 |
9 | /**
10 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
11 | */
12 | @JsonInclude(Include.NON_NULL)
13 | public class DocumentRepresentation {
14 |
15 | private String type;
16 | private String url;
17 |
18 | public DocumentRepresentation() {
19 | }
20 |
21 | public DocumentRepresentation(String type, String url) {
22 | this.type = type;
23 | this.url = url;
24 | }
25 |
26 | public String getType() {
27 | return type;
28 | }
29 |
30 | public void setType(String type) {
31 | this.type = type;
32 | }
33 |
34 | public String getUrl() {
35 | return url;
36 | }
37 |
38 | public void setUrl(String url) {
39 | this.url = url;
40 | }
41 |
42 | @Override
43 | public String toString() {
44 | return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
45 | .append("type", type)
46 | .append("url", url)
47 | .toString();
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/client/samples/CzlClientSample.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.client.samples;
2 |
3 | import org.slf4j.Logger;
4 | import org.slf4j.LoggerFactory;
5 |
6 | import ro.code4.czl.scrape.client.CzlClient;
7 | import ro.code4.czl.scrape.client.CzlClientConfig;
8 | import ro.code4.czl.scrape.client.authentication.TokenAuthenticationStrategy;
9 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation.PublicationRepresentationBuilder;
10 |
11 | /**
12 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
13 | */
14 | public class CzlClientSample {
15 |
16 | private static final Logger logger = LoggerFactory.getLogger(CzlClientSample.class);
17 |
18 | public static void main(String[] args) {
19 |
20 | CzlClientConfig clientConfig = CzlClientConfig.builder()
21 | .endpointURI("http://czl-api.code4.ro/api/")
22 | .connectionRequestTimeout(500)
23 | .connectTimeout(500)
24 | .socketTimeout(3000)
25 | .authenticationStrategy(new TokenAuthenticationStrategy())
26 | .build();
27 |
28 | try (CzlClient czlClient = CzlClient.newClient(clientConfig)) {
29 | czlClient.apiV1().createPublication(PublicationRepresentationBuilder
30 | .aPublicationRepresentation()
31 | .withIdentifier("1")
32 | .withInstitution("finantepub")
33 | .withType("HG")
34 | .withDate("2017-03-08")
35 | .build())
36 | .execute();
37 | } catch (Exception e) {
38 | logger.error("Met an error.", e);
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/text/ProposalType.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.text;
2 |
3 | /**
4 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
5 | */
6 | public enum ProposalType {
7 |
8 | HG, LEGE, OM, OG, OUG, OTHER;
9 |
10 | public static ProposalType fromLabel(String label) {
11 | switch (label.toLowerCase()) {
12 | case "hg":
13 | case "hotarare": {
14 | return HG;
15 | }
16 | case "lege": {
17 | return LEGE;
18 | }
19 | case "om":
20 | case "ordin": {
21 | return OM;
22 | }
23 | case "og": {
24 | return OG;
25 | }
26 | case "oug": {
27 | return OUG;
28 | }
29 | default: {
30 | return OTHER;
31 | }
32 | }
33 | }
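// Example: fromLabel("Hotarare") returns HG; unrecognized labels return OTHER.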
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/_commons-java/src/main/java/ro/code4/czl/scrape/text/RomanianMonth.java:
--------------------------------------------------------------------------------
1 | package ro.code4.czl.scrape.text;
2 |
3 | /**
4 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com)
5 | */
6 | public enum RomanianMonth {
7 |
8 | IANUARIE(1),
9 | FEBRUARIE(2),
10 | MARTIE(3),
11 | APRILIE(4),
12 | MAI(5),
13 | IUNIE(6),
14 | IULIE(7),
15 | AUGUST(8),
16 | SEPTEMBRIE(9),
17 | OCTOMBRIE(10),
18 | NOIEMBRIE(11),
19 | DECEMBRIE(12);
20 |
21 | private final int number;
22 |
23 | RomanianMonth(int number) {
24 | this.number = number;
25 | }
26 |
27 | public int getNumber() {
28 | return number;
29 | }
30 |
31 | public static RomanianMonth fromLabel(String value) {
32 | switch (value.toLowerCase()) {
33 | case "ianuarie": {
34 | return IANUARIE;
35 | }
36 | case "februarie": {
37 | return FEBRUARIE;
38 | }
39 | case "martie": {
40 | return MARTIE;
41 | }
42 | case "aprilie": {
43 | return APRILIE;
44 | }
45 | case "mai": {
46 | return MAI;
47 | }
48 | case "iunie": {
49 | return IUNIE;
50 | }
51 | case "iulie": {
52 | return IULIE;
53 | }
54 | case "august": {
55 | return AUGUST;
56 | }
57 | case "septembrie": {
58 | return SEPTEMBRIE;
59 | }
60 | case "octombrie": {
61 | return OCTOMBRIE;
62 | }
63 | case "noiembrie": {
64 | return NOIEMBRIE;
65 | }
66 | case "decembrie": {
67 | return DECEMBRIE;
68 | }
69 | default: {
70 | throw new RuntimeException("Unrecognized month label " + value);
71 | }
72 | }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-slate
--------------------------------------------------------------------------------
/afaceri/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul pentru Mediul de Afaceri, Comerț și Antreprenoriat
2 |
3 | The document source is http://www.antreprenoriat.gov.ro/categorie/transparenta-decizionala/proiecte-in-dezbatere-publica/ .
4 |
5 | ### Technology
6 | *NodeJS* - the server connects to the URL set in the config file, downloads the PDF files, parses their contents, sends the generated objects to the API, and deletes the PDF files from disk.
7 |
8 | ### Instructions
9 | The API authentication token must be set in *config.json*.
10 |
11 | The PDF content is processed as paragraphs. The server extracts the required data from the relevant paragraph, i.e. the first paragraph with more than 8 words and 50 letters in total (both thresholds are configurable in *config.json*); a rough sketch of this rule follows the commands below.
12 | ```
13 | npm install
14 | node server/server.js
15 | ```
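
As a rough illustration of the paragraph-selection rule described above (a sketch only, not the actual parser, which lives in `server/controllers/contentParser.js`; the `paragraphs` input and the function name are hypothetical):

```python
def pick_relevant_paragraph(paragraphs, min_words=8, min_letters=50):
    """Return the first paragraph with more than `min_words` words and more
    than `min_letters` letters, mirroring the firstParagraphMinWords and
    firstParagraphMinLetters settings in config.json."""
    for paragraph in paragraphs:
        word_count = len(paragraph.split())
        letter_count = sum(ch.isalpha() for ch in paragraph)
        if word_count > min_words and letter_count > min_letters:
            return paragraph
    return None
```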
16 |
17 | ### Exceptions
18 | Document dates do not come in a standardized format. Dates that can be interpreted do exist in the file URLs and in the file names; a small extraction sketch is included at the end of this README.
19 |
20 | On every server run, the files found at the main URL are (re)processed.
21 |
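The date extraction mentioned under Exceptions could look roughly like this (a hedged sketch only; the `dd.mm.yyyy`-style separator pattern and the example file name are assumptions, since the real URL and file-name formats are not documented here):

```python
import re

def date_from_name(name):
    """Best-effort extraction of a dd.mm.yyyy / dd-mm-yyyy fragment from a
    file name or URL, normalized to the API's yyyy-mm-dd format."""
    match = re.search(r'(\d{2})[.\-_](\d{2})[.\-_](\d{4})', name)
    if match:
        day, month, year = match.groups()
        return '{}-{}-{}'.format(year, month, day)
    return None

# Hypothetical example:
# date_from_name('proiect-hg-08.03.2017.pdf')  ->  '2017-03-08'
```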
--------------------------------------------------------------------------------
/afaceri/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "afaceri",
3 | "version": "1.0.0",
4 | "main": "server/server.js",
5 | "scripts": {
6 | "lint": "eslint .",
7 | "start": "node .",
8 | "posttest": "npm run lint && nsp check"
9 | },
10 | "dependencies": {
11 | "async": "^2.1.5",
12 | "cheerio": "^0.22.0",
13 | "compression": "^1.0.3",
14 | "cors": "^2.5.2",
15 | "helmet": "^1.3.0",
16 | "loopback": "^2.22.0",
17 | "loopback-boot": "^2.6.5",
18 | "loopback-component-explorer": "^2.4.0",
19 | "loopback-datasource-juggler": "^2.39.0",
20 | "pdf2json": "^1.1.7",
21 | "serve-favicon": "^2.0.1",
22 | "string": "^3.3.3",
23 | "strong-error-handler": "^1.0.1"
24 | },
25 | "devDependencies": {
26 | "eslint": "^2.13.1",
27 | "eslint-config-loopback": "^4.0.0",
28 | "nsp": "^2.1.0"
29 | },
30 | "repository": {
31 | "type": "",
32 | "url": ""
33 | },
34 | "license": "UNLICENSED",
35 | "description": "afaceri"
36 | }
37 |
--------------------------------------------------------------------------------
/afaceri/server/boot/authentication.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | module.exports = function enableAuthentication(server) {
4 | // enable authentication
5 | server.enableAuth();
6 | };
7 |
--------------------------------------------------------------------------------
/afaceri/server/boot/root.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | module.exports = function(server) {
4 | // Install a `/` route that returns server status
5 | var router = server.loopback.Router();
6 | router.get('/', server.loopback.status());
7 | server.use(router);
8 | };
9 |
--------------------------------------------------------------------------------
/afaceri/server/component-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "loopback-component-explorer": {
3 | "mountPath": "/explorer"
4 | }
5 | }
6 |
--------------------------------------------------------------------------------
/afaceri/server/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "restApiRoot": "/api",
3 | "host": "0.0.0.0",
4 | "port": 3000,
5 | "remoting": {
6 | "context": false,
7 | "rest": {
8 | "handleErrors": false,
9 | "normalizeHttpPath": false,
10 | "xml": false
11 | },
12 | "json": {
13 | "strict": false,
14 | "limit": "100kb"
15 | },
16 | "urlencoded": {
17 | "extended": true,
18 | "limit": "100kb"
19 | },
20 | "cors": false
21 | },
22 | "legacyExplorer": false,
23 | "logoutSessionsOnSensitiveChanges": true,
24 | "userAgent": "jesus",
25 | "downloadsFolder": "downloads",
26 | "firstParagraphMinWords": 8,
27 | "firstParagraphMinLetters": 50,
28 | "APIKey": "Token dummy",
29 | "mainURL": "http://www.antreprenoriat.gov.ro/categorie/transparenta-decizionala/proiecte-in-dezbatere-publica/"
30 | }
31 |
--------------------------------------------------------------------------------
/afaceri/server/config/keywords.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Andrei on 3/13/2017.
3 | */
4 |
5 | var docType = [
6 | {
7 | type: "LEGE",
8 | regex: new RegExp("proiect ([a-zA-Z]+\\s?){1,3} ordonan", "i")
9 | },
10 | {
11 | type: "OUG",
12 | regex: new RegExp("ordonan\\S{1,2} de urgen\\S{1,2}", "i")
13 | },
14 | {
15 | type: "HG",
16 | regex: new RegExp("hot\\S{1}r\\S{1}re", "i")
17 | }
18 | ];
19 |
20 | var titleStartMarkStrings = [
21 | "privind ",
22 | "pentru "
23 | ];
24 |
25 | var titleEndMarkStrings = [
26 | "\n",
27 | "\r\n"
28 | ];
29 |
30 | var titleEndMarkRegex = [
31 | new RegExp("sec\\S{1}iune", "i")
32 | ];
33 |
34 | module.exports = {
35 | docType: docType,
36 | titleStartMarkStrings: titleStartMarkStrings,
37 | titleEndMarkStrings: titleEndMarkStrings,
38 | titleEndMarkRegex: titleEndMarkRegex
39 | };
--------------------------------------------------------------------------------
/afaceri/server/datasources.json:
--------------------------------------------------------------------------------
1 | {
2 | "db": {
3 | "name": "db",
4 | "connector": "memory"
5 | }
6 | }
7 |
--------------------------------------------------------------------------------
/afaceri/server/middleware.development.json:
--------------------------------------------------------------------------------
1 | {
2 | "final:after": {
3 | "strong-error-handler": {
4 | "params": {
5 | "debug": true,
6 | "log": true
7 | }
8 | }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/afaceri/server/middleware.json:
--------------------------------------------------------------------------------
1 | {
2 | "initial:before": {
3 | "loopback#favicon": {}
4 | },
5 | "initial": {
6 | "compression": {},
7 | "cors": {
8 | "params": {
9 | "origin": true,
10 | "credentials": true,
11 | "maxAge": 86400
12 | }
13 | },
14 | "helmet#xssFilter": {},
15 | "helmet#frameguard": {
16 | "params": [
17 | "deny"
18 | ]
19 | },
20 | "helmet#hsts": {
21 | "params": {
22 | "maxAge": 0,
23 | "includeSubdomains": true
24 | }
25 | },
26 | "helmet#hidePoweredBy": {},
27 | "helmet#ieNoOpen": {},
28 | "helmet#noSniff": {},
29 | "helmet#noCache": {
30 | "enabled": false
31 | }
32 | },
33 | "session": {},
34 | "auth": {},
35 | "parse": {},
36 | "routes": {
37 | "loopback#rest": {
38 | "paths": [
39 | "${restApiRoot}"
40 | ]
41 | }
42 | },
43 | "files": {},
44 | "final": {
45 | "loopback#urlNotFound": {}
46 | },
47 | "final:after": {
48 | "strong-error-handler": {}
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/afaceri/server/model-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_meta": {
3 | "sources": [
4 | "loopback/common/models",
5 | "loopback/server/models",
6 | "../common/models",
7 | "./models"
8 | ],
9 | "mixins": [
10 | "loopback/common/mixins",
11 | "loopback/server/mixins",
12 | "../common/mixins",
13 | "./mixins"
14 | ]
15 | },
16 | "User": {
17 | "dataSource": "db"
18 | },
19 | "AccessToken": {
20 | "dataSource": "db",
21 | "public": false
22 | },
23 | "ACL": {
24 | "dataSource": "db",
25 | "public": false
26 | },
27 | "RoleMapping": {
28 | "dataSource": "db",
29 | "public": false,
30 | "options": {
31 | "strictObjectIDCoercion": true
32 | }
33 | },
34 | "Role": {
35 | "dataSource": "db",
36 | "public": false
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/afaceri/server/server.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | var loopback = require('loopback');
4 | var boot = require('loopback-boot');
5 | var contentParser = require('./controllers/contentParser');
6 |
7 | var app = module.exports = loopback();
8 |
9 | app.start = function() {
10 | // start the web server
11 | return app.listen(function() {
12 | app.emit('started');
13 | var baseUrl = app.get('url').replace(/\/$/, '');
14 | console.log('Web server listening at: %s', baseUrl);
15 | if (app.get('loopback-component-explorer')) {
16 | var explorerPath = app.get('loopback-component-explorer').mountPath;
17 | console.log('Browse your REST API at %s%s', baseUrl, explorerPath);
18 | }
19 | contentParser.init();
20 | });
21 | };
22 |
23 | // Bootstrap the application, configure models, datasources and middleware.
24 | // Sub-apps like REST API are mounted via boot scripts.
25 | boot(app, __dirname, function(err) {
26 | if (err) throw err;
27 |
28 | // start the server if `$ node server.js`
29 | if (require.main === module)
30 | app.start();
31 | });
32 |
--------------------------------------------------------------------------------
/agricultura/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules/
--------------------------------------------------------------------------------
/agricultura/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Agriculturii Şi Dezvoltării Rurale
2 |
3 | ## Technology
4 | NodeJS, [Nightmare](http://www.nightmarejs.org)
5 |
6 | ## Instructions
7 | ```
8 | npm install
9 | API_TOKEN=the_secret_api_token npm start
10 | ```
11 |
12 | ## Exceptions
13 |
--------------------------------------------------------------------------------
/agricultura/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "agricultura",
3 | "version": "1.0.0",
4 | "description": "scraper pentru agricultura",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "todo: add tests",
8 | "start": "node index.js"
9 | },
10 | "repository": {
11 | "type": "git",
12 | "url": "git+https://github.com/ciprian-chichirita/czl-scrape.git"
13 | },
14 | "keywords": [
15 | "code4romania",
16 | "ce",
17 | "zice",
18 | "legea",
19 | "agricultura"
20 | ],
21 | "author": "ciprian chichirita, alex morega",
22 | "license": "MIT",
23 | "bugs": {
24 | "url": "https://github.com/ciprian-chichirita/czl-scrape/issues"
25 | },
26 | "homepage": "https://github.com/ciprian-chichirita/czl-scrape#readme",
27 | "devDependencies": {
28 | "moment": "^2.17.1",
29 | "nightmare": "^2.10.0",
30 | "request": "^2.81.0",
31 | "request-promise": "^4.1.1",
32 | "sha256": "^0.2.0"
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/aparare/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Apărării Naţionale
2 | Document source: http://dlaj.mapn.ro/
3 | ## Technology
4 | *PHP* - a simple, old-school script
5 | ## Instructions
6 | No special instructions.
7 |
8 | The token is passed as an argument:
9 | ```bash
10 | $ php mapn_plugin.php TOKEN
11 | ```
12 | ## Exceptions
13 | Because the HTML page is not consistent, regular expressions were used to extract the information.
14 |
15 | One problem is that, on this site, an entry effectively consists of two elements, namely the
16 | project title and its associated documents, and the two cannot be logically linked to each other. Because of this, the script
17 | only works when it finds the same number of titles and document groups (a small sketch of this check follows below).
18 |
19 | The script returns false in the following situations:
20 | * the page is down
21 | * one of the key content elements has changed (titles no longer contain *, the path to the documents has changed)
22 | * the number of titles and the number of document groups differ
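
To make the pairing rule above concrete, here is a minimal sketch (illustrative only; the real logic lives in `mapn_plugin.php`, and the `titles` / `doc_groups` inputs are hypothetical):

```python
def pair_titles_with_documents(titles, doc_groups):
    """Pair each project title with its document group by position, but only
    when both lists have the same length; otherwise give up, mirroring the
    script's "return false" behaviour."""
    if len(titles) != len(doc_groups):
        return None
    return list(zip(titles, doc_groups))
```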
--------------------------------------------------------------------------------
/apepaduri/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Apelor și Pădurilor
2 |
3 | ## Technology
4 |
5 | ## Instructions
6 |
7 | ## Exceptions
--------------------------------------------------------------------------------
/cdep/README.md:
--------------------------------------------------------------------------------
1 | # Camera Deputatilor
2 |
3 | ## Technology
4 | python, scrapy
5 |
6 | ## Instructions
7 | ```
8 | pip install -r requirements.txt
9 | API_TOKEN='the secret token' python scraper.py
10 | ```
11 |
12 | ## Exceptions
13 |
--------------------------------------------------------------------------------
/cdep/requirements.in:
--------------------------------------------------------------------------------
1 | scrapy
2 | requests
3 |
--------------------------------------------------------------------------------
/cdep/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile
3 | # To update, run:
4 | #
5 | # pip-compile --output-file requirements.txt requirements.in
6 | #
7 | asn1crypto==0.21.1 # via cryptography
8 | attrs==19.1.0 # via automat, service-identity, twisted
9 | automat==0.5.0 # via twisted
10 | certifi==2019.9.11 # via requests
11 | cffi==1.9.1 # via cryptography
12 | chardet==3.0.4 # via requests
13 | constantly==15.1.0 # via twisted
14 | cryptography==2.7 # via pyopenssl
15 | cssselect==1.0.1 # via parsel, scrapy
16 | hyperlink==19.0.0 # via twisted
17 | idna==2.7 # via hyperlink, requests
18 | incremental==16.10.1 # via twisted
19 | lxml==3.7.3 # via parsel, scrapy
20 | parsel==1.1.0 # via scrapy
21 | pyasn1-modules==0.0.8 # via service-identity
22 | pyasn1==0.2.3 # via pyasn1-modules, service-identity
23 | pycparser==2.17 # via cffi
24 | pydispatcher==2.0.5 # via scrapy
25 | pyhamcrest==1.9.0 # via twisted
26 | pyopenssl==17.5.0 # via scrapy, service-identity
27 | queuelib==1.4.2 # via scrapy
28 | requests==2.20.0
29 | scrapy==1.3.3
30 | service-identity==16.0.0 # via scrapy
31 | six==1.10.0 # via automat, cryptography, parsel, pyhamcrest, pyopenssl, scrapy, w3lib
32 | twisted==19.7.0 # via scrapy
33 | urllib3==1.24.3 # via requests
34 | w3lib==1.17.0 # via parsel, scrapy
35 | zope.interface==4.6.0 # via twisted
36 |
37 | # The following packages are considered to be unsafe in a requirements file:
38 | # setuptools==41.2.0 # via pyhamcrest, zope.interface
39 |
--------------------------------------------------------------------------------
/cdep/scraper.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import re
3 | import requests
4 | import os
5 |
6 | API_URL = 'http://czl-api.code4.ro/api/publications/'
7 | API_TOKEN = os.environ['API_TOKEN']
8 |
9 | INDEX_URL = 'http://www.cdep.ro/pls/proiecte/upl_pck2015.lista?cam=2&anp=2017'
10 |
11 | def upload(doc):
12 | headers = {'Authorization': 'Token ' + API_TOKEN}
13 | resp = requests.post(API_URL, json=doc, headers=headers)
14 | if resp.status_code == 400:
15 | if re.search(r'Integrity Error: Key .* already exists', resp.text):
16 | return
17 | assert resp.status_code == 201
18 |
19 | class CdepSpider(scrapy.Spider):
20 |
21 | name = 'cdep'
22 | start_urls = [INDEX_URL]
23 |
24 | def parse(self, response):
25 | for tr in response.css('.grup-parlamentar-list > table > tbody > tr'):
26 | href = tr.css('a::attr(href)').extract_first()
27 | url = response.urljoin(href)
28 | yield scrapy.Request(url, self.parse_proposal)
29 |
30 | def parse_proposal(self, response):
31 | cale_txt = ' '.join(t.extract() for t in response.css('.cale *::text'))
32 | plx_code = 'pl-x ' + re.search(r'pl-x\s+(\S+)', cale_txt.lower()).group(1)
33 | title = response.css('.detalii-initiativa h4::text').extract_first()
34 |
35 | table = response.css('#olddiv > table')[-1]
36 | for td in table.css('td'):
37 | td_text = (td.css('::text').extract_first() or '').strip()
38 | m = re.match(r'^(\d{2})\.(\d{2})\.(\d{4})$', td_text)
39 | if m:
40 | date = '{}-{}-{}'.format(m.group(3), m.group(2), m.group(1))
41 | break
42 |
43 | documents = []
44 |
45 | for pdf_link in response.css('.program-lucru-detalii a'):
46 | target = pdf_link.css('::attr(target)').extract_first() or ''
47 | if target.lower() != 'pdf':
48 | continue
49 | pdf_href = pdf_link.css('::attr(href)').extract_first()
50 | pdf_url = response.urljoin(pdf_href)
51 | label_tds = pdf_link.xpath('../../td')
52 | pdf_label = ' '.join(
53 | td.css('::text').extract_first()
54 | for td in label_tds[1:]
55 | ).strip()
56 | documents.append({
57 | 'type': pdf_label,
58 | 'url': pdf_url,
59 | })
60 |
61 | doc = {
62 | 'identifier': plx_code,
63 | 'title': title,
64 | 'institution': 'cdep',
65 | 'description': '',
66 | 'type': 'LEGE',
67 | 'date': date,
68 | 'documents': documents,
69 | }
70 | upload(doc)
71 |
72 | def main():
73 | from scrapy.crawler import CrawlerProcess, Crawler
74 | process = CrawlerProcess()
75 | process.crawl(CdepSpider)
76 | process.start()
77 |
78 | if __name__ == '__main__':
79 | main()
80 |
--------------------------------------------------------------------------------
/cercetare/.editorconfig:
--------------------------------------------------------------------------------
1 | [*]
2 | charset=utf-8
3 | end_of_line=crlf
4 | insert_final_newline=false
5 | indent_style=space
6 | indent_size=4
7 |
8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}]
9 | indent_style=space
10 | indent_size=2
11 |
12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}]
13 | indent_style=space
14 | indent_size=2
15 |
16 | [{*.applejs,*.js}]
17 | indent_style=space
18 | indent_size=4
19 |
20 | [{.analysis_options,*.yml,*.yaml}]
21 | indent_style=space
22 | indent_size=2
23 |
24 |
--------------------------------------------------------------------------------
/cercetare/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | node_modules
3 | secrets.json
4 | data.json
--------------------------------------------------------------------------------
/cercetare/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Cercetării și Inovării
2 |
3 | ## Technology
4 |
5 |
6 | JavaScript (ECMAScript 2015 / ES6)
7 |
8 | 1. nodejs - https://nodejs.org/en/
9 | 1. nightmare - https://github.com/segmentio/nightmare
10 | 1. cheerio - https://github.com/cheeriojs/cheerio
11 | 1. jsonfile - https://github.com/jprichardson/node-jsonfile
12 | 1. request - https://github.com/request/request
13 | 1. argv - https://github.com/yargs/yargs
14 | 1. diacritics - https://github.com/andrewrk/node-diacritics
15 |
16 | ## Instructions
17 |
18 | 1. install nodejs
19 | 1. run `npm update`
20 | 1. run `node app.js`; passing the `--post` param uploads the results to the API and also generates a `data.json` file so you can view the data.
21 |
22 | ## Exceptions
23 |
--------------------------------------------------------------------------------
/cercetare/app.js:
--------------------------------------------------------------------------------
1 | let nightmareConfig = {show: false},
2 | cheerio = require('cheerio'),
3 | request = require('request'),
4 | parseProject = require('./parseProject'),
5 | jsonfile = require('jsonfile'),
6 | argv = require('yargs').argv,
7 | secrets = require('./secrets.json') || {};
8 |
9 | const URL = 'http://www.research.gov.ro/ro/articol/1029/despre-ancs-legislatie-proiecte-de-acte-normative',
10 | BASE = 'http://www.research.gov.ro';
11 |
12 | const FILE = 'data.json';
13 |
14 | /** ====== MAIN ====== */
15 |
16 | getNightmareInstance()
17 | .goto(URL)
18 | .wait('body')
19 | .evaluate(getHTMLContent)
20 | .end()
21 | .then(processHTMLContent)
22 | .then(parseListItems)
23 | .then(postParsedResults)
24 | .catch(handleErrors);
25 |
26 |
27 | /** ====== page ====== */
28 |
29 | function getHTMLContent() {
30 | return document.querySelector('.icr_main .special_edit').innerHTML;
31 | }
32 |
33 | function processHTMLContent(result) {
34 | console.log('processing html page...');
35 |
36 | return {
37 | feedback_days_element: cheerio.load(result)('p').children('a[href^=mailto]').parent()[0],
38 | items: cheerio.load(result)('table tbody tr') //.not(function(item) {return cheerio.load(item).text() && cheerio.load(item).text().indexOf('Data publicarii') === -1})
39 | };
40 | }
41 |
42 |
43 | /** ====== list items ====== */
44 |
45 | function parseListItems(resultObject) {
46 | let items = resultObject.items,
47 | parseResults = [];
48 |
49 | items.each(function (i, item) {
50 | let $ = cheerio.load(item),
51 | content = $.text().replace(/\n/g, '').replace(/\t/g, '');
52 |
53 | if(content && content.indexOf('Data publicarii') != 0) {
54 | parseResults.push(parseItem(resultObject.feedback_days_element, item));
55 | }
56 | });
57 |
58 | return parseResults;
59 | }
60 |
61 | function parseItem(feedback_days, item) {
62 | return parseProject(cheerio.load(item), BASE, cheerio.load(feedback_days));
63 | }
64 |
65 |
66 | /** ====== post ====== */
67 |
68 | function postParsedResults(parsedResultsArr) {
69 |
70 | console.log('saving data to file...');
71 |
72 | jsonfile.writeFileSync(FILE, parsedResultsArr, {spaces: 4});
73 |
74 | if (argv.post) {
75 | if (!(secrets.API_URL && secrets.TOKEN)) {
76 | throw new Error('Share your secrets with me. Pretty please :)');
77 | }
78 |
79 | console.log('posting data to api...');
80 |
81 | let requestsArr = [];
82 |
83 | parsedResultsArr.forEach(function (result, i) {
84 | let promise = new Promise(function (resolve, reject) {
85 | request({
86 | uri: secrets.API_URL,
87 | method: 'POST',
88 | headers: {
89 | 'Authorization': 'Token ' + secrets.TOKEN,
90 | 'Content-Type': 'application/json'
91 | },
92 | json: result
93 | }, function (error, response, body) {
94 | if (error || (response.statusCode !== 200 && response.statusCode !== 201)) {
95 | console.error('request failed: ', error);
96 | }
97 |
98 | resolve(body);
99 | })
100 | });
101 |
102 | requestsArr.push(promise);
103 | });
104 |
105 | Promise.all(requestsArr).then(function (response) {
106 | console.log('done!');
107 | process.exit(0);
108 | }).catch(function (err) {
109 | throw new Error(err);
110 | });
111 | } else {
112 | console.log('done!');
113 | process.exit(0);
114 | }
115 | }
116 |
117 |
118 | /** ====== utils ====== */
119 |
120 | function getNightmareInstance() {
121 | return require('nightmare')(nightmareConfig);
122 | }
123 |
124 | function handleErrors(error) {
125 | throw new Error(error);
126 | }
--------------------------------------------------------------------------------
/cercetare/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pretutindeni",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "app.js",
6 | "scripts": {
7 | "crawl": "node app.js"
8 | },
9 | "author": "",
10 | "license": "ISC",
11 | "dependencies": {
12 | "cheerio": "0.22.0",
13 | "diacritics": "1.3.0",
14 | "jsonfile": "2.4.0",
15 | "nightmare": "2.10.0",
16 | "nodemon": "1.11.0",
17 | "q": "1.4.1",
18 | "request": "^2.81.0",
19 | "yargs": "7.0.2"
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/cercetare/secrets.json.txt:
--------------------------------------------------------------------------------
1 | {
2 | "TOKEN": "something something",
3 | "API_URL": "http://something.com/api/post-parsed-results"
4 | }
--------------------------------------------------------------------------------
/dezvoltare/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | .DS_Store
4 | *.egg-info
5 | build
6 | *.pyc
7 | **/*.pyc
8 | dbs
9 |
--------------------------------------------------------------------------------
/dezvoltare/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Dezvoltării Regionale, Administrației Publice și Fondurilor Europene
2 |
3 | http://www.mdrap.gov.ro/transparenta/consultari-publice/
4 |
5 | ## Technology
6 |
7 | *Python 2.7*
8 | [Scrapy 1.3.3](https://scrapy.org/)
9 |
10 | ## Instructions
11 |
12 | ```
13 | pip install -r requirements.txt
14 | cd crawl_dezvoltare
15 | scrapy crawl mdrap -a token=xxxx
16 | ```
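
The `-a token=xxxx` argument is exposed by Scrapy as an attribute on the spider, which is how `CrawlDezvoltarePipeline` can read `spider.token` when posting to the API. A minimal sketch of that wiring (the spider below is hypothetical and only illustrates the mechanism; the real spider is `spiders/mdrap.py`):

```python
import scrapy

class ExampleSpider(scrapy.Spider):
    # Running `scrapy crawl example -a token=xxxx` makes Scrapy pass
    # token='xxxx' to __init__, so pipelines can later read spider.token.
    name = 'example'

    def __init__(self, token=None, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        self.token = token
```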
17 |
18 | ## Exceptions
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/__init__.py
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/exporters.py:
--------------------------------------------------------------------------------
1 | from scrapy.exporters import BaseItemExporter
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class CrawlDezvoltareItem(scrapy.Item):
12 | identifier = scrapy.Field()
13 | title = scrapy.Field()
14 | type = scrapy.Field()
15 | institution = scrapy.Field()
17 | date = scrapy.Field()
18 | description = scrapy.Field()
19 | feedback_days = scrapy.Field()
20 | contact = scrapy.Field()
21 | tel = scrapy.Field()
22 | email = scrapy.Field()
23 | documents = scrapy.Field()
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class CrawlDezvoltareSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import requests
8 |
9 | class CrawlDezvoltarePipeline(object):
10 | def process_item(self, item, spider):
11 | doc = {
12 | 'identifier': item['identifier'],
13 | 'title': item['title'],
14 | 'institution': item['institution'],
15 | 'description': item['description'],
16 | 'type': item['type'],
17 | 'date': item['date'],
18 | 'documents': item['documents'],
19 | 'contact':item['contact'],
20 | 'feedback_days': item['feedback_days']
21 | }
22 |
23 | response = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token ' + spider.token }, json=doc)
24 | # print '---------'
25 | # print response
26 | # print response.text
27 | # print '---------'
28 | return item
29 |
30 |
31 |
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for crawl_dezvoltare project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'crawl_dezvoltare'
13 |
14 | SPIDER_MODULES = ['crawl_dezvoltare.spiders']
15 | NEWSPIDER_MODULE = 'crawl_dezvoltare.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'code4romania (http://code4.ro)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'crawl_dezvoltare.middlewares.CrawlDezvoltareSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'crawl_dezvoltare.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'crawl_dezvoltare.pipelines.CrawlDezvoltarePipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/spiders/testing.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | item = {'contact': {'addr': u'Apolodor, nr. 17, sector 5',
5 | 'email': u'iulia.matei@mdrap.ro',
6 | 'fax': u'0372.114.569.'},
7 | 'date': u'22-02-2017',
8 | 'description': u'\xcen temeiul art. 7 din Legea nr. 52/2003 privind transparen\u0163a decizional\u0103 \xeen administra\u0163ia public\u0103, republicat\u0103, Ministerul Dezvolt\u0103rii Regionale, Administra\u0163iei Publice si Fondurilor Europene aduce la cuno\u015ftin\u0163a publicului textul urm\u0103torului proiect de act normativ \u2013 Ordin al viceprim-ministrului, ministrul dezvolt\u0103rii regionale, administra\u0163iei publice \u0219i fondurilor europene pentru aplicarea prevederilor art. III, alin. (11) din Ordonan\u0163a de urgen\u0163\u0103 a Guvernului nr. 63/2010 pentru modificarea \u015fi completarea Legii nr. 273/2006 privind finan\u0163ele publice locale, precum \u015fi pentru stabilirea unor m\u0103suri financiare.',
9 | 'documents': [{'type': u'Referat de aprobare',
10 | 'url': '/userfiles/referat_ordin_oug63.doc'}],
11 | 'feedback_days': u'10',
12 | 'identifier': u'proiect-de-omdrapfe-pentru-aplicarea-prevederilor-art-iii-alin-11-din-ordonanta-de-urgenta-a-guvernului-nr-632010-pentru-modificarea-si-completarea-legii-nr-2732006-privind-finantele-publice-locale-precum-si-pentru-stabilirea-unor-masuri-financiare-22-02-2017',
13 | 'institution': 'dezvoltare',
14 | 'title': u'Proiect de OMDRAPFE pentru aplicarea prevederilor art. III, alin. (11) din Ordonan\u0163a de urgen\u0163\u0103 a Guvernului nr. 63/2010 pentru modificarea \u015fi completarea Legii nr. 273/2006 privind finan\u0163ele publice locale, precum \u015fi pentru stabilirea unor m\u0103suri financiare ',
15 | 'type': 'OMDRAPFE'}
16 |
17 | r = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token dezvoltare-very-secret-token'}, data=item)
18 |
--------------------------------------------------------------------------------
/dezvoltare/crawl_dezvoltare/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = crawl_dezvoltare.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawl_dezvoltare
12 |
--------------------------------------------------------------------------------
/dezvoltare/requirements.txt:
--------------------------------------------------------------------------------
1 | appdirs==1.4.3
2 | appnope==0.1.0
3 | asn1crypto==0.21.1
4 | attrs==16.3.0
5 | Automat==0.5.0
6 | backports.shutil-get-terminal-size==1.0.0
7 | beautifulsoup4==4.5.3
8 | cffi==1.9.1
9 | constantly==15.1.0
10 | cryptography==1.8.1
11 | cssselect==1.0.1
12 | decorator==4.0.11
13 | enum34==1.1.6
14 | idna==2.5
15 | incremental==16.10.1
16 | ipaddress==1.0.18
17 | ipython==5.3.0
18 | ipython-genutils==0.1.0
19 | lxml==3.7.3
20 | packaging==16.8
21 | parsel==1.1.0
22 | pathlib2==2.2.1
23 | pexpect==4.2.1
24 | pickleshare==0.7.4
25 | prompt-toolkit==1.0.13
26 | ptyprocess==0.5.1
27 | pyasn1==0.2.3
28 | pyasn1-modules==0.0.8
29 | pycparser==2.17
30 | PyDispatcher==2.0.5
31 | Pygments==2.2.0
32 | pyOpenSSL==17.5.0
33 | pyparsing==2.2.0
34 | queuelib==1.4.2
35 | requests==2.20.0
36 | scandir==1.5
37 | Scrapy==1.3.3
38 | service-identity==16.0.0
39 | simplegeneric==0.8.1
40 | six==1.10.0
41 | slugify==0.0.1
42 | traitlets==4.3.2
43 | Twisted==19.7.0
44 | w3lib==1.17.0
45 | wcwidth==0.1.7
46 | zope.interface==4.3.3
47 |
--------------------------------------------------------------------------------
/economie/.editorconfig:
--------------------------------------------------------------------------------
1 | [*]
2 | charset=utf-8
3 | end_of_line=crlf
4 | insert_final_newline=false
5 | indent_style=space
6 | indent_size=4
7 |
8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}]
9 | indent_style=space
10 | indent_size=2
11 |
12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}]
13 | indent_style=space
14 | indent_size=2
15 |
16 | [{*.applejs,*.js}]
17 | indent_style=space
18 | indent_size=4
19 |
20 | [{.analysis_options,*.yml,*.yaml}]
21 | indent_style=space
22 | indent_size=2
23 |
24 |
--------------------------------------------------------------------------------
/economie/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | node_modules
3 | secrets.json
4 | data.json
--------------------------------------------------------------------------------
/economie/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Economiei, Comerțului și Relațiilor cu Mediul de Afaceri
2 |
3 | ## Technology
4 |
5 | JavaScript (ECMAScript 2015 / ES6)
6 |
7 | 1. nodejs - https://nodejs.org/en/
8 | 1. nightmare - https://github.com/segmentio/nightmare
9 | 1. cheerio - https://github.com/cheeriojs/cheerio
10 | 1. jsonfile - https://github.com/jprichardson/node-jsonfile
11 | 1. request - https://github.com/request/request
12 | 1. argv - https://github.com/yargs/yargs
13 | 1. diacritics - https://github.com/andrewrk/node-diacritics
14 |
15 | ## Instructions
16 |
17 | 1. install nodejs
18 | 1. run `npm update`
19 | 1. run `node app.js`; passing the `--post` param uploads the results to the API and also generates a `data.json` file so you can view the data.
20 |
21 | ## Exceptions
22 |
23 |
--------------------------------------------------------------------------------
/economie/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pretutindeni",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "app.js",
6 | "scripts": {
7 | "crawl": "node app.js"
8 | },
9 | "author": "",
10 | "license": "ISC",
11 | "dependencies": {
12 | "cheerio": "0.22.0",
13 | "diacritics": "1.3.0",
14 | "jsonfile": "2.4.0",
15 | "nightmare": "2.10.0",
16 | "nodemon": "1.11.0",
17 | "q": "1.4.1",
18 | "request": "^2.81.0",
19 | "yargs": "7.0.2"
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/economie/secrets.json.txt:
--------------------------------------------------------------------------------
1 | {
2 | "TOKEN": "something something",
3 | "API_URL": "http://something.com/api/post-parsed-results"
4 | }
--------------------------------------------------------------------------------
/educatie/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Educaţiei Naţionale și Cercetării Științifice
2 |
3 | ## Technology
4 |
5 | Node.js, [nightmare](http://www.nightmarejs.org/)
6 |
7 | ## Instructions
8 |
9 | ```
10 | npm install
11 | ```
12 |
13 | edit config.js, change API token (can also be specified on the command line) and other config vars
14 |
15 | ```
16 | [API_TOKEN=foobar] npm start
17 | ```
18 |
19 | ## Exceptions
20 |
--------------------------------------------------------------------------------
/educatie/config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | api: {
3 | url: 'http://czl-api.code4.ro/api/publications/',
4 | token: 'educatie-very-secret-key'
5 | },
6 | scrape: {
7 | //url of the proposals listing page
8 | baseUrl: 'https://www.edu.ro/proiecte-acte-normative-0',
9 | //how many proposals to consider
10 | proposals: 20,
11 | defaultEmail: 'dgis@edu.gov.ro'
12 | }
13 | };
14 |
--------------------------------------------------------------------------------
/educatie/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "edu-scraper",
3 | "version": "1.0.0",
4 | "description": "Data scraper pentru Ministerul Educatiei",
5 | "main": "index.js",
6 | "scripts": {
7 | "start": "node index.js",
8 | "test": "echo \"Error: no test specified\" && exit 1"
9 | },
10 | "repository": {
11 | "type": "git",
12 | "url": "git+https://github.com/lbogdan/czl-scrape"
13 | },
14 | "author": {
15 | "name": "Bogdan Luca",
16 | "email": "luca.bogdan@gmail.com"
17 | },
18 | "license": "MIT",
19 | "dependencies": {
20 | "diacritics": "^1.3.0",
21 | "jsonfile": "^2.4.0",
22 | "moment": "^2.17.1",
23 | "nightmare": "^2.10.0",
24 | "request-promise": "^4.1.1"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/energie/.gitignore:
--------------------------------------------------------------------------------
1 | # OS files
2 | .DS_Store
3 |
4 | # Java files
5 | *.class
6 |
7 | # Log files
8 | *.log
9 | logs
10 |
11 | # Maven
12 | target
13 | pom.xml.versionsBackup
14 |
15 | # Dropwizard
16 | dependency-reduced-pom.xml
17 |
18 | # Mobile Tools for Java (J2ME)
19 | .mtj.tmp/
20 |
21 | # Package Files
22 | *.jar
23 | *.war
24 | *.ear
25 |
26 | # IntelliJ IDEA
27 | *.iml
28 | .idea
29 |
30 | # Eclipse
31 | .project
32 | .settings
33 | .classpath
34 | test-output
35 |
36 | # Vim
37 | *.swp
38 |
39 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
40 | hs_err_pid*
41 |
42 | # Misc
43 | *git.properties
44 |
45 | # Asciidoc
46 | .asciidoctor
47 | diag-*.png
48 |
--------------------------------------------------------------------------------
/energie/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Energiei
2 |
3 | ## Technology
4 |
5 | ## Instructions
6 |
7 | ## Exceptions
--------------------------------------------------------------------------------
/energie/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |
6 |   <parent>
7 |     <groupId>ro.code4.czl</groupId>
8 |     <artifactId>czl-scrape</artifactId>
9 |     <version>0.0.1-SNAPSHOT</version>
10 |     <relativePath>../</relativePath>
11 |   </parent>
12 |
13 |   <artifactId>czl-scrape-energie</artifactId>
14 |   <packaging>jar</packaging>
15 |   <name>Ce Zice Legea :: Scraper :: Energie</name>
16 |
17 |   <dependencies>
18 |     <dependency>
19 |       <groupId>${project.groupId}</groupId>
20 |       <artifactId>czl-scrape-commons</artifactId>
21 |       <version>${project.version}</version>
22 |     </dependency>
23 |
24 |     <dependency>
25 |       <groupId>org.jsoup</groupId>
26 |       <artifactId>jsoup</artifactId>
27 |     </dependency>
28 |
29 |     <dependency>
30 |       <groupId>ch.qos.logback</groupId>
31 |       <artifactId>logback-classic</artifactId>
32 |       <scope>runtime</scope>
33 |     </dependency>
34 |   </dependencies>
35 | </project>
--------------------------------------------------------------------------------
/energie/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |
4 |   <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
5 |     <file>scraper-energie.log</file>
6 |
7 |     <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
8 |       <fileNamePattern>scraper-energie.%i.log.zip</fileNamePattern>
9 |       <minIndex>1</minIndex>
10 |       <maxIndex>3</maxIndex>
11 |     </rollingPolicy>
12 |
13 |     <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
14 |       <maxFileSize>500MB</maxFileSize>
15 |     </triggeringPolicy>
16 |
17 |     <encoder>
18 |       <pattern>%date{"yyyy-MM-dd'T'HH:mm:ss,SSSXXX", UTC} [%t] %-5level %c{1.} %msg%n</pattern>
19 |     </encoder>
20 |   </appender>
21 |
22 |   <appender name="ASYNC" class="ch.qos.logback.classic.AsyncAppender">
23 |     <appender-ref ref="FILE"/>
24 |     <queueSize>2048</queueSize>
25 |     <discardingThreshold>0</discardingThreshold>
26 |     <includeCallerData>false</includeCallerData>
27 |   </appender>
28 |
29 |   <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
30 |     <encoder>
31 |       <pattern>%date{"yyyy-MM-dd'T'HH:mm:ss,SSSXXX", UTC} [%t] %-5level %c{1.} %msg%n</pattern>
32 |     </encoder>
33 |   </appender>
34 |
35 |   <root level="INFO">
36 |     <appender-ref ref="ASYNC"/>
37 |     <appender-ref ref="CONSOLE"/>
38 |   </root>
39 | </configuration>
--------------------------------------------------------------------------------
/externe/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Afacerilor Externe
2 |
3 | ## Technology
4 | - Python 3 (developed and tested on 3.5.2)
5 | - BeautifulSoup 4
6 | - Requests
7 | - Click
8 | - **E**xtraordinarily **U**nderwhelming but also **S**uper **E**levated **B**inary **I**nformation **U**nit.
9 |
10 | A.K.A Eusebiu.
11 |
12 | ## Instructions
13 | To convince Eusebiu to go through the articles on the MAE website, you need to:
14 | - install `python3` and `pip`
15 | - run `python3 setup.py install` (prefix it with `sudo` if you are not using a virtualenv)
16 | - to find out what Eusebiu can do for humanity, run `python eusebiu.py --help`:
17 | ```
18 | Options:
19 | --page TEXT Selects the page to scrape. Available options are:
20 |
21 | scrapes the latest articles and falls back to
22 | observer mode
23 | ____________________________________________________
24 |
25 | scrape the 2016 archive and switch to
26 | observer mode
27 | ____________________________________________________
28 |
29 | scrape the 2014-2015 archive and switch
30 | to observer mode
31 | ____________________________________________________
32 | --log_level TEXT Sets the logging level. Available values: ERROR,
33 | WARNING, INFO, DEBUG,
34 | --delay FLOAT Number of hours to wait before checking for changes.
35 | Default=1
36 | --observer Periodically checks for changes and scrapes them if
37 | available.
38 | --help Show this message and exit.
39 | ```
40 | ## Exceptions
41 | Eusebiu relies largely on regexes to extract (by force or otherwise) information
42 | from the MAE website.
43 |
44 | If the people responsible for entering articles into the system
45 | suddenly decide to use patterns other than the ones Eusebiu understands, the scraper will
46 | produce invalid articles. If an article does not contain all the mandatory details, Eusebiu will not
47 | POST it to the API (see the sketch below).
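
A minimal sketch of that validation gate (it mirrors `ArticleSerializer.is_valid` from `scraper/article_serializer.py`; the field names listed here are illustrative, the real list comes from `MANDATORY_FIELDS` in `utils/settings.py`):

```python
MANDATORY_FIELDS = ('identifier', 'title', 'article_type', 'published_at')  # illustrative subset

def should_post(article):
    """Only POST an article whose mandatory fields are all set."""
    return all(getattr(article, field, None) for field in MANDATORY_FIELDS)
```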
48 |
--------------------------------------------------------------------------------
/externe/__init__.py:
--------------------------------------------------------------------------------
1 | VERSION = '17.03.12'
2 |
--------------------------------------------------------------------------------
/externe/eusebiu.py:
--------------------------------------------------------------------------------
1 | import time
2 | import click
3 | import inspect
4 | import logging
5 | import os
6 |
7 | from scraper.article_serializer import ArticleSerializer
8 | from scraper.extractor import *
9 | from utils.api_client import post_data
10 | from utils.settings import *
11 |
12 |
13 | @click.command()
14 | @click.option('--page', default='feed', help=CLICK_HELPER['page'])
15 | @click.option('--log_level', default='INFO', help=CLICK_HELPER['log-level'])
16 | @click.option('--delay', default=1, type=float, help=CLICK_HELPER['delay'])
17 | @click.option('--observer', is_flag=True, default=False,
18 | help=CLICK_HELPER['observer'])
19 | def get_to_work(page, delay, observer, log_level):
20 | # init logging
21 | if log_level not in LOG_LEVELS:
22 | logging.warning('Unrecognized log_level: %s. Defaulting to INFO', log_level)
23 | log_level = 'INFO'
24 |
25 | current_dir = os.path.dirname(
26 | os.path.abspath(inspect.getfile(inspect.currentframe()))
27 | )
28 | logs_dir = current_dir + LOGS_DIR
29 | if not os.path.exists(logs_dir):
30 | os.makedirs(logs_dir)
31 |
32 | logging.basicConfig(filename=LOG_FILE, level=LOG_LEVELS[log_level],
33 | format='%(asctime)s %(levelname)s %(message)s')
34 |
35 | # if observer flag is set, ignore everything else and start eavesdropping
36 | if observer:
37 | shut_up_and_listen(delay)
38 |
39 | # validate page selection
40 | if page not in SCRAPER_PAGES:
41 | logging.error('Page name: %s not recognized. See help for available pages', page)
42 | exit()
43 |
44 | # scrape all articles on this page, and dump them on the API
45 | dump_one_of_these(page)
46 |
47 | # then get back to eavesdropping
48 | shut_up_and_listen(delay)
49 |
50 |
51 | def shut_up_and_listen(delay):
52 | """ Eusebiu skillfully lurks in the shadows, waiting for a new article to be posted.
53 | :param delay: int: number of hours to wait before the next tactical strike.
54 | :return: None
55 | """
56 | current_latest = []
57 | while True:
58 | feed_extractor = Extractor(settings.URLS.get('feed'))
59 | latest_entries = feed_extractor.get_identifier_list()
60 | logging.debug('latest_entries: %s', latest_entries)
61 |
62 | if not current_latest:
63 | logging.info('Assuming current state of feed is the latest ...')
64 | current_latest = latest_entries[:]
65 |
66 | diff = set(latest_entries) - set(current_latest)
67 | for identifier in diff:
68 | # be polite to the MAE website
69 | time.sleep(0.5)
70 | logging.info('Found new article: %s', identifier)
71 | article = feed_extractor.get_article_by_id(identifier)
72 | post_article(article)
73 | current_latest = latest_entries[:]
74 |
75 | logging.info('ETA until next scrape: %s hour(s)', delay)
76 | time.sleep(hours_to_sec(delay))
77 |
78 |
79 | def dump_one_of_these(page):
80 | """
81 | Eusebiu masterfully extracts all the articles on a given page, and swiftly dumps
82 | them onto the API.
83 | :param page: the page to eviscerate.
84 | :return: None
85 | """
86 | extractor = Extractor(settings.URLS.get(page))
87 | articles = extractor.get_all_articles()
88 | for article in articles:
89 | # be polite to the API
90 | time.sleep(0.5)
91 | post_article(article)
92 |
93 |
94 | def post_article(article):
95 | """Attempts to POST and article to the API.
96 | :param article: the object to POST.
97 | :return: True if successful, False otherwise.
98 | """
99 | if not ArticleSerializer().is_valid(article):
100 | logging.error('Invalid article: %s \n WILL NOT POST TO API', article)
101 | return False
102 | data = ArticleSerializer().serialize(article)
103 | return post_data(data)
104 |
105 |
106 | if __name__ == '__main__':
107 | get_to_work()
108 |
--------------------------------------------------------------------------------
/externe/scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/externe/scraper/__init__.py
--------------------------------------------------------------------------------
/externe/scraper/article_serializer.py:
--------------------------------------------------------------------------------
1 | from utils import settings
2 |
3 |
4 | class ArticleSerializer:
5 | @staticmethod
6 | def serialize(article):
7 | return dict(
8 | # TODO
9 | identifier=article.identifier,
10 | title=article.title,
11 | type=article.article_type,
12 | institution=settings.INSTITUTION,
13 | date=article.published_at.isoformat(),
14 | description='N/A',
15 | feedback_days=article.feedback_days,
16 | contact=article.contact,
17 | documents=article.documents,
18 | )
19 |
20 | @staticmethod
21 | def is_valid(article):
22 | """Checks if an Article is valid, according to the API specs.
23 | :param article: The Article instance to validate
24 | :return: True or False
25 | """
26 | for field in settings.MANDATORY_FIELDS:
27 | if not getattr(article, field):
28 | return False
29 | return True
30 |
--------------------------------------------------------------------------------
/externe/scraper/extractor.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup as beautiful_soup
3 |
4 | import utils.settings as settings
5 | from scraper.article import Article
6 |
7 |
8 | class Extractor:
9 | """Extractor object, responsible for fetching data from the MAE website.
10 | """
11 | url = None
12 | content = None
13 | articles = None
14 |
15 | def __init__(self, url):
16 | self.url = url
17 | self.content = self._fetch_page()
18 |
19 | def get_all_articles(self):
20 | """Generates a list of all Article objects fetched from MAE.
21 | :return: the list of Articles
22 | """
23 | self.articles = [Article(table) for table in self._get_tables()]
24 | return self.articles
25 |
26 | def get_article_by_id(self, identifier):
27 | """Returns the article matching the given identifier.
28 | :param identifier: the id
29 | :return: the matching Article, or None
30 | """
31 | if not self.articles:
32 | self.get_all_articles()
33 |
34 | for a in self.articles:
35 | if a.identifier == identifier:
36 | return a
37 |
38 | def get_identifier_list(self):
39 | """Extracts a list of identifiers of the latest articles.
40 | :return: list
41 | """
42 | latest = []
43 | for table in self._get_tables():
44 | tr = table.select('tr')
45 | article = Article()
46 | article._extract_article_type(tr)
47 | article._extract_title(tr)
48 | article._generate_id()
49 | latest.append(article.identifier)
50 | return latest
51 |
52 | def _fetch_page(self):
53 | page = requests.get(self.url, headers=settings.HEADERS)
54 | return beautiful_soup(page.text, 'html.parser')
55 |
56 | def _get_tables(self):
57 | return self.content.select_one('div.art').select('table')
58 |
--------------------------------------------------------------------------------
/externe/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | import re
3 | from setuptools import setup
4 |
5 | install_requires = [
6 | 'beautifulsoup4',
7 | 'requests',
8 | 'click',
9 | 'lxml'
10 | ]
11 |
12 | version_regex = re.compile(r"VERSION\s*=\s*'(.*?)'$")
13 |
14 | with open('__init__.py') as stream:
15 | VERSION = version_regex.search(stream.read()).group(1)
16 |
17 | setup(
18 | version=VERSION,
19 | name='mae-scraper',
20 | url='https://github.com/code4romania/czl-scrape/tree/master/externe',
21 | author='Rares Urdea, Alexandru Hodorogea',
22 | author_email='contact@code4.ro',
23 | description='Scraper pentru site-ul Ministerului de Afaceri Externe',
24 | install_requires=install_requires,
25 | )
26 |
--------------------------------------------------------------------------------
/externe/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/externe/utils/__init__.py
--------------------------------------------------------------------------------
/externe/utils/api_client.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | import time
4 |
5 | from utils.settings import *
6 |
7 |
8 | def post_data(data):
9 | attempts = 5
10 | success = False
11 |
12 | while not success and attempts > 0:
13 | attempts -= 1
14 | response = requests.post(URLS['api-publications'], data, headers=HEADERS)
15 |
16 | if _already_exists(response):
17 | logging.warning(
18 | 'Object: %s \nalready exists, according to API. Skipping.', data
19 | )
20 | break
21 |
22 | success = response.status_code == STATUS_CREATED
23 | if success:
24 | break
25 | time.sleep(30)
26 |
27 | if not success:
28 | logging.error('Failed to POST data to API: %s', data)
29 |
30 | return success
31 |
32 |
33 | def _already_exists(response):
34 | return response.status_code == STATUS_BAD_REQUEST \
35 | and ALREADY_EXISTS in response.text.lower()
36 |
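A minimal usage sketch (not part of the original module): how a scraper might hand a publication to `post_data`, using the publications API payload format seen elsewhere in this repo. `INSTITUTION` and `TYPES` come from `utils/settings.py`; the identifier, title and date below are invented for illustration.

```python
from utils import api_client
from utils.settings import INSTITUTION, TYPES

# Illustrative payload only; the real scraper builds this from parsed articles.
publication = {
    'identifier': 'externe-hg-0001',
    'title': 'Hotarare privind ...',
    'institution': INSTITUTION,
    'description': '',
    'type': TYPES['HOTARARE'],
    'date': '2017-03-25',   # DATE_FMT = '%Y-%m-%d'
    'documents': [],
}

if not api_client.post_data(publication):
    # post_data already logged the failure; callers decide whether to retry later.
    pass
```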
--------------------------------------------------------------------------------
/externe/utils/lang.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | class LangHelper(object):
3 | FUCK_NO = [
4 | # new line
5 | '\n',
6 | # tab
7 | '\t',
8 | # non-breaking space
9 | '\xa0',
10 | # 0 width space
11 | '\u200b'
12 | ]
13 |
14 | @staticmethod
15 | def englishize_romanian(string):
16 | symbols = (u"țţȚŢșşȘŞăĂîÎâÂ",
17 | u"ttTTssSSaAiIaA")
18 |
19 | tr = {ord(a): ord(b) for a, b in zip(*symbols)}
20 | return string.translate(tr)
21 |
22 | @staticmethod
23 | def beautify_romanian(string):
24 | symbols = (u"ţşŢŞ",
25 | u"țșȚȘ")
26 | tr = {ord(a): ord(b) for a, b in zip(*symbols)}
27 | return string.translate(tr)
28 |
29 | @staticmethod
30 | def sanitize(string):
31 | """Sanitize a string.
32 | Removes newlines, tabs, non-breaking spaces and zero-width spaces.
33 |
34 | :param string: The string to sanitize.
35 | :return: A clean string.
36 | """
37 | if string:
38 | for this_little_shit in LangHelper.FUCK_NO:
39 | string = string.replace(this_little_shit, '')
40 | return string
41 |
--------------------------------------------------------------------------------
/externe/utils/settings.py:
--------------------------------------------------------------------------------
1 | WAIT = {
2 | '1_sec': 1,
3 | '0.5_sec': 0.5
4 | }
5 |
6 | HEADERS = {
7 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
8 | 'AppleWebKit/537.36 (KHTML, like Gecko) '
9 | 'Chrome/39.0.2171.95 Safari/537.36',
10 | 'Authorization': 'Token externe-very-secret-key'
11 | }
12 |
13 | SCRAPER_PAGES = [
14 | 'arhiva-1415',
15 | 'arhiva-2016',
16 | 'feed'
17 | ]
18 |
19 | # The keys linking to MAE pages need to match the items in SCRAPER_PAGES
20 | URLS = {
21 | 'mae_base': 'http://www.mae.ro',
22 | 'feed': 'https://www.mae.ro/node/2011#null',
23 | 'arhiva-2016': 'http://www.mae.ro/node/40248',
24 | 'arhiva-1415': 'http://www.mae.ro/node/35609',
25 | 'api-publications': 'http://czl-api.code4.ro/api/publications/'
26 | }
27 |
28 | STATUS_CREATED = 201
29 | STATUS_BAD_REQUEST = 400
30 | ALREADY_EXISTS = 'already exists'
31 |
32 | TYPES = {
33 | 'HOTARARE': 'HG',
34 | 'ORDONANTA': 'OG',
35 | 'ORDONANTA DE URGENTA': 'OUG',
36 | 'ORDINUL MINISTRULUI AFACERILOR EXTERNE': 'OM',
37 | 'ORDIN': 'OM',
38 | 'PROIECT DE LEGE': 'LEGE',
39 | 'LEGE': 'LEGE',
40 | 'OTHER': 'OTHER'
41 | }
42 |
43 | MONTHS = dict(
44 | ianuarie='01',
45 | februarie='02',
46 | martie='03',
47 | aprilie='04',
48 | mai='05',
49 | iunie='06',
50 | iulie='07',
51 | august='08',
52 | septembrie='09',
53 | octombrie='10',
54 | noiembrie='11',
55 | decembrie='12'
56 | )
57 |
58 | CLICK_HELPER = {
59 |
60 | 'log-level': '\b Sets the logging level. Available values: ERROR, WARNING, INFO, DEBUG.',
61 | 'page': """
62 | \b Selects the page to scrape. Available options are:
63 | \b feed - scrapes the latest articles and then falls back to observer mode
64 | ____________________________________________________
65 | \b arhiva-2016 - scrapes the 2016 archive and then switches to observer mode
66 | ____________________________________________________
67 | \b arhiva-1415 - scrapes the 2014-2015 archive and then switches
68 | to observer mode
69 | ____________________________________________________
70 | """,
71 | 'observer': 'Periodically checks for changes and scrapes them if available. '
72 | 'NOTE: in observer mode, all other arguments are ignored.',
73 | 'delay': 'Number of hours to wait before checking for changes. Default=1'
74 | }
75 |
76 | LOG_LEVELS = {
77 | 'ERROR': 40,
78 | 'WARNING': 30,
79 | 'INFO': 20,
80 | 'DEBUG': 10
81 | }
82 |
83 | LOG_FILE = 'logs/scraper.log'
84 | LOGS_DIR = '/logs'
85 |
86 | INSTITUTION = 'externe'
87 |
88 | MANDATORY_FIELDS = ['identifier', 'title', 'published_at', 'article_type']
89 |
90 | DATE_FMT = '%Y-%m-%d'
91 |
92 |
93 | def hours_to_sec(hours):
94 | return hours * 3600
95 |
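The `CLICK_HELPER`, `LOG_LEVELS`, `SCRAPER_PAGES` and `hours_to_sec` values above are meant to be consumed by a command-line entry point that is not shown in this section. Purely as a hedged illustration of how they could be wired together with `click` (which is listed in `setup.py`): the option names map onto the `CLICK_HELPER` keys, and `run_scraper` is an invented name, not the project's actual entry point.

```python
import logging
import time

import click

from utils.settings import CLICK_HELPER, LOG_LEVELS, SCRAPER_PAGES, hours_to_sec


@click.command()
@click.option('--log-level', default='INFO', help=CLICK_HELPER['log-level'])
@click.option('--page', type=click.Choice(SCRAPER_PAGES), default='feed', help=CLICK_HELPER['page'])
@click.option('--observer', is_flag=True, help=CLICK_HELPER['observer'])
@click.option('--delay', default=1, help=CLICK_HELPER['delay'])
def run_scraper(log_level, page, observer, delay):
    logging.basicConfig(level=LOG_LEVELS[log_level])
    while True:
        # fetch and POST publications for `page` here (placeholder)
        if not observer:
            break
        time.sleep(hours_to_sec(delay))


if __name__ == '__main__':
    run_scraper()
```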
--------------------------------------------------------------------------------
/finantepub/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 |
--------------------------------------------------------------------------------
/finantepub/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Finanţelor Publice
2 |
3 | ## Tehnologie
4 | NodeJS, [Nightmare](http://www.nightmarejs.org)
5 |
6 | ## Instructiuni
7 | ```
8 | npm install
9 | API_TOKEN=the_secret_api_token npm start
10 | ```
11 |
12 | ## Exceptii
13 |
--------------------------------------------------------------------------------
/finantepub/index.js:
--------------------------------------------------------------------------------
1 | const sha256 = require('sha256');
2 | const rp = require('request-promise');
3 | const Nightmare = require('nightmare');
4 | const nightmare = Nightmare({ show: false, typeInterval: 2, waitTimeout: 5000 });
5 |
6 | const YEAR_THRESHOLD = 2017;
7 |
8 | const API_TOKEN = process.env['API_TOKEN'];
9 |
10 | function guessType(text) {
11 | text = text.toLowerCase().trim();
12 | text = text.replace(/^proiect\s*/, '');
13 | if(text.match(/^ordonanță de urgență/)) return 'OUG';
14 | if(text.match(/^lege/)) return 'LEGE';
15 | if(text.match(/^ordin/)) return 'OM';
16 | if(text.match(/^hotărâre/)) return 'HG';
17 | throw new Error(`failz: ${text}`);
18 | }
19 |
20 | function parsePage(page = 1) {
21 | nightmare
22 | .cookies.clear()
23 | .useragent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.${Math.round(Math.random()*100)}`)
24 | .goto(`http://www.mfinante.gov.ro/transparent.html?method=transparenta&pagina=acasa&locale=ro&d-6834775-p=${page}`)
25 | .wait("#transparentaList")
26 | .evaluate(()=> {
27 | if(document.querySelector('#transparentaList').innerText.trim() == '') return;
28 | let itemsList = [], items = [... document.querySelectorAll('#transparentaList > tbody > tr ')];
29 | for (let item of items) {
30 | let text = item.innerText;
31 | let match = text.replace(/\s+/g, ' ').match(
32 | /(.*?)\s*- publicat în data de\s*(\d{2})\.(\d{2})\.(\d{4})/);
33 |
34 | if(! match) {
35 | throw new Error(`Can't match title and date in text: "${text}"`);
36 | }
37 |
38 | let documents = []
39 | let links = item.querySelectorAll('a.downlPDF');
40 | for (let doc of links) {
41 | documents.push({
42 | type: 'act',
43 | url: doc.href
44 | });
45 | }
46 |
47 | let returnObj = {
48 | title: match[1],
49 | date: `${match[4]}-${match[3]}-${match[2]}`,
50 | documents: documents,
51 | label: links[0].innerText
52 | };
53 |
54 | itemsList.push(returnObj);
55 | }
56 | return itemsList;
57 | })
58 | .then((result) => {
59 |
60 | if(! result) {
61 | console.log("halt!");
62 | nightmare.halt();
63 | return;
64 | }
65 |
66 | let itemsList = [];
67 |
68 | for(let val of result) {
69 | let year = val.date.split('-')[0]
70 | if (year < YEAR_THRESHOLD) {
71 | console.log("halt!");
72 | nightmare.halt();
73 | return;
74 | }
75 |
76 | val.identifier = sha256(val.documents[0].url);
77 | val.institution = 'finantepub';
78 | val.description = '';
79 | val.type = guessType(val.label);
80 | delete val.label;
81 | itemsList.push(val);
82 | }
83 |
84 | function postAllItems(remaining) {
85 | if(! remaining.length) return;
86 | let val = remaining[0];
87 | return rp.post({
88 | url: 'http://czl-api.code4.ro/api/publications/',
89 | headers: {Authorization: `Token ${API_TOKEN}`},
90 | json: val
91 | })
92 | .then(() => {
93 | console.log('posted item: ', val.identifier);
94 | return postAllItems(remaining.slice(1));
95 | });
96 | }
97 |
98 | return postAllItems(itemsList);
99 |
100 | })
101 | .then(() => {
102 | parsePage(page + 1);
103 | })
104 | .catch((error) => {
105 | console.error('error:', error);
106 | nightmare.halt();
107 | });
108 | }
109 |
110 | parsePage();
111 |
--------------------------------------------------------------------------------
/finantepub/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "finantepub",
3 | "version": "1.0.0",
4 | "description": "## Tehnologie",
5 | "main": "index.js",
6 | "scripts": {
7 | "start": "node index.js"
8 | },
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/mgax/czl-scrape.git"
12 | },
13 | "author": "ciprian chichirita, alex morega",
14 | "license": "MIT",
15 | "devDependencies": {
16 | "nightmare": "^2.10.0",
17 | "request-promise": "^4.1.1",
18 | "sha256": "^0.2.0"
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/interne/.editorconfig:
--------------------------------------------------------------------------------
1 | [*]
2 | charset=utf-8
3 | end_of_line=crlf
4 | insert_final_newline=false
5 | indent_style=space
6 | indent_size=4
7 |
8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}]
9 | indent_style=space
10 | indent_size=2
11 |
12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}]
13 | indent_style=space
14 | indent_size=2
15 |
16 | [{*.applejs,*.js}]
17 | indent_style=space
18 | indent_size=4
19 |
20 | [{.analysis_options,*.yml,*.yaml}]
21 | indent_style=space
22 | indent_size=2
23 |
24 |
--------------------------------------------------------------------------------
/interne/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | node_modules
3 | secrets.json
4 | data.json
--------------------------------------------------------------------------------
/interne/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Afacerilor Interne
2 |
3 | ## Tehnologie
4 |
5 | ## Instructiuni
6 |
7 | ## Exceptii
--------------------------------------------------------------------------------
/interne/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pretutindeni",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "app.js",
6 | "scripts": {
7 | "crawl": "node app.js"
8 | },
9 | "author": "",
10 | "license": "ISC",
11 | "dependencies": {
12 | "cheerio": "0.22.0",
13 | "diacritics": "1.3.0",
14 | "jsonfile": "2.4.0",
15 | "nightmare": "2.10.0",
16 | "nodemon": "1.11.0",
17 | "q": "1.4.1",
18 | "request": "2.81.0",
19 | "yargs": "7.0.2"
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/interne/secrets.json.txt:
--------------------------------------------------------------------------------
1 | {
2 | "TOKEN": "something something",
3 | "API_URL": "http://something.com/api/post-parsed-results"
4 | }
--------------------------------------------------------------------------------
/justitie/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .scrapy
3 |
4 | *.pyc
5 | __pycache__
6 | **/__pycache__
7 |
--------------------------------------------------------------------------------
/justitie/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Justiţiei
2 |
3 | ## Tehnologie
4 |
5 | *Python 3.6*; [virtualenv](https://virtualenv.pypa.io/) is a good friend
6 | [Scrapy](https://scrapy.org/)
7 |
8 | ```
9 | pip install -r requirements.txt
10 |
11 | # on windows:
12 | pip install pypiwin32
13 | ```
14 |
15 | ## Instructiuni
16 |
17 | ```
18 | scrapy crawl publication
19 | ```
20 |
21 | ## Altele
22 |
23 | Data understanding & values
24 | * [online](https://etherpad.net/p/hackajust)
25 | * see doc folder
26 |
--------------------------------------------------------------------------------
/justitie/just/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/justitie/just/__init__.py
--------------------------------------------------------------------------------
/justitie/just/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | class JustPublication(scrapy.Item):
11 | # define the fields for your item here like:
12 | # name = scrapy.Field()
13 | identifier = scrapy.Field()
14 | title = scrapy.Field()
15 | type = scrapy.Field()
16 | institution = scrapy.Field()
17 | date = scrapy.Field()
18 | description = scrapy.Field()
19 | feedback_days = scrapy.Field()
20 | contact = scrapy.Field()
21 | documents = scrapy.Field()
22 |
23 | pass
24 |
25 |
--------------------------------------------------------------------------------
/justitie/just/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class JustSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/justitie/just/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import requests
9 | import json
10 | import logging
11 |
12 | from just.items import JustPublication
13 |
14 |
15 | API_KEY = 'justitie-very-secret-key'
16 | API_PUBLICATIONS = 'http://czl-api.code4.ro/api/publications/'
17 |
18 | class JustPublicationsToApiPipeline(object):
19 | def process_item(self, item, spider):
20 |
21 | if not isinstance(item, JustPublication):
22 | return item
23 |
24 | r = requests.post(API_PUBLICATIONS, json=dict(item), headers={'Authorization': 'Token %s' % (API_KEY,) } )
25 |
26 |
27 | if r.status_code in (200, 201):
28 | logging.log(msg=r.status_code, level=logging.INFO)
29 | else:
30 | logging.log(msg=r.status_code, level=logging.ERROR)
31 | logging.log(msg=r.content, level=logging.INFO)
32 |
33 | return item
34 |
--------------------------------------------------------------------------------
/justitie/just/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for just project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'just'
13 |
14 | SPIDER_MODULES = ['just.spiders']
15 | NEWSPIDER_MODULE = 'just.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | USER_AGENT = 'code4romania (+http://www.code4.ro)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = True
22 |
23 | LOG_ENABLED = True
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | DOWNLOAD_DELAY = 5
32 | # The download delay setting will honor only one of:
33 | CONCURRENT_REQUESTS_PER_DOMAIN = 1
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | #DEFAULT_REQUEST_HEADERS = {
44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | # 'Accept-Language': 'en',
46 | #}
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | #SPIDER_MIDDLEWARES = {
51 | # 'just.middlewares.JustSpiderMiddleware': 543,
52 | #}
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'just.middlewares.MyCustomDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'just.pipelines.JustPublicationsToApiPipeline': 100,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | # AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | # AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | # AUTOTHROTTLE_MAX_DELAY = 30
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
82 | # Enable showing throttling stats for every response received:
83 | # AUTOTHROTTLE_DEBUG = True
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | HTTPCACHE_ENABLED = True
88 | HTTPCACHE_EXPIRATION_SECS = 30
89 | HTTPCACHE_DIR = 'httpcache'
90 | HTTPCACHE_IGNORE_HTTP_CODES = []
91 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/justitie/just/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/justitie/requirements.txt:
--------------------------------------------------------------------------------
1 | appdirs==1.4.3
2 | asn1crypto==0.21.1
3 | attrs==16.3.0
4 | Automat==0.5.0
5 | cffi==1.9.1
6 | constantly==15.1.0
7 | convertdate==2.1.0
8 | cryptography==1.8.1
9 | cssselect==1.0.1
10 | ephem==3.7.6.0
11 | idna==2.5
12 | incremental==16.10.1
13 | jdatetime==1.8.2
14 | lxml==3.7.3
15 | packaging==16.8
16 | parsel==1.1.0
17 | pyasn1==0.2.3
18 | pyasn1-modules==0.0.8
19 | pycparser==2.17
20 | PyDispatcher==2.0.5
21 | pyOpenSSL==17.5.0
22 | pyparsing==2.2.0
23 | pytz==2016.10
24 | queuelib==1.4.2
25 | regex==2017.2.8
26 | ruamel.yaml==0.13.14
27 | Scrapy==1.3.3
28 | service-identity==16.0.0
29 | six==1.10.0
30 | Twisted==19.7.0
31 | tzlocal==1.3
32 | umalqurra==0.2
33 | Unidecode==0.4.20
34 | w3lib==1.17.0
35 | zope.interface==4.3.3
36 |
--------------------------------------------------------------------------------
/justitie/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = just.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = just
12 |
--------------------------------------------------------------------------------
/mediu/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | .DS_Store
4 | *.egg-info
5 | build
6 | *.pyc
7 | **/*.pyc
8 | dbs
9 |
--------------------------------------------------------------------------------
/mediu/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Mediului
2 |
3 | ## Tehnologie
4 |
5 | *Python 2.7*
6 | [Scrapy 1.3.3](https://scrapy.org/)
7 |
8 | ## Instructiuni
9 |
10 | ```
11 | pip install -r requirements.txt
12 | cd crawl_mediu
13 | scrapy crawl mmediu -a token=xxxx
14 | ```
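The `-a token=xxxx` argument above ends up as an attribute on the spider: Scrapy passes `-a` options as keyword arguments to the spider's constructor, and the default `Spider.__init__` stores them as attributes, which is how `CrawlMediuPipeline` later reads `spider.token`. A minimal sketch of the mechanism (the class below is illustrative, not the actual `mmediu` spider):

```python
import scrapy

class TokenAwareSpider(scrapy.Spider):
    name = 'mmediu'

    def __init__(self, token=None, *args, **kwargs):
        # `scrapy crawl mmediu -a token=xxxx` calls __init__(token='xxxx').
        super(TokenAwareSpider, self).__init__(*args, **kwargs)
        self.token = token
```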
15 |
16 |
17 | ## Exceptii
--------------------------------------------------------------------------------
/mediu/crawl_mediu/crawl_mediu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/mediu/crawl_mediu/crawl_mediu/__init__.py
--------------------------------------------------------------------------------
/mediu/crawl_mediu/crawl_mediu/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class CrawlMediuItem(scrapy.Item):
12 | identifier = scrapy.Field()
13 | title = scrapy.Field()
14 | type = scrapy.Field()
15 | institution = scrapy.Field()
16 |
17 | date = scrapy.Field()
18 | description = scrapy.Field()
19 | feedback_days = scrapy.Field()
20 | contact = scrapy.Field()
21 | tel = scrapy.Field()
22 | email = scrapy.Field()
23 | documents = scrapy.Field()
--------------------------------------------------------------------------------
/mediu/crawl_mediu/crawl_mediu/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class CrawlMediuSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/mediu/crawl_mediu/crawl_mediu/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import requests
8 |
9 | class CrawlMediuPipeline(object):
10 | def process_item(self, item, spider):
11 | doc = {
12 | 'identifier': item['identifier'],
13 | 'title': item['title'],
14 | 'institution': item['institution'],
15 | 'description': item['description'],
16 | 'type': item['type'],
17 | 'date': item['date'],
18 | 'documents': item['documents'],
19 | 'contact':item['contact'],
20 | 'feedback_days': item['feedback_days']
21 | }
22 |
23 | response = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token ' + spider.token }, json=doc)
24 | # print '---------'
25 | # print response
26 | # print response.text
27 | # print '---------'
28 | return item
29 |
30 |
--------------------------------------------------------------------------------
/mediu/crawl_mediu/crawl_mediu/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for crawl_mediu project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'crawl_mediu'
13 |
14 | SPIDER_MODULES = ['crawl_mediu.spiders']
15 | NEWSPIDER_MODULE = 'crawl_mediu.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'crawl_mediu (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'crawl_mediu.middlewares.CrawlMediuSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'crawl_mediu.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'crawl_mediu.pipelines.CrawlMediuPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/mediu/crawl_mediu/crawl_mediu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/mediu/crawl_mediu/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = crawl_mediu.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawl_mediu
12 |
--------------------------------------------------------------------------------
/mediu/requirements.txt:
--------------------------------------------------------------------------------
1 | appdirs==1.4.3
2 | appnope==0.1.0
3 | asn1crypto==0.21.1
4 | attrs==16.3.0
5 | Automat==0.5.0
6 | backports.shutil-get-terminal-size==1.0.0
7 | beautifulsoup4==4.5.3
8 | cffi==1.9.1
9 | constantly==15.1.0
10 | cryptography==1.8.1
11 | cssselect==1.0.1
12 | decorator==4.0.11
13 | enum34==1.1.6
14 | idna==2.5
15 | incremental==16.10.1
16 | ipaddress==1.0.18
17 | ipython==5.3.0
18 | ipython-genutils==0.1.0
19 | lxml==3.7.3
20 | packaging==16.8
21 | parsel==1.1.0
22 | pathlib2==2.2.1
23 | pexpect==4.2.1
24 | pickleshare==0.7.4
25 | prompt-toolkit==1.0.13
26 | ptyprocess==0.5.1
27 | pyasn1==0.2.3
28 | pyasn1-modules==0.0.8
29 | pycparser==2.17
30 | PyDispatcher==2.0.5
31 | Pygments==2.2.0
32 | pyOpenSSL==16.2.0
33 | pyparsing==2.2.0
34 | queuelib==1.4.2
35 | requests==2.13.0
36 | scandir==1.5
37 | Scrapy==1.3.3
38 | service-identity==16.0.0
39 | simplegeneric==0.8.1
40 | six==1.10.0
41 | slugify==0.0.1
42 | traitlets==4.3.2
43 | Twisted==17.1.0
44 | w3lib==1.17.0
45 | wcwidth==0.1.7
46 | zope.interface==4.3.3
47 |
--------------------------------------------------------------------------------
/presedinte/README.md:
--------------------------------------------------------------------------------
1 | # Presedintia
2 |
3 | ## Tehnologie
4 |
5 | ## Instructiuni
6 |
7 | ## Exceptii
--------------------------------------------------------------------------------
/pretutindeni/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | node_modules
3 | parseProject.js
--------------------------------------------------------------------------------
/pretutindeni/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul pentru Românii de Pretutindeni
2 |
3 | 1. http://www.dprp.gov.ro/documente-in-consultare-publica/
4 |
5 | ## Tehnologie
6 |
7 | 1. nodejs - https://nodejs.org/en/
8 | 2. nightmarejs - https://github.com/segmentio/nightmare
9 |
10 | ## Instructiuni
11 |
12 | ## Exceptii
13 |
14 | The people updating this page are the exception themselves: paragraphs with no structure, simply dropped onto the page. Very difficult to parse.
--------------------------------------------------------------------------------
/pretutindeni/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pretutindeni",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "app.js",
6 | "scripts": {
7 | "crawl": "node app.js"
8 | },
9 | "author": "",
10 | "license": "ISC",
11 | "dependencies": {
12 | "cheerio": "0.22.0",
13 | "nightmare": "2.10.0"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/pretutindeni/parseProject.example:
--------------------------------------------------------------------------------
1 | var cheerio = require('cheerio')
2 |
3 | module.exports = function(project) {
4 | "use strict";
5 |
6 | console.log(project);
7 | };
--------------------------------------------------------------------------------
/relparlament/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul pentru Relaţia cu Parlamentul
2 |
3 | ## Tehnologie
4 |
5 | ## Instructiuni
6 |
7 | ## Exceptii
--------------------------------------------------------------------------------
/relparlament/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "relparlament",
3 | "version": "1.0.0",
4 | "description": "## Tehnologie",
5 | "main": "index.js",
6 | "scripts": {
7 | "start": "node index.js",
8 | "test": "echo \"Error: no test specified\" && exit 1"
9 | },
10 | "author": "Mihnea Beldescu",
11 | "license": "ISC",
12 | "dependencies": {
13 | "cheerio": "^0.22.0",
14 | "lokijs": "^1.4.3",
15 | "nightmare": "^2.10.0",
16 | "request": "^2.81.0"
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/sanatate/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 |
--------------------------------------------------------------------------------
/sanatate/README.md:
--------------------------------------------------------------------------------
1 | # Ministerul Sănătăţii
2 | A simple, homespun crawler built with Scrapy. Its Romanian is not great, but it understands anyway (it does fuzzy matching on titles to extract the type of normative act).
3 | ## Tehnologie
4 | - python3, pip
5 | - scrapy, fuzzywuzzy, urllib3
6 | - python-Levenshtein [opțional]
7 |
8 | ## Instructiuni
9 | Fill in _credentials.json_, then run the classic _pip install -r requirements.txt_ followed by a classic _scrapy crawl sanatate_.
10 | ## Exceptii
11 | Detecting the type of normative act is not perfect, and neither is detecting document types. That is a larger problem, and it makes no sense to address it in a single crawler only.
12 |
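A rough sketch of the fuzzy-matching idea mentioned above, assuming `fuzzywuzzy` (the real implementation, including diacritic normalization, is `LegalHelper.get_type_from_title` in `scrapy_proj/helpers/legal.py` below); the title is invented:

```python
import re

import fuzzywuzzy.fuzz as fuzz

TYPE_KEYWORDS = {
    'HG': 'hotarare',
    'OM': 'ordin',
    'LEGE': 'lege',
    'OG': 'ordonanta',
    'OUG': 'ordonanta de urgenta',
}

def guess_type(title):
    # Only compare against the part of the title before 'pentru'/'privind',
    # as the helper does, so the keyword dominates the similarity score.
    head = re.split(r'pentru|privind', title.lower())[0]
    return max(TYPE_KEYWORDS, key=lambda t: fuzz.ratio(TYPE_KEYWORDS[t], head))

print(guess_type('Hotarare privind aprobarea ...'))  # -> 'HG' (made-up title)
```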
--------------------------------------------------------------------------------
/sanatate/credentials.json:
--------------------------------------------------------------------------------
1 | {
2 | "endpoint": "http://czl-api.code4.ro/api/publications/",
3 | "authorization": "weeee"
4 | }
--------------------------------------------------------------------------------
/sanatate/requirements.txt:
--------------------------------------------------------------------------------
1 | fuzzywuzzy==0.15.0
2 | python-Levenshtein==0.12.0
3 | Scrapy==1.3.3
4 | urllib3==1.20
5 |
--------------------------------------------------------------------------------
/sanatate/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapy_proj.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapy_proj
12 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/sanatate/scrapy_proj/__init__.py
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy_proj.helpers.legal import *
4 | from scrapy_proj.helpers.romanian import *
5 | from scrapy_proj.helpers.text import *
6 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/helpers/legal.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 | import fuzzywuzzy.fuzz as fuzz
5 |
6 | from scrapy_proj.helpers.romanian import *
7 |
8 | class LegalHelper(object):
9 | @staticmethod
10 | def get_type_from_title(title):
11 | engrol = RomanianHelper.englishize_romanian(title).lower()
12 |
13 | stop_pos = len(title)
14 | magic_keyword_search_result = re.search(r'(pentru|privind)', engrol)
15 | if magic_keyword_search_result != None:
16 | stop_pos = magic_keyword_search_result.start()
17 |
18 | search_space = engrol[:stop_pos]
19 |
20 | type_to_keywords = {
21 | 'HG': 'hotarare',
22 | 'OM': 'ordin',
23 | 'LEGE': 'lege',
24 | 'OG': 'ordonanta',
25 | 'OUG': 'ordonanta de urgenta'
26 | }
27 |
28 | final_type = None
29 | max_ratio = 0
30 |
31 | for key in type_to_keywords:
32 | ratio = fuzz.ratio(type_to_keywords[key], search_space)
33 | if ratio > max_ratio:
34 | max_ratio = ratio
35 | final_type = key
36 |
37 | return final_type
38 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/helpers/romanian.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class RomanianHelper(object):
4 | @staticmethod
5 | def englishize_romanian(string):
6 | symbols = (u"țţȚŢșşȘŞăǎĂîÎâÂ",
7 | u"ttTTssSSaaAiIaA")
8 |
9 | tr = {ord(a):ord(b) for a, b in zip(*symbols)}
10 |
11 | return string.translate(tr)
12 |
13 | @staticmethod
14 | def beautify_romanian(string):
15 | symbols = (u"ǎţşŢŞ",
16 | u"ățșȚȘ")
17 | tr = {ord(a):ord(b) for a, b in zip(*symbols)}
18 | return string.translate(tr)
19 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/helpers/text.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | class TextHelper(object):
6 |
7 | @staticmethod
8 | def remove_non_ascii(string):
9 | return re.sub(r'[^\x00-\x7F]+', ' ', string)
10 |
11 | @staticmethod
12 | def remove_non_numeric(string):
13 | return re.sub('[^0-9]+', '', string)
14 |
15 | @staticmethod
16 | def rws(str):
17 | if str:
18 | return ' '.join(str.split())
19 | else:
20 | return None
21 |
22 | @staticmethod
23 | def titleize(string):
24 | if string:
25 | return string.title()
26 | else:
27 | return None
28 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/items/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy_proj.items.act import *
4 | from scrapy_proj.items.contact import *
5 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/items/act.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import scrapy
4 |
5 | class ActItem(scrapy.Item):
6 | identifier = scrapy.Field()
7 | title = scrapy.Field(serializer=str)
8 | type = scrapy.Field()
9 | institution = scrapy.Field()
10 | date = scrapy.Field()
11 | description = scrapy.Field()
12 | feedback_days = scrapy.Field(serializer=int)
13 | contact = scrapy.Field()
14 | documents = scrapy.Field()
15 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/items/contact.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import scrapy
4 |
5 | class ContactItem(scrapy.Item):
6 | tel = scrapy.Field(serializer=str)
7 | email = scrapy.Field(serializer=str)
8 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy_proj.loaders.act import *
4 | from scrapy_proj.loaders.contact import *
5 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/loaders/act.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy.loader import ItemLoader
4 | from scrapy_proj.helpers import *
5 | from scrapy.loader.processors import *
6 | from datetime import datetime as dt
7 |
8 | class ActLoader(ItemLoader):
9 | default_output_processor = TakeFirst()
10 | title_in = MapCompose(TextHelper.rws, RomanianHelper.beautify_romanian)
11 | contact_in = Compose(TakeFirst(), lambda x: dict(x))
12 | date_in = MapCompose(lambda d: dt.strptime(d, '%d-%m-%Y').strftime('%Y-%m-%d'))
13 | feedback_days_in = MapCompose(int)
14 | documents_in = Identity()
15 | documents_out = Identity()
16 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/loaders/contact.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy.loader import ItemLoader
4 | from scrapy_proj.helpers import *
5 | from scrapy.loader.processors import *
6 |
7 | class ContactLoader(ItemLoader):
8 | default_output_processor = TakeFirst()
9 | email_in = MapCompose(str.lower)
10 | tel_in = MapCompose(TextHelper.remove_non_numeric)
11 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy_proj.pipelines.extrameta import *
4 | from scrapy_proj.pipelines.post import *
5 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/pipelines/extrameta.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy.exceptions import DropItem
4 | import hashlib
5 |
6 | from scrapy_proj.helpers import *
7 |
8 | class SanatatePipelineExtraMeta(object):
9 | def process_item(self, item, spider):
10 | item['institution'] = spider.name
11 | act_type = LegalHelper.get_type_from_title(item['title'])
12 | if act_type is None:
13 | raise DropItem('could not determine publication type')
14 | item['type'] = act_type
15 | engrol = RomanianHelper.englishize_romanian(item['title']).lower()
16 | engrolna = TextHelper.remove_non_ascii(engrol)
17 | identifier_text = '{0} {1}'.format(engrolna, item['date'] if 'date' in item else 'NA')
18 | identifier_text_hashed = hashlib.md5(identifier_text.encode()).hexdigest()
19 | item['identifier'] = '{0}-{1}-{2}'.format(item['institution'], item['type'], identifier_text_hashed)
20 | return item
21 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/pipelines/post.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import urllib3
5 |
6 | class SanatatePipelinePost(object):
7 | def open_spider(self, spider):
8 | with open('credentials.json') as credentials_file:
9 | self.credentials = json.load(credentials_file)
10 | def process_item(self, item, spider):
11 | http = urllib3.PoolManager()
12 | r = http.request(
13 | 'POST',
14 | self.credentials['endpoint'],
15 | headers={
16 | 'Content-Type': 'application/json',
17 | 'Authorization': self.credentials['authorization']
18 | },
19 | body=json.dumps(dict(item))
20 | )
21 | return item
22 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for scrapy_proj project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'scrapy_proj'
13 |
14 | SPIDER_MODULES = ['scrapy_proj.spiders']
15 | NEWSPIDER_MODULE = 'scrapy_proj.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | #USER_AGENT = 'scrapy_proj (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = True
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | #CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | #DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'scrapy_proj.middlewares.ScrapyProjSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | #DOWNLOADER_MIDDLEWARES = {
55 | # 'scrapy_proj.middlewares.MyCustomDownloaderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'scrapy_proj.pipelines.SanatatePipelineExtraMeta': 298,
68 | 'scrapy_proj.pipelines.SanatatePipelinePost': 299,
69 | }
70 |
71 | LOG_LEVEL = 'WARNING'
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/sanatate/scrapy_proj/spiders/sanatate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import scrapy
4 | import scrapy_proj.items as items
5 | import scrapy_proj.loaders as loaders
6 | import re
7 | import sys
8 |
9 | class SanatateSpider(scrapy.Spider):
10 | name = 'sanatate'
11 |
12 | def start_requests(self):
13 | urls = [
14 | 'http://www.ms.ro/acte-normative-in-transparenta/?vpage=2',
15 | ]
16 |
17 | for url in urls:
18 | yield scrapy.Request(url=url, callback=self.parse)
19 |
20 | def parse(self, response):
21 | date_regex = re.compile(r'de\s+la\s+(\d{1,2}[-/]\d{2}[-/]\d{4})')
22 | email_regex = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
23 | tel_regex = re.compile(r'[^0-9](0(?:[0-9].?){9})')
24 | feedback_days_regex = re.compile(r'termen.*limita.*[^[0-9]]*([0-9]{1,2}).*zi')
25 |
26 | for item in response.css('.panel'):
27 | heading = item.css('div.panel-heading')
28 | body = item.css('div.panel-body')
29 | body_text = ''.join(body.xpath('.//text()').extract()).lower()
30 |
31 | title = item.css('a.panel-title::text').extract_first()
32 |
33 | loader = loaders.ActLoader(items.ActItem())
34 | loader.add_value('title', title)
35 |
36 | contact_loader = loaders.ContactLoader(items.ContactItem())
37 | contact_loader.add_value('tel', tel_regex.findall(body_text))
38 | contact_loader.add_value('email', email_regex.findall(body_text))
39 | loader.add_value('contact', contact_loader.load_item())
40 | loader.add_value('date', date_regex.findall(body_text))
41 | loader.add_value('feedback_days', feedback_days_regex.findall(body_text))
42 |
43 | keys = ['type', 'url']
44 | types = body.xpath('.//a[contains(@href, ".pdf")]').xpath('text()').extract()
45 | urls = body.xpath('.//a[contains(@href, ".pdf")]').xpath('@href').extract()
46 | docs = [[types[i], urls[i]] for i in range(len(types))]
47 | loader.add_value('documents', [dict(zip(keys, doc)) for doc in docs])
48 |
49 | yield loader.load_item()
50 |
51 | next_pages = response.css('.pt-cv-pagination a::attr(href)').extract()
52 | next_pages.reverse()
53 | for next_page in next_pages:
54 | next_page = response.urljoin(next_page)
55 | yield scrapy.Request(next_page, callback=self.parse)
56 |
--------------------------------------------------------------------------------
/scrapy/.gitignore:
--------------------------------------------------------------------------------
1 | /.cache
2 | *.pyc
3 |
--------------------------------------------------------------------------------
/scrapy/Readme.md:
--------------------------------------------------------------------------------
1 | # Scrapers written with scrapy
2 | 
3 | A collection of scrapers implemented with [scrapy](https://scrapy.org).
4 | Each institution has its own scraper, which downloads publications from that institution's site.
5 | The publications are then validated in a shared pipeline and sent to the
6 | [api](http://czl-api.code4.ro).
7 | 
8 | ## Implemented spiders
9 | * [`dialog`](czlscrape/spiders/dialog.py) - Ministerul Consultărilor Publice și
10 | Dialogului Social
11 | 
12 | ## Instructions
13 | * You need python3, preferably inside a
14 | [virtualenv](https://virtualenv.pypa.io).
15 | 
16 | * Install the dependencies:
17 | ```sh
18 | pip install -r requirements.txt
19 | ```
20 | 
21 | * Set the environment variables:
22 | ```sh
23 | export API_TOKEN='the secret token'
24 | export SENTRY_DSN='the sentry dsn' # optional
25 | ```
26 | 
27 | * Run one of the spiders:
28 | ```sh
29 | scrapy crawl dialog
30 | ```
31 | 
32 | * After making changes to the code, run the tests:
33 | ```sh
34 | pytest
35 | ```
36 |
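For orientation, a hedged sketch of the shape a new institution spider would take so that the shared pipelines accept its items: `PublicationValidatorPipeline` (in `czlscrape/pipelines.py`) requires `identifier`, `title`, `institution`, `description`, `type` and `date`, and `UploadPipeline` then POSTs each item to the API. The spider name, URL and field values below are placeholders, not an existing spider:

```python
import scrapy

from czlscrape.items import Publication


class ExampleSpider(scrapy.Spider):
    """Placeholder spider showing the fields the shared pipelines expect."""
    name = 'exemplu'
    start_urls = ['http://example.gov.ro/transparenta']  # placeholder URL

    def parse(self, response):
        yield Publication(
            institution='exemplu',
            identifier='exemplu-hg-0001',   # must be unique per publication
            type='HG',
            date='2017-03-25',
            title='Hotarare privind ...',
            description='...',
            documents=[{'type': 'act', 'url': response.url}],
            contact={'email': 'contact@example.gov.ro'},
            feedback_days=10,
        )
```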
--------------------------------------------------------------------------------
/scrapy/czlscrape/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import logging
4 |
5 | if 'SENTRY_DSN' in os.environ:
6 | import logging
7 | from raven.handlers.logging import SentryHandler
8 | from raven.conf import setup_logging
9 | setup_logging(SentryHandler(os.environ['SENTRY_DSN'], level=logging.WARN))
10 |
11 | logging.Formatter.converter = time.gmtime
12 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Publication(scrapy.Item):
12 | institution = scrapy.Field()
13 | identifier = scrapy.Field()
14 | type = scrapy.Field()
15 | date = scrapy.Field()
16 | title = scrapy.Field()
17 | description = scrapy.Field()
18 | documents = scrapy.Field()
19 | contact = scrapy.Field()
20 | feedback_days = scrapy.Field()
21 | max_feedback_date = scrapy.Field()
22 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class CzlScrapeSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import os
9 | import re
10 | import logging
11 | from scrapy.exceptions import DropItem
12 | import requests
13 |
14 | API_URL = 'http://czl-api.code4.ro/api/publications/'
15 | API_TOKEN = os.environ.get('API_TOKEN')
16 |
17 | logger = logging.getLogger(__name__)
18 | logger.setLevel(logging.WARN)
19 |
20 |
21 | class UploadPipeline(object):
22 | def process_item(self, item, spider):
23 | self.upload(item)
24 | return item
25 |
26 | def upload(self, item):
27 |         if not API_TOKEN:  # no token configured: dry run, just print the item
28 | print(item)
29 | return
30 |
31 | headers = {'Authorization': 'Token ' + API_TOKEN}
32 | resp = requests.post(API_URL, json=dict(item), headers=headers)
33 | if resp.status_code == 400:
34 | if re.search(r'Integrity Error: Key .* already exists', resp.text):
35 |                 return  # the API already has this publication; skip the duplicate
36 | if resp.status_code != 201:
37 | msg = "Failed to upload publication: {!r}".format(resp)
38 | raise RuntimeError(msg)
39 |
40 |
41 | class PublicationValidatorPipeline(object):
42 |
43 | REQUIRED_FIELDS = [
44 | 'identifier',
45 | 'title',
46 | 'institution',
47 | 'description',
48 | 'type',
49 | 'date',
50 | ]
51 |
52 | def process_item(self, item, spider):
53 | for field in self.REQUIRED_FIELDS:
54 | if not item.get(field):
55 | message = "Missing field {}".format(field)
56 |                 logger.warning(message)
57 | raise DropItem(message)
58 | return item
59 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for czlscrape project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'czlscrape'
13 |
14 | SPIDER_MODULES = ['czlscrape.spiders']
15 | NEWSPIDER_MODULE = 'czlscrape.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'czlscrape (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'czlscrape.middlewares.CzlScrapeSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'czlscrape.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'czlscrape.pipelines.PublicationValidatorPipeline': 300,
69 | 'czlscrape.pipelines.UploadPipeline': 1000,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | LOG_LEVEL = 'DEBUG'
94 | LOG_FORMAT = '%(asctime)s srv="czl-scrape" [%(thread)d] %(levelname)s %(name)s %(funcName)s: %(message)s'
95 | LOG_DATEFORMAT = '%Y-%m-%dT%H:%M:%SZ'
96 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/spiders/afaceri.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import re
3 | from ..items import Publication
4 |
5 | INDEX_URL = 'http://www.aippimm.ro/categorie/transparenta-decizionala---modificare-hg-96-2011/'
6 |
7 | def text_from(sel):
8 | return (sel.xpath('string(.)').extract_first() or "").strip()
9 |
10 | def guess_publication_type(text):
11 | text = text.lower()
12 | text = re.sub(r'[șş]', 's', text)
13 | text = re.sub(r'[țţ]', 't', text)
14 | text = re.sub(r'[ăâ]', 'a', text)
15 | text = re.sub(r'[î]', 'i', text)
16 | rules = [
17 | ("lege", "LEGE"),
18 | ("hotarare de guvern", "HG"),
19 | ("hotarare a guvernului", "HG"),
20 | ("hg", "HG"),
21 | ("ordonanta de guvern", "OG"),
22 | ("oug", "OUG"),
23 | ("ordonanta de urgenta", "OUG"),
24 | ("ordin de ministru", "OM"),
25 | ("ordinul", "OM"),
26 | ]
27 | for substr, publication_type in rules:
28 | if substr in text:
29 | return publication_type
30 | else:
31 | return "OTHER"
32 |
33 | class AfaceriSpider(scrapy.Spider):
34 |
35 | name = 'afaceri'
36 | start_urls = [INDEX_URL]
37 |
38 | def parse(self, response):
39 | for article in response.css('.article_container'):
40 | link = article.css('a.lead_subcat')
41 | title = text_from(link)
42 | if not title:
43 | continue
44 |
45 | date_match = re.search(
46 |             r'(?P<day>\d{2})\.(?P<month>\d{2})\.(?P<year>\d{4})$',
47 | text_from(article.css('ul.lead')),
48 | )
49 | date = "{year}-{month}-{day}".format(**date_match.groupdict())
50 |
51 | identifier = link.css('::attr(href)').extract_first().split('/')[-1]
52 | publication_type = guess_publication_type(title)
53 |
54 | documents = [
55 | {
56 | 'type': href.split('.')[-1],
57 | 'url': href,
58 | }
59 | for href in article.css('a.files::attr(href)').extract()
60 | ]
61 |
62 | yield Publication(
63 | identifier=identifier,
64 | title=title,
65 | institution='afaceri',
66 | description=title,
67 | type=publication_type,
68 | date=date,
69 | documents=documents,
70 | )
71 |
--------------------------------------------------------------------------------
/scrapy/czlscrape/spiders/dialog.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import re
3 |
4 | from czlscrape.utils import guess_initiative_type
5 | from ..items import Publication
6 |
7 | INDEX_URL = 'http://dialogsocial.gov.ro/categorie/proiecte-de-acte-normative/'
8 |
9 | DOC_EXTENSIONS = [
10 | ".docs", ".doc", ".txt", ".crt", ".xls",
11 | ".xml", ".pdf", ".docx", ".xlsx",
12 | ]
13 |
14 | TYPE_RULES = [
15 | ("lege", "LEGE"),
16 | ("hotarare de guvern", "HG"),
17 | ("hotarare a guvernului", "HG"),
18 | ("ordonanta de guvern", "OG"),
19 | ("ordonanta de urgenta", "OUG"),
20 | ("ordin de ministru", "OM"),
21 | ("ordinul", "OM"),
22 | ]
23 |
24 |
25 | def text_from(sel):
26 |     return (sel.xpath('string(.)').extract_first() or "").strip()
27 |
28 |
29 | class DialogSpider(scrapy.Spider):
30 |
31 | name = 'dialog'
32 | start_urls = [INDEX_URL]
33 |
34 | def parse(self, response):
35 | for article in response.css('#content article.post'):
36 | href = article.css('.entry-title a::attr(href)').extract_first()
37 | yield scrapy.Request(response.urljoin(href), self.parse_article)
38 |
39 | def parse_article(self, response):
40 | title = text_from(response.css('h1'))
41 | publication_type = guess_initiative_type(title, TYPE_RULES)
42 |
43 | article = response.css('#content article.post')[0]
44 |
45 | id_value = article.css('::attr(id)').extract_first()
46 | identifier = re.match(r'post-(\d+)', id_value).group(1)
47 |
48 | date = (
49 | article.css('time.entry-date::attr(datetime)')
50 | .extract_first()[:10]
51 | )
52 |
53 | # remove