├── .gitattributes ├── .gitignore ├── README.md ├── _commons-java ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── ro │ └── code4 │ └── czl │ └── scrape │ ├── client │ ├── ApiClient.java │ ├── ApiInvoker.java │ ├── AuthenticationStrategy.java │ ├── BaseRequest.java │ ├── BaseRequestBuilder.java │ ├── Credential.java │ ├── CzlApiUploadPipeline.java │ ├── CzlApiV1.java │ ├── CzlClient.java │ ├── CzlClientConfig.java │ ├── Request.java │ ├── RequestBuilder.java │ ├── Response.java │ ├── authentication │ │ └── TokenAuthenticationStrategy.java │ ├── core │ │ ├── CloseIdleConnectionsTask.java │ │ ├── IdleConnectionMonitor.java │ │ ├── JaxRsJacksonConfigurator.java │ │ ├── JaxRsResponse.java │ │ ├── JaxRsResponseDeserializationStrategy.java │ │ ├── JerseyClientApiInvoker.java │ │ └── LoggingFilter.java │ ├── model │ │ └── CreatePublicationRequest.java │ ├── representation │ │ ├── ContactRepresentation.java │ │ ├── DocumentRepresentation.java │ │ └── PublicationRepresentation.java │ └── samples │ │ └── CzlClientSample.java │ └── text │ ├── ProposalType.java │ └── RomanianMonth.java ├── _config.yml ├── afaceri ├── README.md ├── package.json └── server │ ├── boot │ ├── authentication.js │ └── root.js │ ├── component-config.json │ ├── config.json │ ├── config │ └── keywords.js │ ├── controllers │ └── contentParser.js │ ├── datasources.json │ ├── middleware.development.json │ ├── middleware.json │ ├── model-config.json │ └── server.js ├── agricultura ├── .gitignore ├── README.md ├── index.js └── package.json ├── aparare ├── README.md └── mapn_plugin.php ├── apepaduri └── README.md ├── cdep ├── README.md ├── requirements.in ├── requirements.txt └── scraper.py ├── cercetare ├── .editorconfig ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.js └── secrets.json.txt ├── dezvoltare ├── .gitignore ├── README.md ├── crawl_dezvoltare │ ├── crawl_dezvoltare │ │ ├── __init__.py │ │ ├── exporters.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── mdrap.py │ │ │ └── testing.py │ └── scrapy.cfg └── requirements.txt ├── economie ├── .editorconfig ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.js ├── secrets.json.txt └── yarn.lock ├── educatie ├── README.md ├── config.js ├── index.js └── package.json ├── energie ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── java │ └── Main.java │ └── resources │ └── logback.xml ├── externe ├── README.md ├── __init__.py ├── eusebiu.py ├── scraper │ ├── __init__.py │ ├── article.py │ ├── article_serializer.py │ └── extractor.py ├── setup.py └── utils │ ├── __init__.py │ ├── api_client.py │ ├── lang.py │ └── settings.py ├── finantepub ├── .gitignore ├── README.md ├── index.js └── package.json ├── interne ├── .editorconfig ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.js ├── secrets.json.txt └── yarn.lock ├── justitie ├── .gitignore ├── README.md ├── doc │ └── scraping.md ├── just │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── publications.py ├── requirements.txt └── scrapy.cfg ├── license ├── mediu ├── .gitignore ├── README.md ├── crawl_mediu │ ├── crawl_mediu │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── mmediu.py │ └── scrapy.cfg └── requirements.txt ├── pom.xml ├── presedinte └── README.md ├── 
pretutindeni ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.example └── yarn.lock ├── relparlament ├── README.md ├── index.js └── package.json ├── sanatate ├── .gitignore ├── README.md ├── credentials.json ├── requirements.txt ├── scrapy.cfg └── scrapy_proj │ ├── __init__.py │ ├── helpers │ ├── __init__.py │ ├── legal.py │ ├── romanian.py │ └── text.py │ ├── items │ ├── __init__.py │ ├── act.py │ └── contact.py │ ├── loaders │ ├── __init__.py │ ├── act.py │ └── contact.py │ ├── pipelines │ ├── __init__.py │ ├── extrameta.py │ └── post.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── sanatate.py ├── scrapy ├── .gitignore ├── Readme.md ├── czlscrape │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── afaceri.py │ │ ├── comunicatii.py │ │ ├── cultura.py │ │ ├── dialog.py │ │ ├── munca.py │ │ └── senat.py │ └── utils.py ├── requirements.in ├── requirements.txt ├── scrapy.cfg └── testsuite │ ├── conftest.py │ └── test_validator.py ├── sgg ├── README.md ├── requirements.txt └── sgg │ ├── run.py │ ├── scrapy.cfg │ └── sgg │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── sgg_spider.py ├── tineret ├── .gitignore ├── README.md ├── requirements.txt ├── scrapy.cfg └── tineret │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── tineret.py ├── transport ├── README.md ├── config.js ├── index.js └── package.json └── turism ├── README.md ├── out ├── production │ └── scraper │ │ └── com │ │ └── company │ │ ├── Main.class │ │ └── Scraper.class └── scraper.jar ├── out_files ├── Anexe │ ├── Anexa1.1.1.pdf │ ├── Anexa1.1.pdf │ ├── Anexa1.2.pdf │ ├── Anexa1.3.pdf │ ├── Anexa1.4.pdf │ ├── Anexa1.5.1.pdf │ ├── Anexa1.5.pdf │ ├── Anexa1.6.pdf │ ├── Anexa1.7.pdf │ ├── Anexa1.8.pdf │ ├── Anexa1.pdf │ ├── Anexa10.pdf │ ├── Anexa11.pdf │ ├── Anexa12.pdf │ ├── Anexa13.pdf │ ├── Anexa14.pdf │ ├── Anexa15.pdf │ ├── Anexa2.pdf │ ├── Anexa3.pdf │ ├── Anexa4.pdf │ ├── Anexa5.pdf │ ├── Anexa6.pdf │ ├── Anexa7.pdf │ ├── Anexa8.pdf │ ├── Anexa9.2.pdf │ ├── Anexa9.pdf │ ├── AnexaAP.pdf │ ├── Anexabrevet.pdf │ └── Anexacazare.pdf └── Proiecte │ ├── Ordin-criterii-participare-targuri-externe.pdf │ ├── Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf │ └── Proiect-ordin-modificare-Ordin-65.pdf └── src └── com └── company └── Main.java /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS files 2 | .DS_Store 3 | 4 | # Java files 5 | *.class 6 | 7 | # Log files 8 | *.log 9 | logs 10 | 11 | # Maven 12 | target 13 | pom.xml.versionsBackup 14 | 15 | # Mobile Tools for Java (J2ME) 16 | .mtj.tmp/ 17 | 18 | # Package Files 19 | *.jar 20 | *.war 21 | *.ear 22 | 23 | # IntelliJ IDEA 24 | *.iml 25 | .idea 26 | 27 | # Eclipse 28 | .project 29 | .settings 30 | .classpath 31 | test-output 32 | 33 | # Vim 34 | *.swp 35 | 36 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 37 | hs_err_pid* 38 | 39 | # Misc 40 | *git.properties 41 | 42 | # Python 43 | *.pyc 44 | 45 | # pyenv 46 | .python-version 47 | 48 | # Node 49 | node_modules/ 50 | -------------------------------------------------------------------------------- /_commons-java/.gitignore: -------------------------------------------------------------------------------- 1 | # OS files 2 | .DS_Store 3 | 4 | # Java files 5 | *.class 6 | 7 | # Log files 8 | *.log 9 | logs 10 | 11 | # Maven 12 | target 13 | pom.xml.versionsBackup 14 | 15 | # Dropwizard 16 | dependency-reduced-pom.xml 17 | 18 | # Mobile Tools for Java (J2ME) 19 | .mtj.tmp/ 20 | 21 | # Package Files 22 | *.jar 23 | *.war 24 | *.ear 25 | 26 | # IntelliJ IDEA 27 | *.iml 28 | .idea 29 | 30 | # Eclipse 31 | .project 32 | .settings 33 | .classpath 34 | test-output 35 | 36 | # Vim 37 | *.swp 38 | 39 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 40 | hs_err_pid* 41 | 42 | # Misc 43 | *git.properties 44 | 45 | # Asciidoc 46 | .asciidoctor 47 | diag-*.png 48 | -------------------------------------------------------------------------------- /_commons-java/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | ro.code4.czl 7 | czl-scrape 8 | 0.0.1-SNAPSHOT 9 | ../ 10 | 11 | 12 | czl-scrape-commons 13 | jar 14 | 15 | Ce Zice Legea :: Scraper :: Common Libraries 16 | 17 | 18 | 19 | 20 | us.codecraft 21 | webmagic-core 22 | 23 | 24 | org.slf4j 25 | slf4j-log4j12 26 | 27 | 28 | 29 | 30 | 31 | 32 | org.slf4j 33 | slf4j-api 34 | 35 | 36 | 37 | 38 | org.glassfish.jersey.core 39 | jersey-client 40 | 41 | 42 | org.glassfish.jersey.connectors 43 | jersey-apache-connector 44 | 45 | 46 | org.glassfish.jersey.media 47 | jersey-media-json-jackson 48 | 49 | 50 | 51 | org.apache.commons 52 | commons-lang3 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/ApiClient.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 
8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import ro.code4.czl.scrape.client.core.JerseyClientApiInvoker; 23 | 24 | /** 25 | * {@link ApiClient} instances are heavyweight objects that should be created sparingly. A {@link ApiClient} object is 26 | * thread-safe and should be reused when targeting the same service endpoint. 27 | * 28 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 29 | */ 30 | public abstract class ApiClient implements AutoCloseable { 31 | 32 | protected final ApiInvoker apiInvoker; 33 | 34 | /** 35 | * Creates a new client instance using all the settings specified by the given configuration object. 36 | * 37 | * @param config a client configuration object 38 | */ 39 | protected ApiClient(CzlClientConfig config) { 40 | this(config, new JerseyClientApiInvoker(config)); 41 | } 42 | 43 | /** 44 | * Creates a new client instance using all the settings specified by the given configuration object and a custom {@link ApiInvoker} instance. 45 | * 46 | * @param config a client configuration object 47 | * @param apiInvoker a custom API invoker object 48 | */ 49 | private ApiClient(CzlClientConfig config, ApiInvoker apiInvoker) { 50 | this.apiInvoker = apiInvoker; 51 | } 52 | 53 | /** 54 | * Retrieves the API invoker object used by this client. 55 | * 56 | * @return a {@link ApiInvoker} instance 57 | */ 58 | public ApiInvoker getApiInvoker() { 59 | return apiInvoker; 60 | } 61 | 62 | @Override 63 | public void close() throws Exception { 64 | this.shutdown(); 65 | } 66 | 67 | /** 68 | * Shuts down the connection manager used by this client and releases allocated resources. This includes closing all connections, whether they are 69 | * currently used or not. 70 | */ 71 | private void shutdown() { 72 | apiInvoker.shutdown(); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/ApiInvoker.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 
18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Basic API invoker contract. 24 | * 25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 26 | */ 27 | public interface ApiInvoker extends AutoCloseable { 28 | 29 | /** 30 | * Configures a request header that should be added to every request made via this API invoker. 31 | * 32 | * @param key request header name 33 | * @param value request header value 34 | */ 35 | void addDefaultHeader(String key, String value); 36 | 37 | /** 38 | * Executes a request. 39 | * 40 | * @param request the request to execute 41 | * @param the type that the response should be deserialized into 42 | * @return a {@link Response} instance containing the response body deserialized into the desired type 43 | */ 44 | Response invokeAPI(Request request); 45 | 46 | /** 47 | * Shuts down the connection manager used by this API invoker and releases allocated resources. This includes closing all connections, whether they 48 | * are currently used or not. 49 | */ 50 | void shutdown(); 51 | } 52 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/AuthenticationStrategy.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Contract for an authentication strategy. 24 | * 25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 26 | */ 27 | public interface AuthenticationStrategy { 28 | 29 | /** 30 | * Processes the request with the goal of applying the authentication strategy. This is called before the request is executed. 31 | * 32 | * @param request the request. 33 | * @param the expected type of the response body 34 | */ 35 | void process(Request request); 36 | 37 | } 38 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/BaseRequestBuilder.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. 
The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | 25 | /** 26 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 27 | */ 28 | public abstract class BaseRequestBuilder, U> implements RequestBuilder { 29 | 30 | String ifNoneMatch; 31 | String ifMatch; 32 | Map customHeaders = new HashMap<>(); 33 | boolean head; 34 | Boolean followRedirects; 35 | Credential credential; 36 | 37 | @Override 38 | public RequestBuilder ifNoneMatch(String ifNoneMatch) { 39 | this.ifNoneMatch = ifNoneMatch; 40 | return this; 41 | } 42 | 43 | @Override 44 | public RequestBuilder ifMatch(String ifMatch) { 45 | this.ifMatch = ifMatch; 46 | return this; 47 | } 48 | 49 | @Override 50 | public RequestBuilder headersOnly() { 51 | this.head = true; 52 | return this; 53 | } 54 | 55 | @Override 56 | public RequestBuilder followRedirects(boolean followRedirects) { 57 | this.followRedirects = followRedirects; 58 | return this; 59 | } 60 | 61 | @Override 62 | public RequestBuilder credential(Credential credential) { 63 | this.credential = credential; 64 | return this; 65 | } 66 | 67 | @Override 68 | public RequestBuilder header(String headerName, String headerValue) { 69 | this.customHeaders.put(headerName, headerValue); 70 | return this; 71 | } 72 | 73 | @Override 74 | public Response execute() { 75 | return build().execute(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/Credential.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Marker interface for credential used during authentication. Used by {@linkplain AuthenticationStrategy} implementations. 
24 | * 25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 26 | */ 27 | public interface Credential { 28 | 29 | } 30 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlApiUploadPipeline.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client; 2 | 3 | import static ro.code4.czl.scrape.client.representation.PublicationRepresentation.PublicationRepresentationBuilder.aPublicationRepresentation; 4 | 5 | import ro.code4.czl.scrape.client.representation.DocumentRepresentation; 6 | import us.codecraft.webmagic.ResultItems; 7 | import us.codecraft.webmagic.Task; 8 | import us.codecraft.webmagic.pipeline.Pipeline; 9 | 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | /** 14 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 15 | */ 16 | public class CzlApiUploadPipeline implements Pipeline { 17 | 18 | private final CzlClient czlClient; 19 | 20 | public CzlApiUploadPipeline(CzlClient czlClient) { 21 | this.czlClient = czlClient; 22 | } 23 | 24 | @SuppressWarnings("unchecked") 25 | @Override 26 | public void process(ResultItems resultItems, Task task) { 27 | Map extractedFields = resultItems.getAll(); 28 | 29 | czlClient.apiV1() 30 | .createPublication(aPublicationRepresentation() 31 | .withDate((String) extractedFields.get("date")) 32 | .withInstitution((String) extractedFields.get("institution")) 33 | .withIdentifier((String) extractedFields.get("identifier")) 34 | .withDescription((String) extractedFields.get("description")) 35 | .withDocuments((List) extractedFields.get("documents")) 36 | .withTitle((String) extractedFields.get("title")) 37 | .withType((String) extractedFields.get("type")) 38 | //.withFeedback_days((int) extractedFields.get("feedbackDays")) 39 | .withContact((Map) extractedFields.get("contact")) 40 | .build()) 41 | .execute(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlApiV1.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import ro.code4.czl.scrape.client.model.CreatePublicationRequest; 23 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation; 24 | 25 | /** 26 | * A class describing the API for Ce Zice Legea. Uses a fluent builder interface to create requests. 
27 | * 28 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 29 | */ 30 | public class CzlApiV1 { 31 | 32 | private final ApiInvoker apiInvoker; 33 | 34 | /** 35 | * Creates a new request builder. 36 | * 37 | * @param apiInvoker the {@linkplain ApiInvoker} implementation to use for every request built via this class. 38 | * @see ApiInvoker 39 | */ 40 | CzlApiV1(ApiInvoker apiInvoker) { 41 | this.apiInvoker = apiInvoker; 42 | } 43 | 44 | /** 45 | * Starts preparing a new request for creating a publication. 46 | * 47 | * @param publicationRepresentation the representation of the publication to create. 48 | * @return a request builder. 49 | */ 50 | public CreatePublicationRequest.Builder createPublication(PublicationRepresentation publicationRepresentation) { 51 | return CreatePublicationRequest.builder(publicationRepresentation, apiInvoker); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlClient.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * An REST client object. {@link CzlClient} instances are heavyweight objects that should be created sparingly. A {@link CzlClient} object is 24 | * thread-safe and should be reused when targeting the same service endpoint. 25 | * 26 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 27 | */ 28 | public class CzlClient extends ApiClient { 29 | 30 | /** 31 | * Build a new client instance using all the settings specified by the given configuration object. {@link CzlClient} instances are heavyweight objects 32 | * that should be created sparingly. A {@link CzlClient} object is thread-safe and should be reused when targeting the same service endpoint. 33 | * 34 | * @param czlClientConfig a client configuration object 35 | * @return a new SDK client instance 36 | */ 37 | public static CzlClient newClient(CzlClientConfig czlClientConfig) { 38 | return new CzlClient(czlClientConfig); 39 | } 40 | 41 | private CzlClient(CzlClientConfig czlClientConfig) { 42 | super(czlClientConfig); 43 | } 44 | 45 | 46 | /** 47 | * Access the API. 48 | * 49 | * @return an object describing the API. 
50 | */ 51 | public CzlApiV1 apiV1() { 52 | return new CzlApiV1(apiInvoker); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/Request.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import java.util.Map; 23 | 24 | /** 25 | * Contract for a request made by the client. 26 | * 27 | * @param the expected type of the response body 28 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 29 | */ 30 | public interface Request { 31 | 32 | /** 33 | * Executes the request and returns the response. 34 | * 35 | * @return the result of the execution. If the response contains a body, it will be automatically deserialized and ready for use. 36 | */ 37 | Response execute(); 38 | 39 | /** 40 | * Returns the type of response body, if any; null otherwise. 41 | * 42 | * @return the response type 43 | */ 44 | Class getResponseType(); 45 | 46 | /** 47 | * Returns the absolute path of the target of this request. 48 | * 49 | * @return the absolute path. 50 | */ 51 | String getPath(); 52 | 53 | /** 54 | * Returns the HTTP method used by this request. 55 | * 56 | * @return the method. 57 | */ 58 | String getMethod(); 59 | 60 | /** 61 | * Returns the path parameters used by this request. 62 | * 63 | * @return the path parameters. 64 | */ 65 | Map getPathParams(); 66 | 67 | /** 68 | * Returns the query parameters used by this request. 69 | * 70 | * @return the query parameters. 71 | */ 72 | Map getQueryParams(); 73 | 74 | /** 75 | * Returns the matrix parameters used by this request. 76 | * 77 | * @return the matrix parameters. 78 | */ 79 | Map getMatrixParams(); 80 | 81 | /** 82 | * Returns the header parameters used by this request. 83 | * 84 | * @return the header parameters. 85 | */ 86 | Map getHeaderParams(); 87 | 88 | /** 89 | * Returns the body used by this request, if any. 90 | * 91 | * @return the body if one has been specified, null otherwise. 92 | */ 93 | Object getBody(); 94 | 95 | /** 96 | * Returns the value of the Accept used by this request. 97 | * 98 | * @return the value of the Accept header. 99 | */ 100 | String getAcceptHeader(); 101 | 102 | /** 103 | * Indicates whether this request is supposed to follow redirects or not. 104 | * 105 | * @return true if the request is supposed to follow redirects, false otherwise. 106 | */ 107 | Boolean isFollowRedirectsEnabled(); 108 | 109 | /** 110 | * Returns the value of the credential used by this request. 111 | * 112 | * @return the credential, if any. 
113 | */ 114 | Credential getCredential(); 115 | } 116 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/RequestBuilder.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Contract for builders of {@linkplain Request} instances. 24 | * 25 | * @param the request type 26 | * @param the expected type of the response body 27 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 28 | */ 29 | public interface RequestBuilder, U> { 30 | 31 | /** 32 | * Sets the If-None-Match header to the given value. Useful when making conditional requests. 33 | * 34 | * @param ifNoneMatch the value of the header. 35 | * @return the request builder. 36 | */ 37 | RequestBuilder ifNoneMatch(String ifNoneMatch); 38 | 39 | /** 40 | * Sets the If-Match header to the given value. Useful when making conditional requests. 41 | * 42 | * @param ifMatch the value of the header. 43 | * @return the request builder. 44 | */ 45 | RequestBuilder ifMatch(String ifMatch); 46 | 47 | /** 48 | * Make the request to only ask for headers. Only applies when the original request is using GET. 49 | * 50 | * @return the request builder. 51 | */ 52 | RequestBuilder headersOnly(); 53 | 54 | /** 55 | * Enables or disables following redirects. 56 | * 57 | * @param followRedirects set to true to enable following redirects, otherwise to false. 58 | * @return the request builder. 59 | */ 60 | RequestBuilder followRedirects(boolean followRedirects); 61 | 62 | /** 63 | * Use the given credential for this request. 64 | * 65 | * @param credential the credential to use for this request. 66 | * @return the request builder. 67 | */ 68 | RequestBuilder credential(Credential credential); 69 | 70 | /** 71 | * Adds a custom header to this request. 72 | * 73 | * @param headerName the header name for this request. 74 | * @param headerValue the header value for this request. 75 | * @return the request builder. 76 | */ 77 | RequestBuilder header(String headerName, String headerValue); 78 | 79 | /** 80 | * Build the request. Does not execute it. 81 | * 82 | * @return the request. 83 | */ 84 | T build(); 85 | 86 | /** 87 | * Builds and executes the request. 88 | * 89 | * @return the result of the execution of the request. 
90 | */ 91 | Response execute(); 92 | 93 | } 94 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/Response.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import java.util.Date; 23 | import java.util.Map; 24 | 25 | /** 26 | * Contract for a response to a request made by the client. 27 | * 28 | * @param the expected type of the response body 29 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 30 | */ 31 | public interface Response { 32 | 33 | /** 34 | * Returns the status code of the response. 35 | * 36 | * @return the status code. 37 | */ 38 | int getStatusCode(); 39 | 40 | /** 41 | * Returns the entity in the response. 42 | * 43 | * @return the entity. 44 | */ 45 | T getEntity(); 46 | 47 | /** 48 | * Returns the content type of the response. 49 | * 50 | * @return the content type. 51 | */ 52 | String getContentType(); 53 | 54 | /** 55 | * Returns the content length of the response. 56 | * 57 | * @return the content length. 58 | */ 59 | long getContentLength(); 60 | 61 | /** 62 | * Returns the ETag header value, if any. 63 | * 64 | * @return the ETag header value. 65 | */ 66 | String getETag(); 67 | 68 | /** 69 | * Returns the date of the response. 70 | * 71 | * @return the date. 72 | */ 73 | Date getDate(); 74 | 75 | /** 76 | * Returns the value of a given response header. 77 | * 78 | * @param headerName the header name. 79 | * @return the header value. 80 | */ 81 | String getHeaderString(String headerName); 82 | 83 | /** 84 | * Returns all the response headers. 85 | * 86 | * @return the response headers. 
87 | */ 88 | Map getHeaders(); 89 | } 90 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/authentication/TokenAuthenticationStrategy.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.authentication; 2 | 3 | import ro.code4.czl.scrape.client.AuthenticationStrategy; 4 | import ro.code4.czl.scrape.client.Request; 5 | 6 | /** 7 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 8 | */ 9 | public class TokenAuthenticationStrategy implements AuthenticationStrategy { 10 | 11 | private final String tokenValue = System.getProperty("czl.scrape.token"); 12 | 13 | @Override 14 | public void process(Request request) { 15 | request.getHeaderParams().put("Authorization", "Token " + tokenValue); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/CloseIdleConnectionsTask.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import org.apache.http.conn.HttpClientConnectionManager; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | import java.util.concurrent.TimeUnit; 27 | 28 | /** 29 | * Closes idle or expired connections created by a specific connection manager. 30 | * 31 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 32 | */ 33 | class CloseIdleConnectionsTask implements Runnable { 34 | 35 | private static final Logger logger = LoggerFactory.getLogger(CloseIdleConnectionsTask.class); 36 | 37 | private final HttpClientConnectionManager connectionManager; 38 | private final int idleTime; 39 | 40 | /** 41 | * Creates a new task. 
42 | * 43 | * @param connectionManager the connection manager that will be periodically checked 44 | * @param idleTime the inactivity time in milliseconds after which connections are considered to be idle 45 | */ 46 | CloseIdleConnectionsTask(HttpClientConnectionManager connectionManager, int idleTime) { 47 | this.connectionManager = connectionManager; 48 | this.idleTime = idleTime; 49 | } 50 | 51 | @Override 52 | public void run() { 53 | try { 54 | connectionManager.closeExpiredConnections(); 55 | connectionManager.closeIdleConnections(idleTime, TimeUnit.MILLISECONDS); 56 | } catch (Exception t) { 57 | logger.warn("Unable to close idle connections", t); 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsJacksonConfigurator.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import com.fasterxml.jackson.databind.DeserializationFeature; 23 | import com.fasterxml.jackson.databind.ObjectMapper; 24 | 25 | import javax.ws.rs.ext.ContextResolver; 26 | import javax.ws.rs.ext.Provider; 27 | 28 | /** 29 | * Provides custom configuration for Jackson. 30 | * 31 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 32 | */ 33 | @Provider 34 | public class JaxRsJacksonConfigurator implements ContextResolver { 35 | 36 | private final ObjectMapper mapper; 37 | 38 | public JaxRsJacksonConfigurator() { 39 | mapper = new ObjectMapper(); 40 | mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); 41 | } 42 | 43 | @Override 44 | public ObjectMapper getContext(Class type) { 45 | return mapper; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsResponse.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 
15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import jersey.repackaged.com.google.common.collect.Maps; 23 | import ro.code4.czl.scrape.client.Response; 24 | 25 | import java.util.Collections; 26 | import java.util.Date; 27 | import java.util.List; 28 | import java.util.Map; 29 | 30 | /** 31 | * Wrapper over {@linkplain javax.ws.rs.core.Response} that provides a safe body deserialization mechanism along with some syntactic sugar. 32 | * 33 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 34 | */ 35 | class JaxRsResponse implements Response { 36 | 37 | private final javax.ws.rs.core.Response originalResponse; 38 | private final Map simplifiedHeaders; 39 | private final T entity; 40 | 41 | JaxRsResponse(javax.ws.rs.core.Response originalResponse, Class expectedType) { 42 | this.originalResponse = originalResponse; 43 | this.entity = new JaxRsResponseDeserializationStrategy().read(originalResponse, expectedType); 44 | this.simplifiedHeaders = Collections.unmodifiableMap( 45 | Maps.transformEntries(originalResponse.getStringHeaders(), new StringListToStringEntryTransformer())); 46 | } 47 | 48 | @Override 49 | public int getStatusCode() { 50 | return originalResponse.getStatus(); 51 | } 52 | 53 | @Override 54 | public T getEntity() { 55 | return entity; 56 | } 57 | 58 | @Override 59 | public String getContentType() { 60 | return originalResponse.getMediaType().toString(); 61 | } 62 | 63 | @Override 64 | public long getContentLength() { 65 | return originalResponse.getLength(); 66 | } 67 | 68 | @Override 69 | public String getETag() { 70 | return originalResponse.getEntityTag().getValue(); 71 | } 72 | 73 | @Override 74 | public Date getDate() { 75 | return originalResponse.getDate(); 76 | } 77 | 78 | @Override 79 | public String getHeaderString(String headerName) { 80 | return originalResponse.getHeaderString(headerName); 81 | } 82 | 83 | @Override 84 | public Map getHeaders() { 85 | return simplifiedHeaders; 86 | } 87 | 88 | private static class StringListToStringEntryTransformer implements Maps.EntryTransformer, String> { 89 | 90 | @Override 91 | public String transformEntry(String s, List strings) { 92 | if (strings == null || strings.isEmpty()) { 93 | return null; 94 | } 95 | return strings.get(0); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsResponseDeserializationStrategy.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 
15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import java.io.InputStream; 23 | 24 | import javax.ws.rs.core.Response; 25 | 26 | /** 27 | * Deserialization strategy that ensures the response body is safely deserialized and that the input stream is properly closed. 28 | * 29 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 30 | */ 31 | class JaxRsResponseDeserializationStrategy { 32 | 33 | @SuppressWarnings("unchecked") 34 | T read(Response response, Class expectedType) { 35 | if (!response.hasEntity()) { 36 | response.close(); 37 | return null; 38 | } 39 | 40 | if (InputStream.class.isAssignableFrom(expectedType)) { 41 | return (T) response.getEntity(); 42 | } else { 43 | if (response.getStatusInfo().getFamily() == Response.Status.Family.SUCCESSFUL) { 44 | try { 45 | return response.readEntity(expectedType); 46 | } finally { 47 | response.close(); 48 | } 49 | } 50 | } 51 | 52 | response.close(); 53 | return null; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/model/CreatePublicationRequest.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.model; 2 | 3 | import ro.code4.czl.scrape.client.ApiInvoker; 4 | import ro.code4.czl.scrape.client.BaseRequest; 5 | import ro.code4.czl.scrape.client.BaseRequestBuilder; 6 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation; 7 | 8 | import javax.ws.rs.HttpMethod; 9 | import javax.ws.rs.core.MediaType; 10 | 11 | /** 12 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 13 | */ 14 | public class CreatePublicationRequest extends BaseRequest { 15 | 16 | private CreatePublicationRequest(CreatePublicationRequest.Builder builder) { 17 | super(builder, "publications/", HttpMethod.POST, MediaType.APPLICATION_JSON, builder.apiInvoker); 18 | 19 | setBody(builder.spaceRepresentation); 20 | } 21 | 22 | public static CreatePublicationRequest.Builder builder(PublicationRepresentation spaceRepresentation, ApiInvoker apiInvoker) { 23 | return new CreatePublicationRequest.Builder(spaceRepresentation, apiInvoker); 24 | } 25 | 26 | @Override 27 | public Class getResponseType() { 28 | return PublicationRepresentation.class; 29 | } 30 | 31 | public static class Builder extends BaseRequestBuilder { 32 | 33 | private final ApiInvoker apiInvoker; 34 | private final PublicationRepresentation spaceRepresentation; 35 | 36 | Builder(PublicationRepresentation spaceRepresentation, ApiInvoker apiInvoker) { 37 | this.apiInvoker = apiInvoker; 38 | this.spaceRepresentation = spaceRepresentation; 39 | } 40 | 41 | @Override 42 | public CreatePublicationRequest build() { 43 | return new CreatePublicationRequest(this); 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/representation/ContactRepresentation.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.representation; 2 | 3 | import com.fasterxml.jackson.annotation.JsonInclude; 4 | import com.fasterxml.jackson.annotation.JsonInclude.Include; 5 | 
6 | /** 7 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 8 | */ 9 | @JsonInclude(Include.NON_NULL) 10 | public class ContactRepresentation { 11 | 12 | private String tel; 13 | private String email; 14 | 15 | public ContactRepresentation() { 16 | } 17 | 18 | public ContactRepresentation(String tel, String email) { 19 | this.tel = tel; 20 | this.email = email; 21 | } 22 | 23 | public String getTel() { 24 | return tel; 25 | } 26 | 27 | public void setTel(String tel) { 28 | this.tel = tel; 29 | } 30 | 31 | public String getEmail() { 32 | return email; 33 | } 34 | 35 | public void setEmail(String email) { 36 | this.email = email; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/representation/DocumentRepresentation.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.representation; 2 | 3 | import com.fasterxml.jackson.annotation.JsonInclude; 4 | import com.fasterxml.jackson.annotation.JsonInclude.Include; 5 | 6 | import org.apache.commons.lang3.builder.ToStringBuilder; 7 | import org.apache.commons.lang3.builder.ToStringStyle; 8 | 9 | /** 10 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 11 | */ 12 | @JsonInclude(Include.NON_NULL) 13 | public class DocumentRepresentation { 14 | 15 | private String type; 16 | private String url; 17 | 18 | public DocumentRepresentation() { 19 | } 20 | 21 | public DocumentRepresentation(String type, String url) { 22 | this.type = type; 23 | this.url = url; 24 | } 25 | 26 | public String getType() { 27 | return type; 28 | } 29 | 30 | public void setType(String type) { 31 | this.type = type; 32 | } 33 | 34 | public String getUrl() { 35 | return url; 36 | } 37 | 38 | public void setUrl(String url) { 39 | this.url = url; 40 | } 41 | 42 | @Override 43 | public String toString() { 44 | return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) 45 | .append("type", type) 46 | .append("url", url) 47 | .toString(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/samples/CzlClientSample.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.samples; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import ro.code4.czl.scrape.client.CzlClient; 7 | import ro.code4.czl.scrape.client.CzlClientConfig; 8 | import ro.code4.czl.scrape.client.authentication.TokenAuthenticationStrategy; 9 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation.PublicationRepresentationBuilder; 10 | 11 | /** 12 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 13 | */ 14 | public class CzlClientSample { 15 | 16 | private static final Logger logger = LoggerFactory.getLogger(CzlClientSample.class); 17 | 18 | public static void main(String[] args) { 19 | 20 | CzlClientConfig clientConfig = CzlClientConfig.builder() 21 | .endpointURI("http://czl-api.code4.ro/api/") 22 | .connectionRequestTimeout(500) 23 | .connectTimeout(500) 24 | .socketTimeout(3000) 25 | .authenticationStrategy(new TokenAuthenticationStrategy()) 26 | .build(); 27 | 28 | try (CzlClient czlClient = CzlClient.newClient(clientConfig)) { 29 | czlClient.apiV1().createPublication(PublicationRepresentationBuilder 30 | .aPublicationRepresentation() 31 | .withIdentifier("1") 32 | 
.withInstitution("finantepub") 33 | .withType("HG") 34 | .withDate("2017-03-08") 35 | .build()) 36 | .execute(); 37 | } catch (Exception e) { 38 | logger.error("Met an error.", e); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/text/ProposalType.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.text; 2 | 3 | /** 4 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 5 | */ 6 | public enum ProposalType { 7 | 8 | HG, LEGE, OM, OG, OUG, OTHER; 9 | 10 | public static ProposalType fromLabel(String label) { 11 | switch (label.toLowerCase()) { 12 | case "hg": 13 | case "hotarare": { 14 | return HG; 15 | } 16 | case "lege": { 17 | return LEGE; 18 | } 19 | case "om": 20 | case "ordin": { 21 | return OM; 22 | } 23 | case "og": { 24 | return OG; 25 | } 26 | case "oug": { 27 | return OUG; 28 | } 29 | default: { 30 | return OTHER; 31 | } 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/text/RomanianMonth.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.text; 2 | 3 | /** 4 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 5 | */ 6 | public enum RomanianMonth { 7 | 8 | IANUARIE(1), 9 | FEBRUARIE(2), 10 | MARTIE(3), 11 | APRILIE(4), 12 | MAI(5), 13 | IUNIE(6), 14 | IULIE(7), 15 | AUGUST(8), 16 | SEPTEMBRIE(9), 17 | OCTOMBRIE(10), 18 | NOIEMBRIE(11), 19 | DECEMBRIE(12); 20 | 21 | private final int number; 22 | 23 | RomanianMonth(int number) { 24 | this.number = number; 25 | } 26 | 27 | public int getNumber() { 28 | return number; 29 | } 30 | 31 | public static RomanianMonth fromLabel(String value) { 32 | switch (value.toLowerCase()) { 33 | case "ianuarie": { 34 | return IANUARIE; 35 | } 36 | case "februarie": { 37 | return FEBRUARIE; 38 | } 39 | case "martie": { 40 | return MARTIE; 41 | } 42 | case "aprilie": { 43 | return APRILIE; 44 | } 45 | case "mai": { 46 | return MAI; 47 | } 48 | case "iunie": { 49 | return IUNIE; 50 | } 51 | case "iulie": { 52 | return IULIE; 53 | } 54 | case "august": { 55 | return AUGUST; 56 | } 57 | case "septembrie": { 58 | return SEPTEMBRIE; 59 | } 60 | case "octombrie": { 61 | return OCTOMBRIE; 62 | } 63 | case "noiembrie": { 64 | return NOIEMBRIE; 65 | } 66 | case "decembrie": { 67 | return DECEMBRIE; 68 | } 69 | default: { 70 | throw new RuntimeException("Unrecognized month label " + value); 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /afaceri/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul pentru Mediul de Afaceri, Comerț și Antreprenoriat 2 | 3 | Surse de documente este http://www.antreprenoriat.gov.ro/categorie/transparenta-decizionala/proiecte-in-dezbatere-publica/ . 4 | 5 | ### Tehnologie 6 | *NodeJS* - serverul se conecteaza la URL-ul setat in fisierul din config, descarca fisierele PDF, parseaza continutul lor, trimite obiectele generate la API si sterge fisierele PDF de pe disc. 
7 | 8 | ### Instructiuni 9 | Token-ul de autentificare la API trebuie setat in fisierul *config.json*. 10 | 11 | Continutul PDF-urilor se proceseaza in paragrafe. Serverul obtine datele necesare din paragraful relevant. Paragraful relevant reprezinta primul paragraf cu un numar total mai mare de 8 cuvinte si 50 de litere (configurabil in *config.json*) 12 | ``` 13 | npm install 14 | node server/server.js 15 | ``` 16 | 17 | ### Exceptii 18 | Datele documentelor nu exista intr-un format standardizat. Date interpretabile exista in URL-urile fisierelor si in numele acestora. 19 | 20 | La fiecare rulare a server-ului, sunt (re)procesate fisierele din URL-ul principal. 21 | -------------------------------------------------------------------------------- /afaceri/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "afaceri", 3 | "version": "1.0.0", 4 | "main": "server/server.js", 5 | "scripts": { 6 | "lint": "eslint .", 7 | "start": "node .", 8 | "posttest": "npm run lint && nsp check" 9 | }, 10 | "dependencies": { 11 | "async": "^2.1.5", 12 | "cheerio": "^0.22.0", 13 | "compression": "^1.0.3", 14 | "cors": "^2.5.2", 15 | "helmet": "^1.3.0", 16 | "loopback": "^2.22.0", 17 | "loopback-boot": "^2.6.5", 18 | "loopback-component-explorer": "^2.4.0", 19 | "loopback-datasource-juggler": "^2.39.0", 20 | "pdf2json": "^1.1.7", 21 | "serve-favicon": "^2.0.1", 22 | "string": "^3.3.3", 23 | "strong-error-handler": "^1.0.1" 24 | }, 25 | "devDependencies": { 26 | "eslint": "^2.13.1", 27 | "eslint-config-loopback": "^4.0.0", 28 | "nsp": "^2.1.0" 29 | }, 30 | "repository": { 31 | "type": "", 32 | "url": "" 33 | }, 34 | "license": "UNLICENSED", 35 | "description": "afaceri" 36 | } 37 | -------------------------------------------------------------------------------- /afaceri/server/boot/authentication.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = function enableAuthentication(server) { 4 | // enable authentication 5 | server.enableAuth(); 6 | }; 7 | -------------------------------------------------------------------------------- /afaceri/server/boot/root.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = function(server) { 4 | // Install a `/` route that returns server status 5 | var router = server.loopback.Router(); 6 | router.get('/', server.loopback.status()); 7 | server.use(router); 8 | }; 9 | -------------------------------------------------------------------------------- /afaceri/server/component-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "loopback-component-explorer": { 3 | "mountPath": "/explorer" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /afaceri/server/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "restApiRoot": "/api", 3 | "host": "0.0.0.0", 4 | "port": 3000, 5 | "remoting": { 6 | "context": false, 7 | "rest": { 8 | "handleErrors": false, 9 | "normalizeHttpPath": false, 10 | "xml": false 11 | }, 12 | "json": { 13 | "strict": false, 14 | "limit": "100kb" 15 | }, 16 | "urlencoded": { 17 | "extended": true, 18 | "limit": "100kb" 19 | }, 20 | "cors": false 21 | }, 22 | "legacyExplorer": false, 23 | "logoutSessionsOnSensitiveChanges": true, 24 | "userAgent": "jesus", 25 | "downloadsFolder": "downloads", 26 | 
"firstParagraphMinWords": 8, 27 | "firstParagraphMinLetters": 50, 28 | "APIKey": "Token dummy", 29 | "mainURL": "http://www.antreprenoriat.gov.ro/categorie/transparenta-decizionala/proiecte-in-dezbatere-publica/" 30 | } 31 | -------------------------------------------------------------------------------- /afaceri/server/config/keywords.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Andrei on 3/13/2017. 3 | */ 4 | 5 | var docType = [ 6 | { 7 | type: "LEGE", 8 | regex: new RegExp("proiect ([a-zA-Z]+\s?){1,3} ordonan", "i") 9 | }, 10 | { 11 | type: "OUG", 12 | regex: new RegExp("ordonan\\S{1,2} de urgen\\S{1,2}", "i") 13 | }, 14 | { 15 | type: "HG", 16 | regex: new RegExp("hot\S{1}r\S{1}re", "i") 17 | } 18 | ]; 19 | 20 | var titleStartMarkStrings = [ 21 | "privind ", 22 | "pentru " 23 | ]; 24 | 25 | var titleEndMarkStrings = [ 26 | "\n", 27 | "\r\n" 28 | ]; 29 | 30 | var titleEndMarkRegex = [ 31 | new RegExp("sec\\S{1}iune", "i") 32 | ]; 33 | 34 | module.exports = { 35 | docType: docType, 36 | titleStartMarkStrings: titleStartMarkStrings, 37 | titleEndMarkStrings: titleEndMarkStrings, 38 | titleEndMarkRegex: titleEndMarkRegex 39 | }; -------------------------------------------------------------------------------- /afaceri/server/datasources.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": { 3 | "name": "db", 4 | "connector": "memory" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /afaceri/server/middleware.development.json: -------------------------------------------------------------------------------- 1 | { 2 | "final:after": { 3 | "strong-error-handler": { 4 | "params": { 5 | "debug": true, 6 | "log": true 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /afaceri/server/middleware.json: -------------------------------------------------------------------------------- 1 | { 2 | "initial:before": { 3 | "loopback#favicon": {} 4 | }, 5 | "initial": { 6 | "compression": {}, 7 | "cors": { 8 | "params": { 9 | "origin": true, 10 | "credentials": true, 11 | "maxAge": 86400 12 | } 13 | }, 14 | "helmet#xssFilter": {}, 15 | "helmet#frameguard": { 16 | "params": [ 17 | "deny" 18 | ] 19 | }, 20 | "helmet#hsts": { 21 | "params": { 22 | "maxAge": 0, 23 | "includeSubdomains": true 24 | } 25 | }, 26 | "helmet#hidePoweredBy": {}, 27 | "helmet#ieNoOpen": {}, 28 | "helmet#noSniff": {}, 29 | "helmet#noCache": { 30 | "enabled": false 31 | } 32 | }, 33 | "session": {}, 34 | "auth": {}, 35 | "parse": {}, 36 | "routes": { 37 | "loopback#rest": { 38 | "paths": [ 39 | "${restApiRoot}" 40 | ] 41 | } 42 | }, 43 | "files": {}, 44 | "final": { 45 | "loopback#urlNotFound": {} 46 | }, 47 | "final:after": { 48 | "strong-error-handler": {} 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /afaceri/server/model-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "sources": [ 4 | "loopback/common/models", 5 | "loopback/server/models", 6 | "../common/models", 7 | "./models" 8 | ], 9 | "mixins": [ 10 | "loopback/common/mixins", 11 | "loopback/server/mixins", 12 | "../common/mixins", 13 | "./mixins" 14 | ] 15 | }, 16 | "User": { 17 | "dataSource": "db" 18 | }, 19 | "AccessToken": { 20 | "dataSource": "db", 21 | "public": false 22 | }, 23 | "ACL": { 24 | "dataSource": "db", 25 | "public": 
false 26 | }, 27 | "RoleMapping": { 28 | "dataSource": "db", 29 | "public": false, 30 | "options": { 31 | "strictObjectIDCoercion": true 32 | } 33 | }, 34 | "Role": { 35 | "dataSource": "db", 36 | "public": false 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /afaceri/server/server.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var loopback = require('loopback'); 4 | var boot = require('loopback-boot'); 5 | var contentParser = require('./controllers/contentParser'); 6 | 7 | var app = module.exports = loopback(); 8 | 9 | app.start = function() { 10 | // start the web server 11 | return app.listen(function() { 12 | app.emit('started'); 13 | var baseUrl = app.get('url').replace(/\/$/, ''); 14 | console.log('Web server listening at: %s', baseUrl); 15 | if (app.get('loopback-component-explorer')) { 16 | var explorerPath = app.get('loopback-component-explorer').mountPath; 17 | console.log('Browse your REST API at %s%s', baseUrl, explorerPath); 18 | } 19 | contentParser.init(); 20 | }); 21 | }; 22 | 23 | // Bootstrap the application, configure models, datasources and middleware. 24 | // Sub-apps like REST API are mounted via boot scripts. 25 | boot(app, __dirname, function(err) { 26 | if (err) throw err; 27 | 28 | // start the server if `$ node server.js` 29 | if (require.main === module) 30 | app.start(); 31 | }); 32 | -------------------------------------------------------------------------------- /agricultura/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules/ -------------------------------------------------------------------------------- /agricultura/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Agriculturii Şi Dezvoltării Rurale 2 | 3 | ## Tehnologie 4 | NodeJS, [Nightmare](http://www.nightmarejs.org) 5 | 6 | ## Instructiuni 7 | ``` 8 | npm install 9 | API_TOKEN=the_secret_api_token npm start 10 | ``` 11 | 12 | ## Exceptii 13 | -------------------------------------------------------------------------------- /agricultura/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "agricultura", 3 | "version": "1.0.0", 4 | "description": "scraper pentru agricultura", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "todo: add tests", 8 | "start": "node index.js" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/ciprian-chichirita/czl-scrape.git" 13 | }, 14 | "keywords": [ 15 | "code4romania", 16 | "ce", 17 | "zice", 18 | "legea", 19 | "agricultura" 20 | ], 21 | "author": "ciprian chichirita, alex morega", 22 | "license": "MIT", 23 | "bugs": { 24 | "url": "https://github.com/ciprian-chichirita/czl-scrape/issues" 25 | }, 26 | "homepage": "https://github.com/ciprian-chichirita/czl-scrape#readme", 27 | "devDependencies": { 28 | "moment": "^2.17.1", 29 | "nightmare": "^2.10.0", 30 | "request": "^2.81.0", 31 | "request-promise": "^4.1.1", 32 | "sha256": "^0.2.0" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /aparare/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Apărării Naţionale 2 | Sursa documente: http://dlaj.mapn.ro/ 3 | ## Tehnologie 4 | *PHP* - Script simplu old-school 5 | ## Instructiuni 6 | Nu are instructiuni speciale. 
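
The only input the script needs is the czl API token, which it uses, like the other scrapers in this repository, to authenticate its POSTs to the publications endpoint. A minimal sketch of that call, shown in Python purely for illustration (the real scraper is the PHP script `mapn_plugin.php`, and `publication` stands for whatever the script extracted):

```python
import requests

API_URL = "http://czl-api.code4.ro/api/publications/"

def upload(publication, token):
    # `token` is the value received on the command line (see below);
    # it is sent using the repository-wide "Token <value>" header convention.
    response = requests.post(API_URL, json=publication,
                             headers={"Authorization": "Token " + token})
    response.raise_for_status()
```
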
7 | 8 | Tokenul va fi transmis ca argument: 9 | ```bash 10 | $ php mapn_plugin.php TOKEN 11 | ``` 12 | ## Exceptii 13 | Din cauza faptului ca pagina html nu e consistenta, au fost folosite RegExuri pentru a lua informatiile. 14 | 15 | O problema a constat in faptul ca o intrare este constituita pe site-ul acesta din 2 elemente practic, mai exact 16 | titlul proiectului si documentele aferente, dar ele nu pot fi legate una de cealalta logic. De aceea, scriptul 17 | va functiona doar in cazul in care gaseste acelasi numar de titluri si grupuri de documente. 18 | 19 | Scriptul va intoarce false in urmatoarele situatii: 20 | * pagina este down 21 | * unul din elementele cheie de content este schimbat (titlurile nu mai au *, calea spre documente este schimbata) 22 | * numarul de titluri si numarul de grupuri de documente nu este acelasi -------------------------------------------------------------------------------- /apepaduri/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Apelor și Pădurilor 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /cdep/README.md: -------------------------------------------------------------------------------- 1 | # Camera Deputatilor 2 | 3 | ## Tehnologie 4 | python, scrapy 5 | 6 | ## Instructiuni 7 | ``` 8 | pip install -r requirements.txt 9 | API_TOKEN='the secret token' python scraper.py 10 | ``` 11 | 12 | ## Exceptii 13 | -------------------------------------------------------------------------------- /cdep/requirements.in: -------------------------------------------------------------------------------- 1 | scrapy 2 | requests 3 | -------------------------------------------------------------------------------- /cdep/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | asn1crypto==0.21.1 # via cryptography 8 | attrs==19.1.0 # via automat, service-identity, twisted 9 | automat==0.5.0 # via twisted 10 | certifi==2019.9.11 # via requests 11 | cffi==1.9.1 # via cryptography 12 | chardet==3.0.4 # via requests 13 | constantly==15.1.0 # via twisted 14 | cryptography==2.7 # via pyopenssl 15 | cssselect==1.0.1 # via parsel, scrapy 16 | hyperlink==19.0.0 # via twisted 17 | idna==2.7 # via hyperlink, requests 18 | incremental==16.10.1 # via twisted 19 | lxml==3.7.3 # via parsel, scrapy 20 | parsel==1.1.0 # via scrapy 21 | pyasn1-modules==0.0.8 # via service-identity 22 | pyasn1==0.2.3 # via pyasn1-modules, service-identity 23 | pycparser==2.17 # via cffi 24 | pydispatcher==2.0.5 # via scrapy 25 | pyhamcrest==1.9.0 # via twisted 26 | pyopenssl==17.5.0 # via scrapy, service-identity 27 | queuelib==1.4.2 # via scrapy 28 | requests==2.20.0 29 | scrapy==1.3.3 30 | service-identity==16.0.0 # via scrapy 31 | six==1.10.0 # via automat, cryptography, parsel, pyhamcrest, pyopenssl, scrapy, w3lib 32 | twisted==19.7.0 # via scrapy 33 | urllib3==1.24.3 # via requests 34 | w3lib==1.17.0 # via parsel, scrapy 35 | zope.interface==4.6.0 # via twisted 36 | 37 | # The following packages are considered to be unsafe in a requirements file: 38 | # setuptools==41.2.0 # via pyhamcrest, zope.interface 39 | -------------------------------------------------------------------------------- /cdep/scraper.py: 
-------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | import requests 4 | import os 5 | 6 | API_URL = 'http://czl-api.code4.ro/api/publications/' 7 | API_TOKEN = os.environ['API_TOKEN'] 8 | 9 | INDEX_URL = 'http://www.cdep.ro/pls/proiecte/upl_pck2015.lista?cam=2&anp=2017' 10 | 11 | def upload(doc): 12 | headers = {'Authorization': 'Token ' + API_TOKEN} 13 | resp = requests.post(API_URL, json=doc, headers=headers) 14 | if resp.status_code == 400: 15 | if re.search(r'Integrity Error: Key .* already exists', resp.text): 16 | return 17 | assert resp.status_code == 201 18 | 19 | class EducatieSpider(scrapy.Spider): 20 | 21 | name = 'cdep' 22 | start_urls = [INDEX_URL] 23 | 24 | def parse(self, response): 25 | for tr in response.css('.grup-parlamentar-list > table > tbody > tr'): 26 | href = tr.css('a::attr(href)').extract_first() 27 | url = response.urljoin(href) 28 | yield scrapy.Request(url, self.parse_proposal) 29 | 30 | def parse_proposal(self, response): 31 | cale_txt = ' '.join(t.extract() for t in response.css('.cale *::text')) 32 | plx_code = 'pl-x ' + re.search(r'pl-x\s+(\S+)', cale_txt.lower()).group(1) 33 | title = response.css('.detalii-initiativa h4::text').extract_first() 34 | 35 | table = response.css('#olddiv > table')[-1] 36 | for td in table.css('td'): 37 | td_text = (td.css('::text').extract_first() or '').strip() 38 | m = re.match(r'^(\d{2})\.(\d{2})\.(\d{4})$', td_text) 39 | if m: 40 | date = '{}-{}-{}'.format(m.group(3), m.group(2), m.group(1)) 41 | break 42 | 43 | documents = [] 44 | 45 | for pdf_link in response.css('.program-lucru-detalii a'): 46 | target = pdf_link.css('::attr(target)').extract_first() or '' 47 | if target.lower() != 'pdf': 48 | continue 49 | pdf_href = pdf_link.css('::attr(href)').extract_first() 50 | pdf_url = response.urljoin(pdf_href) 51 | label_tds = pdf_link.xpath('../../td') 52 | pdf_label = ' '.join( 53 | td.css('::text').extract_first() 54 | for td in label_tds[1:] 55 | ).strip() 56 | documents.append({ 57 | 'type': pdf_label, 58 | 'url': pdf_url, 59 | }) 60 | 61 | doc = { 62 | 'identifier': plx_code, 63 | 'title': title, 64 | 'institution': 'cdep', 65 | 'description': '', 66 | 'type': 'LEGE', 67 | 'date': date, 68 | 'documents': documents, 69 | } 70 | upload(doc) 71 | 72 | def main(): 73 | from scrapy.crawler import CrawlerProcess, Crawler 74 | process = CrawlerProcess() 75 | process.crawl(EducatieSpider) 76 | process.start() 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /cercetare/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset=utf-8 3 | end_of_line=crlf 4 | insert_final_newline=false 5 | indent_style=space 6 | indent_size=4 7 | 8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}] 9 | indent_style=space 10 | indent_size=2 11 | 12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}] 13 | indent_style=space 14 | indent_size=2 15 | 16 | [{*.applejs,*.js}] 17 | indent_style=space 18 | indent_size=4 19 | 20 | [{.analysis_options,*.yml,*.yaml}] 21 | indent_style=space 22 | indent_size=2 23 | 24 | -------------------------------------------------------------------------------- /cercetare/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | secrets.json 4 | data.json 
-------------------------------------------------------------------------------- /cercetare/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Cercetării și Inovării 2 | 3 | ## Tehnologie 4 | 5 | 6 | JavaScript EcmaScript2015 (ES6) 7 | 8 | 1. nodejs - https://nodejs.org/en/ 9 | 1. nightmare - https://github.com/segmentio/nightmare 10 | 1. cheerio - https://github.com/cheeriojs/cheerio 11 | 1. jsonfile - https://github.com/jprichardson/node-jsonfile 12 | 1. request - https://github.com/request/request 13 | 1. argv - https://github.com/yargs/yargs 14 | 1. diacritics - https://github.com/andrewrk/node-diacritics 15 | 16 | ## Instructiuni 17 | 18 | 1. install nodejs 19 | 1. run `npm update` 20 | 1. run `node app.js`, passing param `--post` will upload to api and also generate a file `data.json` to view data. 21 | 22 | ## Exceptii 23 | -------------------------------------------------------------------------------- /cercetare/app.js: -------------------------------------------------------------------------------- 1 | let nightmareConfig = {show: false}, 2 | cheerio = require('cheerio'), 3 | request = require('request'), 4 | parseProject = require('./parseProject'), 5 | jsonfile = require('jsonfile'), 6 | argv = require('yargs').argv, 7 | secrets = require('./secrets.json') || {}; 8 | 9 | const URL = 'http://www.research.gov.ro/ro/articol/1029/despre-ancs-legislatie-proiecte-de-acte-normative', 10 | BASE = 'http://www.research.gov.ro'; 11 | 12 | const FILE = 'data.json'; 13 | 14 | /** ====== MAIN ====== */ 15 | 16 | getNightmareInstance() 17 | .goto(URL) 18 | .wait('body') 19 | .evaluate(getHTMLContent) 20 | .end() 21 | .then(processHTMLContent) 22 | .then(parseListItems) 23 | .then(postParsedResults) 24 | .catch(handleErrors); 25 | 26 | 27 | /** ====== page ====== */ 28 | 29 | function getHTMLContent() { 30 | return document.querySelector('.icr_main .special_edit').innerHTML; 31 | } 32 | 33 | function processHTMLContent(result) { 34 | console.log('processing html page...'); 35 | 36 | return { 37 | feedback_days_element: cheerio.load(result)('p').children('a[href^=mailto]').parent()[0], 38 | items: cheerio.load(result)('table tbody tr') //.not(function(item) {return cheerio.load(item).text() && cheerio.load(item).text().indexOf('Data publicarii') === -1}) 39 | }; 40 | } 41 | 42 | 43 | /** ====== list items ====== */ 44 | 45 | function parseListItems(resultObject) { 46 | let items = resultObject.items, 47 | parseResults = []; 48 | 49 | items.each(function (i, item) { 50 | let $ = cheerio.load(item), 51 | content = $.text().replace(/\n/g, '').replace(/\t/g, ''); 52 | 53 | if(content && content.indexOf('Data publicarii') != 0) { 54 | parseResults.push(parseItem(resultObject.feedback_days_element, item)); 55 | } 56 | }); 57 | 58 | return parseResults; 59 | } 60 | 61 | function parseItem(feedback_days, item) { 62 | return parseProject(cheerio.load(item), BASE, cheerio.load(feedback_days)); 63 | } 64 | 65 | 66 | /** ====== post ====== */ 67 | 68 | function postParsedResults(parsedResultsArr) { 69 | 70 | console.log('saving data to file...'); 71 | 72 | jsonfile.writeFileSync(FILE, parsedResultsArr, {spaces: 4}); 73 | 74 | if (argv.post) { 75 | if (!(secrets.API_URL && secrets.TOKEN)) { 76 | throw new Error('Share your secrets with me. 
Pretty please :)'); 77 | } 78 | 79 | console.log('posting data to api...'); 80 | 81 | let requestsArr = []; 82 | 83 | parsedResultsArr.forEach(function (result, i) { 84 | let promise = new Promise(function (resolve, reject) { 85 | request({ 86 | uri: secrets.API_URL, 87 | method: 'POST', 88 | headers: { 89 | 'Authorization': 'Token ' + secrets.TOKEN, 90 | 'Content-Type': 'application/json' 91 | }, 92 | json: result 93 | }, function (error, response, body) { 94 | if (error || response.statusCode !== 200) { 95 | console.error('request failed: ', error) 96 | } 97 | 98 | resolve(body); 99 | }) 100 | }); 101 | 102 | requestsArr.push(promise); 103 | }); 104 | 105 | Promise.all(requestsArr).then(function (response) { 106 | console.log('done!'); 107 | process.exit(0); 108 | }).catch(function (err) { 109 | throw new Error(err); 110 | }); 111 | } else { 112 | console.log('done!'); 113 | process.exit(0); 114 | } 115 | } 116 | 117 | 118 | /** ====== utils ====== */ 119 | 120 | function getNightmareInstance() { 121 | return require('nightmare')(nightmareConfig); 122 | } 123 | 124 | function handleErrors(error) { 125 | throw new Error(error); 126 | } -------------------------------------------------------------------------------- /cercetare/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "diacritics": "1.3.0", 14 | "jsonfile": "2.4.0", 15 | "nightmare": "2.10.0", 16 | "nodemon": "1.11.0", 17 | "q": "1.4.1", 18 | "request": "^2.81.0", 19 | "yargs": "7.0.2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /cercetare/secrets.json.txt: -------------------------------------------------------------------------------- 1 | { 2 | "TOKEN": "something something", 3 | "API_URL": "http://something.com/api/post-parsed-results" 4 | } -------------------------------------------------------------------------------- /dezvoltare/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | .DS_Store 4 | *.egg-info 5 | build 6 | *.pyc 7 | **/*.pyc 8 | dbs 9 | -------------------------------------------------------------------------------- /dezvoltare/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Dezvoltării Regionale, Administrației Publice și Fondurilor Europene 2 | 3 | http://www.mdrap.gov.ro/transparenta/consultari-publice/ 4 | 5 | ## Tehnologie 6 | 7 | *Python 2.7* 8 | [Scrapy 1.3.3](https://scrapy.org/) 9 | 10 | ## Instructiuni 11 | 12 | ``` 13 | pip install -r requirements.txt 14 | cd crawl_dezvoltare 15 | scrapy crawl mdrap -a token=xxxx 16 | ``` 17 | 18 | ## Exceptii -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/__init__.py -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/exporters.py: -------------------------------------------------------------------------------- 1 | 
from scrapy.exporters import BaseItemExporter -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawlDezvoltareItem(scrapy.Item): 12 | identifier = scrapy.Field() 13 | title = scrapy.Field() 14 | type = scrapy.Field() 15 | institution = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | tel = scrapy.Field() 22 | email = scrapy.Field() 23 | documents = scrapy.Field() -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlDezvoltareSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import requests 8 | 9 | class CrawlDezvoltarePipeline(object): 10 | def process_item(self, item, spider): 11 | doc = { 12 | 'identifier': item['identifier'], 13 | 'title': item['title'], 14 | 'institution': item['institution'], 15 | 'description': item['description'], 16 | 'type': item['type'], 17 | 'date': item['date'], 18 | 'documents': item['documents'], 19 | 'contact':item['contact'], 20 | 'feedback_days': item['feedback_days'] 21 | } 22 | 23 | response = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token ' + spider.token }, json=doc) 24 | # print '---------' 25 | # print response 26 | # print response.text 27 | # print '---------' 28 | return item 29 | 30 | 31 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl_dezvoltare project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl_dezvoltare' 13 | 14 | SPIDER_MODULES = ['crawl_dezvoltare.spiders'] 15 | NEWSPIDER_MODULE = 'crawl_dezvoltare.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'code4romania (http://code4.ro)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl_dezvoltare.middlewares.CrawlDezvoltareSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl_dezvoltare.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl_dezvoltare.pipelines.CrawlDezvoltarePipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/spiders/testing.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | item = {'contact': {'addr': u'Apolodor, nr. 17, sector 5', 5 | 'email': u'iulia.matei@mdrap.ro', 6 | 'fax': u'0372.114.569.'}, 7 | 'date': u'22-02-2017', 8 | 'description': u'\xcen temeiul art. 7 din Legea nr. 52/2003 privind transparen\u0163a decizional\u0103 \xeen administra\u0163ia public\u0103, republicat\u0103, Ministerul Dezvolt\u0103rii Regionale, Administra\u0163iei Publice si Fondurilor Europene aduce la cuno\u015ftin\u0163a publicului textul urm\u0103torului proiect de act normativ \u2013 Ordin al viceprim-ministrului, ministrul dezvolt\u0103rii regionale, administra\u0163iei publice \u0219i fondurilor europene pentru aplicarea prevederilor art. III, alin. (11) din Ordonan\u0163a de urgen\u0163\u0103 a Guvernului nr. 63/2010 pentru modificarea \u015fi completarea Legii nr. 
273/2006 privind finan\u0163ele publice locale, precum \u015fi pentru stabilirea unor m\u0103suri financiare.', 9 | 'documents': [{'type': u'Referat de aprobare', 10 | 'url': '/userfiles/referat_ordin_oug63.doc'}], 11 | 'feedback_days': u'10', 12 | 'identifier': u'proiect-de-omdrapfe-pentru-aplicarea-prevederilor-art-iii-alin-11-din-ordonanta-de-urgenta-a-guvernului-nr-632010-pentru-modificarea-si-completarea-legii-nr-2732006-privind-finantele-publice-locale-precum-si-pentru-stabilirea-unor-masuri-financiare-22-02-2017', 13 | 'institution': 'dezvoltare', 14 | 'title': u'Proiect de OMDRAPFE pentru aplicarea prevederilor art. III, alin. (11) din Ordonan\u0163a de urgen\u0163\u0103 a Guvernului nr. 63/2010 pentru modificarea \u015fi completarea Legii nr. 273/2006 privind finan\u0163ele publice locale, precum \u015fi pentru stabilirea unor m\u0103suri financiare ', 15 | 'type': 'OMDRAPFE'} 16 | 17 | r = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token dezvoltare-very-secret-token'}, data=item) 18 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawl_dezvoltare.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawl_dezvoltare 12 | -------------------------------------------------------------------------------- /dezvoltare/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | appnope==0.1.0 3 | asn1crypto==0.21.1 4 | attrs==16.3.0 5 | Automat==0.5.0 6 | backports.shutil-get-terminal-size==1.0.0 7 | beautifulsoup4==4.5.3 8 | cffi==1.9.1 9 | constantly==15.1.0 10 | cryptography==1.8.1 11 | cssselect==1.0.1 12 | decorator==4.0.11 13 | enum34==1.1.6 14 | idna==2.5 15 | incremental==16.10.1 16 | ipaddress==1.0.18 17 | ipython==5.3.0 18 | ipython-genutils==0.1.0 19 | lxml==3.7.3 20 | packaging==16.8 21 | parsel==1.1.0 22 | pathlib2==2.2.1 23 | pexpect==4.2.1 24 | pickleshare==0.7.4 25 | prompt-toolkit==1.0.13 26 | ptyprocess==0.5.1 27 | pyasn1==0.2.3 28 | pyasn1-modules==0.0.8 29 | pycparser==2.17 30 | PyDispatcher==2.0.5 31 | Pygments==2.2.0 32 | pyOpenSSL==17.5.0 33 | pyparsing==2.2.0 34 | queuelib==1.4.2 35 | requests==2.20.0 36 | scandir==1.5 37 | Scrapy==1.3.3 38 | service-identity==16.0.0 39 | simplegeneric==0.8.1 40 | six==1.10.0 41 | slugify==0.0.1 42 | traitlets==4.3.2 43 | Twisted==19.7.0 44 | w3lib==1.17.0 45 | wcwidth==0.1.7 46 | zope.interface==4.3.3 47 | -------------------------------------------------------------------------------- /economie/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset=utf-8 3 | end_of_line=crlf 4 | insert_final_newline=false 5 | indent_style=space 6 | indent_size=4 7 | 8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}] 9 | indent_style=space 10 | indent_size=2 11 | 12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}] 13 | indent_style=space 14 | indent_size=2 15 | 16 | [{*.applejs,*.js}] 17 | indent_style=space 18 | indent_size=4 19 | 20 | [{.analysis_options,*.yml,*.yaml}] 21 | indent_style=space 22 | indent_size=2 23 | 24 | 
-------------------------------------------------------------------------------- /economie/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | secrets.json 4 | data.json -------------------------------------------------------------------------------- /economie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Economiei, Comerțului și Relațiilor cu Mediul de Afaceri 2 | 3 | ## Tehnologie 4 | 5 | JavaScript EcmaScript2015 (ES6) 6 | 7 | 1. nodejs - https://nodejs.org/en/ 8 | 1. nightmare - https://github.com/segmentio/nightmare 9 | 1. cheerio - https://github.com/cheeriojs/cheerio 10 | 1. jsonfile - https://github.com/jprichardson/node-jsonfile 11 | 1. request - https://github.com/request/request 12 | 1. argv - https://github.com/yargs/yargs 13 | 1. diacritics - https://github.com/andrewrk/node-diacritics 14 | 15 | ## Instructiuni 16 | 17 | 1. install nodejs 18 | 1. run `npm update` 19 | 1. run `node app.js`, passing param `--post` will upload to api and also generate a file `data.json` to view data. 20 | 21 | ## Exceptii 22 | 23 | -------------------------------------------------------------------------------- /economie/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "diacritics": "1.3.0", 14 | "jsonfile": "2.4.0", 15 | "nightmare": "2.10.0", 16 | "nodemon": "1.11.0", 17 | "q": "1.4.1", 18 | "request": "^2.81.0", 19 | "yargs": "7.0.2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /economie/secrets.json.txt: -------------------------------------------------------------------------------- 1 | { 2 | "TOKEN": "something something", 3 | "API_URL": "http://something.com/api/post-parsed-results" 4 | } -------------------------------------------------------------------------------- /educatie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Educaţiei Naţionale și Cercetării Științifice 2 | 3 | ## Tehnologie 4 | 5 | Node.js, [nightmare](http://www.nightmarejs.org/) 6 | 7 | ## Instrucțiuni 8 | 9 | ``` 10 | npm install 11 | ``` 12 | 13 | edit config.js, change API token (can also be specified on the command line) and other config vars 14 | 15 | ``` 16 | [API_TOKEN=foobar] npm start 17 | ``` 18 | 19 | ## Excepții 20 | -------------------------------------------------------------------------------- /educatie/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | api: { 3 | url: 'http://czl-api.code4.ro/api/publications/', 4 | token: 'educatie-very-secret-key' 5 | }, 6 | scrape: { 7 | //url of the proposals listing page 8 | baseUrl: 'https://www.edu.ro/proiecte-acte-normative-0', 9 | //how many proposals to consider 10 | proposals: 20, 11 | defaultEmail: 'dgis@edu.gov.ro' 12 | } 13 | }; 14 | -------------------------------------------------------------------------------- /educatie/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "edu-scraper", 3 | "version": "1.0.0", 4 | "description": "Data scraper pentru Ministerul Educatiei", 5 | "main": 
"index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/lbogdan/czl-scrape" 13 | }, 14 | "author": { 15 | "name": "Bogdan Luca", 16 | "email": "luca.bogdan@gmail.com" 17 | }, 18 | "license": "MIT", 19 | "dependencies": { 20 | "diacritics": "^1.3.0", 21 | "jsonfile": "^2.4.0", 22 | "moment": "^2.17.1", 23 | "nightmare": "^2.10.0", 24 | "request-promise": "^4.1.1" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /energie/.gitignore: -------------------------------------------------------------------------------- 1 | # OS files 2 | .DS_Store 3 | 4 | # Java files 5 | *.class 6 | 7 | # Log files 8 | *.log 9 | logs 10 | 11 | # Maven 12 | target 13 | pom.xml.versionsBackup 14 | 15 | # Dropwizard 16 | dependency-reduced-pom.xml 17 | 18 | # Mobile Tools for Java (J2ME) 19 | .mtj.tmp/ 20 | 21 | # Package Files 22 | *.jar 23 | *.war 24 | *.ear 25 | 26 | # IntelliJ IDEA 27 | *.iml 28 | .idea 29 | 30 | # Eclipse 31 | .project 32 | .settings 33 | .classpath 34 | test-output 35 | 36 | # Vim 37 | *.swp 38 | 39 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 40 | hs_err_pid* 41 | 42 | # Misc 43 | *git.properties 44 | 45 | # Asciidoc 46 | .asciidoctor 47 | diag-*.png 48 | -------------------------------------------------------------------------------- /energie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Energiei 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /energie/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | ro.code4.czl 7 | czl-scrape 8 | 0.0.1-SNAPSHOT 9 | ../ 10 | 11 | 12 | czl-scrape-energie 13 | jar 14 | Ce Zice Legea :: Scraper :: Energie 15 | 16 | 17 | 18 | ${project.groupId} 19 | czl-scrape-commons 20 | ${project.version} 21 | 22 | 23 | 24 | org.jsoup 25 | jsoup 26 | 27 | 28 | 29 | 30 | ch.qos.logback 31 | logback-classic 32 | runtime 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /energie/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | scraper-energie.log 6 | 7 | 8 | scraper-energie.%i.log.zip 9 | 1 10 | 3 11 | 12 | 13 | 14 | 500MB 15 | 16 | 17 | 18 | %date{"yyyy-MM-dd'T'HH:mm:ss,SSSXXX", UTC} [%t] %-5level %c{1.} %msg%n 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 2048 28 | 29 | 30 | 31 | 0 32 | 33 | 34 | 35 | false 36 | 37 | 38 | 39 | 40 | %date{"yyyy-MM-dd'T'HH:mm:ss,SSSXXX", UTC} [%t] %-5level %c{1.} %msg%n 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /externe/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Afacerilor Externe 2 | 3 | ## Tehnologie 4 | - Python 3 (developed and tested on 3.5.2) 5 | - BeautifulSoup 4 6 | - Requests 7 | - Click 8 | - **E**xtraordinarily **U**nderwhelming but also **S**uper **E**levated **B**inary **I**nformation **U**nit. 9 | 10 | A.K.A Eusebiu. 
11 | 12 | ## Instructiuni 13 | Pentru a-l convinge pe Eusebiu să ia la mână articolele de pe site-ul MAE, trebuie să: 14 | - Instalezi `python3` si `pip` 15 | - Rulezi `python3 setup.py install` sau cu `sudo` in fata, daca nu ai un virtualenv 16 | - Ca să aflii ce poate Eusebiu să facă pentru umanitate: `python eusebiu.py --help`: 17 | ``` 18 | Options: 19 | --page TEXT Selects the page to scrape. Available options are: 20 | 21 | scrapes the latest articles and falls back to 22 | observer mode 23 | ____________________________________________________ 24 | 25 | scrape the 2016 archive and switch to 26 | observer mode 27 | ____________________________________________________ 28 | 29 | scrape the 2014-2015 archive and switch 30 | to observer mode 31 | ____________________________________________________ 32 | --log_level TEXT Sets the logging level. Available values: ERROR, 33 | WARNING, INFO, DEBUG, 34 | --delay FLOAT Number of hours to wait before checking for changes. 35 | Default=1 36 | --observer Periodically checks for changes and scrapes them if 37 | available. 38 | --help Show this message and exit. 39 | ``` 40 | ## Exceptii 41 | Eusebiu se bazeaza in mare parte pe regex-uri pentru a extrage (silit, sau nu) informatii 42 | de la MAE. 43 | 44 | In cazul in care persoanele responsabil pentru introducerea articolelor in sistem 45 | se decid subit sa foloseasca alte pattern-uri decat cele pe le intelege Eusebiu, scraperul va 46 | genera articole invalide. Daca un articol nu contine toate detalii obligatorii, Eusebiu nu-i va 47 | face POST la API. 48 | -------------------------------------------------------------------------------- /externe/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = '17.03.12' 2 | -------------------------------------------------------------------------------- /externe/eusebiu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import click 3 | import inspect 4 | import logging 5 | import os 6 | 7 | from scraper.article_serializer import ArticleSerializer 8 | from scraper.extractor import * 9 | from utils.api_client import post_data 10 | from utils.settings import * 11 | 12 | 13 | @click.command() 14 | @click.option('--page', default='feed', help=CLICK_HELPER['page']) 15 | @click.option('--log_level', default='INFO', help=CLICK_HELPER['log-level']) 16 | @click.option('--delay', default=1, type=float, help=CLICK_HELPER['delay']) 17 | @click.option('--observer', is_flag=True, default=False, 18 | help=CLICK_HELPER['observer']) 19 | def get_to_work(page, delay, observer, log_level): 20 | # init logging 21 | if log_level not in LOG_LEVELS: 22 | logging.warning('Unrecognized log_level: %s. Defaulting to INFO') 23 | log_level = 'INFO' 24 | 25 | current_dir = os.path.dirname( 26 | os.path.abspath(inspect.getfile(inspect.currentframe())) 27 | ) 28 | logs_dir = current_dir + LOGS_DIR 29 | if not os.path.exists(logs_dir): 30 | os.makedirs(logs_dir) 31 | 32 | logging.basicConfig(filename=LOG_FILE, level=LOG_LEVELS[log_level], 33 | format='%(asctime)s %(levelname)s %(message)s') 34 | 35 | # if observer flag is set, ignore everything else and start eavesdropping 36 | if observer: 37 | shut_up_and_listen(delay) 38 | 39 | # validate page selection 40 | if page not in SCRAPER_PAGES: 41 | logging.error('Page name: %s not recognized. 
See help for available pages', page) 42 | exit() 43 | 44 | # scrape all articles on this page, and dump them on the API 45 | dump_one_of_these(page) 46 | 47 | # then get back to eavesdropping 48 | shut_up_and_listen(delay) 49 | 50 | 51 | def shut_up_and_listen(delay): 52 | """ Eusebiu skillfully lurks in the shadows, waiting for a new article to be posted. 53 | :param delay: int: number of hours to wait before the next tactical strike. 54 | :return: None 55 | """ 56 | current_latest = [] 57 | while True: 58 | feed_extractor = Extractor(settings.URLS.get('feed')) 59 | latest_entries = feed_extractor.get_identifier_list() 60 | logging.debug('latest_entries: %s', latest_entries) 61 | 62 | if not current_latest: 63 | logging.info('Assuming current state of feed is the latest ...') 64 | current_latest = latest_entries[:] 65 | 66 | diff = set(current_latest) - set(latest_entries) 67 | for identifier in diff: 68 | # be polite to the MAE website 69 | time.sleep(0.5) 70 | logging.info('Found new article: %s', identifier) 71 | article = feed_extractor.get_article_by_id(identifier) 72 | diff.remove(article.identifier) 73 | post_article(article) 74 | 75 | logging.info('ETA until next scrape: %s hour(s)', delay) 76 | time.sleep(hours_to_sec(delay)) 77 | 78 | 79 | def dump_one_of_these(page): 80 | """ 81 | Eusebiu masterfully extracts all the articles on a given page, and swiftly dumps 82 | them onto the API. 83 | :param page: the page to eviscerate. 84 | :return: None 85 | """ 86 | extractor = Extractor(settings.URLS.get(page)) 87 | articles = extractor.get_all_articles() 88 | for article in articles: 89 | # be polite to the API 90 | time.sleep(0.5) 91 | post_article(article) 92 | 93 | 94 | def post_article(article): 95 | """Attempts to POST and article to the API. 96 | :param article: the object to POST. 97 | :return: True if successful, False otherwise. 98 | """ 99 | if not ArticleSerializer().is_valid(article): 100 | logging.error('Invalid article: %s \n WILL NOT POST TO API', article) 101 | return False 102 | data = ArticleSerializer().serialize(article) 103 | return post_data(data) 104 | 105 | 106 | if __name__ == '__main__': 107 | get_to_work() 108 | -------------------------------------------------------------------------------- /externe/scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/externe/scraper/__init__.py -------------------------------------------------------------------------------- /externe/scraper/article_serializer.py: -------------------------------------------------------------------------------- 1 | from utils import settings 2 | 3 | 4 | class ArticleSerializer: 5 | @staticmethod 6 | def serialize(article): 7 | return dict( 8 | # TODO 9 | identifier=article.identifier, 10 | title=article.title, 11 | type=article.article_type, 12 | institution=settings.INSTITUTION, 13 | date=article.published_at.isoformat(), 14 | description='N\A', 15 | feedback_days=article.feedback_days, 16 | contact=article.contact, 17 | documents=article.documents, 18 | ) 19 | 20 | @staticmethod 21 | def is_valid(article): 22 | """Checks if an Article is valid, according to the API specs. 
23 | :param article: The Article instance to validate 24 | :return: True or False 25 | """ 26 | for field in settings.MANDATORY_FIELDS: 27 | if not getattr(article, field): 28 | return False 29 | return True 30 | -------------------------------------------------------------------------------- /externe/scraper/extractor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup as beautiful_soup 3 | 4 | import utils.settings as settings 5 | from scraper.article import Article 6 | 7 | 8 | class Extractor: 9 | """Extractor object, responsible for fetching data from the MAE website. 10 | """ 11 | url = None 12 | content = None 13 | articles = None 14 | 15 | def __init__(self, url): 16 | self.url = url 17 | self.content = self._fetch_page() 18 | 19 | def get_all_articles(self): 20 | """Generates a list of all Article objects fetched from MAE. 21 | :return: the list of Articles 22 | """ 23 | self.articles = [Article(table) for table in self._get_tables()] 24 | return self.articles 25 | 26 | def get_article_by_id(self, identifier): 27 | """Returns the article matching the given identifier. 28 | :param identifier: the id 29 | :return: the matching Article, or None 30 | """ 31 | if not self.articles: 32 | self.get_all_articles() 33 | 34 | for a in self.articles: 35 | if a.identifier == identifier: 36 | return a 37 | 38 | def get_identifier_list(self): 39 | """Extracts a list of identifiers of the latest articles. 40 | :return: list 41 | """ 42 | latest = [] 43 | for table in self._get_tables(): 44 | tr = table.select('tr') 45 | article = Article() 46 | article._extract_article_type(tr) 47 | article._extract_title(tr) 48 | article._generate_id() 49 | latest.append(article.identifier) 50 | return latest 51 | 52 | def _fetch_page(self): 53 | page = requests.get(self.url, headers=settings.HEADERS) 54 | return beautiful_soup(page.text, 'html.parser') 55 | 56 | def _get_tables(self): 57 | return self.content.select_one('div.art').select('table') 58 | -------------------------------------------------------------------------------- /externe/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import re 3 | from setuptools import setup 4 | 5 | install_requires = [ 6 | 'beautifulsoup4', 7 | 'requests', 8 | 'click', 9 | 'lxml' 10 | ] 11 | 12 | version_regex = re.compile("VERSION\s*=\s*'(.*?)'$") 13 | 14 | with open('__init__.py') as stream: 15 | VERSION = version_regex.search(stream.read()).group(1) 16 | 17 | setup( 18 | version=VERSION, 19 | name='mae-scraper', 20 | url='https://github.com/code4romania/czl-scrape/tree/master/externe', 21 | author='Rares Urdea, Alexandru Hodorogea', 22 | author_email='contact@code4.ro', 23 | description='Scraper pentru site-ul Ministerului de Afaceri Externe', 24 | install_requires=install_requires, 25 | ) 26 | -------------------------------------------------------------------------------- /externe/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/externe/utils/__init__.py -------------------------------------------------------------------------------- /externe/utils/api_client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | import time 4 | 5 | from utils.settings import * 6 | 7 | 8 | def post_data(data): 9 
| attempts = 5 10 | success = False 11 | 12 | while not success and attempts > 0: 13 | attempts -= 1 14 | response = requests.post(URLS['api-publications'], data, headers=HEADERS) 15 | 16 | if _already_exists(response): 17 | logging.warning( 18 | 'Object: %s \nalready exists, according to API. Skipping.', data 19 | ) 20 | break 21 | 22 | success = response.status_code == STATUS_CREATED 23 | if success: 24 | break 25 | time.sleep(30) 26 | 27 | if not success: 28 | logging.error('Failed to POST data to API: %s', data) 29 | 30 | return success 31 | 32 | 33 | def _already_exists(response): 34 | return response.status_code == STATUS_BAD_REQUEST \ 35 | and ALREADY_EXISTS in response.text.lower() 36 | -------------------------------------------------------------------------------- /externe/utils/lang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | class LangHelper(object): 3 | FUCK_NO = [ 4 | # new line 5 | '\n', 6 | # tab 7 | '\t', 8 | # non-breaking space 9 | '\xa0', 10 | # 0 width space 11 | '\u200b' 12 | ] 13 | 14 | @staticmethod 15 | def englishize_romanian(string): 16 | symbols = (u"țţȚŢșşȘŞăĂîÎâÂ", 17 | u"ttTTssSSaAiIaA") 18 | 19 | tr = {ord(a): ord(b) for a, b in zip(*symbols)} 20 | return string.translate(tr) 21 | 22 | @staticmethod 23 | def beautify_romanian(string): 24 | symbols = (u"ţşŢŞ", 25 | u"țșȚȘ") 26 | tr = {ord(a): ord(b) for a, b in zip(*symbols)} 27 | return string.translate(tr) 28 | 29 | @staticmethod 30 | def sanitize(string): 31 | """Sanitize a string. 32 | Removes new lines and 0 width spaces, because fuck those. 33 | 34 | :param string: The string to sanitize. 35 | :return: A clean string. 36 | """ 37 | if string: 38 | for this_little_shit in LangHelper.FUCK_NO: 39 | string = string.replace(this_little_shit, '') 40 | return string 41 | -------------------------------------------------------------------------------- /externe/utils/settings.py: -------------------------------------------------------------------------------- 1 | WAIT = { 2 | '1_sec': 1, 3 | '0.5_sec': 0.5 4 | } 5 | 6 | HEADERS = { 7 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) ' 8 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 9 | 'Chrome/39.0.2171.95 Safari/537.36', 10 | 'Authorization': 'Token externe-very-secret-key' 11 | } 12 | 13 | SCRAPER_PAGES = [ 14 | 'arhiva-1415', 15 | 'arhiva-2016', 16 | 'feed' 17 | ] 18 | 19 | # The keys linking to MAE pages need to match the items in SCRAPER_PAGES 20 | URLS = { 21 | 'mae_base': 'http://www.mae.ro', 22 | 'feed': 'https://www.mae.ro/node/2011#null', 23 | 'arhiva-2016': 'http://www.mae.ro/node/40248', 24 | 'arhiva-1415': 'http://www.mae.ro/node/35609', 25 | 'api-publications': 'http://czl-api.code4.ro/api/publications/' 26 | } 27 | 28 | STATUS_CREATED = 201 29 | STATUS_BAD_REQUEST = 400 30 | ALREADY_EXISTS = 'already exists' 31 | 32 | TYPES = { 33 | 'HOTARARE': 'HG', 34 | 'ORDONANTA': 'OG', 35 | 'ORDONANTA DE URGENTA': 'OUG', 36 | 'ORDINUL MINISTRULUI AFACERILOR EXTERNE': 'OM', 37 | 'ORDIN': 'OM', 38 | 'PROIECT DE LEGE': 'LEGE', 39 | 'LEGE': 'LEGE', 40 | 'OTHER': 'OTHER' 41 | } 42 | 43 | MONTHS = dict( 44 | ianuarie='01', 45 | februarie='02', 46 | martie='03', 47 | aprilie='04', 48 | mai='05', 49 | iunie='06', 50 | iulie='07', 51 | august='08', 52 | septembrie='09', 53 | octombrie='10', 54 | noiembrie='11', 55 | decembrie='12' 56 | ) 57 | 58 | CLICK_HELPER = { 59 | 60 | 'log-level': '\b Sets the logging level. 
Available values: ERROR, WARNING, INFO, DEBUG,', 61 | 'page': """ 62 | \b Selects the page to scrape. Available options are: 63 | \b scrapes the latest articles and falls back to observer mode 64 | ____________________________________________________ 65 | \b scrape the 2016 archive and switch to observer mode 66 | ____________________________________________________ 67 | \b scrape the 2014-2015 archive and switch 68 | to observer mode 69 | ____________________________________________________ 70 | """, 71 | 'observer': 'Periodically checks for changes and scrapes them if available. ' 72 | 'NOTE: in observer mode, any argument is ignored.', 73 | 'delay': 'Number of hours to wait before checking for changes. Default=1' 74 | } 75 | 76 | LOG_LEVELS = { 77 | 'ERROR': 40, 78 | 'WARNING': 30, 79 | 'INFO': 20, 80 | 'DEBUG': 10 81 | } 82 | 83 | LOG_FILE = 'logs/scraper.log' 84 | LOGS_DIR = '/logs' 85 | 86 | INSTITUTION = 'externe' 87 | 88 | MANDATORY_FIELDS = ['identifier', 'title', 'published_at', 'article_type'] 89 | 90 | DATE_FMT = '%Y-%m-%d' 91 | 92 | 93 | def hours_to_sec(hours): 94 | return hours * 3600 95 | -------------------------------------------------------------------------------- /finantepub/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | -------------------------------------------------------------------------------- /finantepub/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Finanţelor Publice 2 | 3 | ## Tehnologie 4 | NodeJS, [Nightmare](http://www.nightmarejs.org) 5 | 6 | ## Instructiuni 7 | ``` 8 | npm install 9 | API_TOKEN=the_secret_api_token npm start 10 | ``` 11 | 12 | ## Exceptii 13 | -------------------------------------------------------------------------------- /finantepub/index.js: -------------------------------------------------------------------------------- 1 | const sha256 = require('sha256'); 2 | const rp = require('request-promise'); 3 | const Nightmare = require('nightmare'); 4 | const nightmare = Nightmare({ show: false, typeInterval: 2, waitTimeout: 5000 }); 5 | 6 | const YEAR_THRESHOLD = 2017; 7 | 8 | const API_TOKEN = process.env['API_TOKEN']; 9 | 10 | function guessType(text) { 11 | text = text.toLowerCase().trim(); 12 | text = text.replace(/^proiect\s*/, ''); 13 | if(text.match(/^ordonanță de urgență/)) return 'OUG'; 14 | if(text.match(/^lege/)) return 'LEGE'; 15 | if(text.match(/^ordin/)) return 'OG'; 16 | if(text.match(/^hotărâre/)) return 'HG'; 17 | throw new Error(`failz: ${text}`); 18 | } 19 | 20 | function parsePage(page = 1) { 21 | nightmare 22 | .cookies.clear() 23 | .useragent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.${Math.round(Math.random()*100)}`) 24 | .goto(`http://www.mfinante.gov.ro/transparent.html?method=transparenta&pagina=acasa&locale=ro&d-6834775-p=${page}`) 25 | .wait("#transparentaList") 26 | .evaluate(()=> { 27 | if(document.querySelector('#transparentaList').innerText.trim() == '') return; 28 | let itemsList = [], items = [... document.querySelectorAll('#transparentaList > tbody > tr ')]; 29 | for (let item of items) { 30 | let text = item.innerText; 31 | let match = text.replace(/\s+/g, ' ').match( 32 | /(.*?)\s*- publicat în data de\s*(\d{2})\.(\d{2})\.(\d{4})/); 33 | 34 | if(! 
match) { 35 | throw new Error(`Can't match title and date in text: "${text}"`); 36 | } 37 | 38 | let documents = [] 39 | let links = item.querySelectorAll('a.downlPDF'); 40 | for (let doc of links) { 41 | documents.push({ 42 | type: 'act', 43 | url: doc.href 44 | }); 45 | } 46 | 47 | let returnObj = { 48 | title: match[1], 49 | date: `${match[4]}-${match[3]}-${match[2]}`, 50 | documents: documents, 51 | label: links[0].innerText 52 | }; 53 | 54 | itemsList.push(returnObj); 55 | } 56 | return itemsList; 57 | }) 58 | .then((result) => { 59 | 60 | if(! result) { 61 | console.log("halt!"); 62 | nightmare.halt(); 63 | return; 64 | } 65 | 66 | let itemsList = []; 67 | 68 | for(let val of result) { 69 | let year = val.date.split('-')[0] 70 | if (year < YEAR_THRESHOLD) { 71 | console.log("halt!"); 72 | nightmare.halt(); 73 | return; 74 | } 75 | 76 | val.identifier = sha256(val.documents[0].url); 77 | val.institution = 'finantepub'; 78 | val.description = ''; 79 | val.type = guessType(val.label); 80 | delete val.label; 81 | itemsList.push(val); 82 | } 83 | 84 | function postAllItems(remaining) { 85 | if(! remaining.length) return; 86 | let val = remaining[0]; 87 | return rp.post({ 88 | url: 'http://czl-api.code4.ro/api/publications/', 89 | headers: {Authorization: `Token ${API_TOKEN}`}, 90 | json: val 91 | }) 92 | .then(() => { 93 | console.log('posted item: ', val.identifier); 94 | return postAllItems(remaining.slice(1)); 95 | }); 96 | } 97 | 98 | return postAllItems(itemsList); 99 | 100 | }) 101 | .then(() => { 102 | parsePage(page + 1); 103 | }) 104 | .catch((error) => { 105 | console.error('error:', error); 106 | nightmare.halt(); 107 | }); 108 | } 109 | 110 | parsePage(); 111 | -------------------------------------------------------------------------------- /finantepub/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "finantepub", 3 | "version": "1.0.0", 4 | "description": "## Tehnologie", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/mgax/czl-scrape.git" 12 | }, 13 | "author": "ciprian chichirita, alex morega", 14 | "license": "MIT", 15 | "devDependencies": { 16 | "nightmare": "^2.10.0", 17 | "request-promise": "^4.1.1", 18 | "sha256": "^0.2.0" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /interne/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset=utf-8 3 | end_of_line=crlf 4 | insert_final_newline=false 5 | indent_style=space 6 | indent_size=4 7 | 8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}] 9 | indent_style=space 10 | indent_size=2 11 | 12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}] 13 | indent_style=space 14 | indent_size=2 15 | 16 | [{*.applejs,*.js}] 17 | indent_style=space 18 | indent_size=4 19 | 20 | [{.analysis_options,*.yml,*.yaml}] 21 | indent_style=space 22 | indent_size=2 23 | 24 | -------------------------------------------------------------------------------- /interne/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | secrets.json 4 | data.json -------------------------------------------------------------------------------- /interne/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul 
Afacerilor Interne 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /interne/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "diacritics": "1.3.0", 14 | "jsonfile": "2.4.0", 15 | "nightmare": "2.10.0", 16 | "nodemon": "1.11.0", 17 | "q": "1.4.1", 18 | "request": "2.81.0", 19 | "yargs": "7.0.2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /interne/secrets.json.txt: -------------------------------------------------------------------------------- 1 | { 2 | "TOKEN": "something something", 3 | "API_URL": "http://something.com/api/post-parsed-results" 4 | } -------------------------------------------------------------------------------- /justitie/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .scrapy 3 | 4 | *.pyc 5 | __pycache__ 6 | **/__pycache__ 7 | -------------------------------------------------------------------------------- /justitie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Justiţiei 2 | 3 | ## Tehnologie 4 | 5 | *Python 3.6*, [virtualenv](https://virtualenv.pypa.io/) e un prieten bun 6 | [Scrapy](https://scrapy.org/) 7 | 8 | ``` 9 | pip install -r requirements.txt 10 | 11 | # on windows: 12 | pip install win32api 13 | ``` 14 | 15 | ## Instructiuni 16 | 17 | ``` 18 | scrapy crawl publication 19 | ``` 20 | 21 | ## Altele 22 | 23 | Data understading & values 24 | * [online](https://etherpad.net/p/hackajust) 25 | * see doc folder 26 | -------------------------------------------------------------------------------- /justitie/just/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/justitie/just/__init__.py -------------------------------------------------------------------------------- /justitie/just/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class JustPublication(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | identifier = scrapy.Field() 14 | title = scrapy.Field() 15 | type = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | documents = scrapy.Field() 22 | 23 | pass 24 | 25 | -------------------------------------------------------------------------------- /justitie/just/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JustSpiderMiddleware(object): 12 | # Not all methods 
need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /justitie/just/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import requests 9 | import json 10 | import logging 11 | 12 | from just.items import JustPublication 13 | import logging 14 | 15 | API_KEY = 'justitie-very-secret-key' 16 | API_PUBLICATIONS = 'http://czl-api.code4.ro/api/publications/' 17 | 18 | class JustPublicationsToApiPipeline(object): 19 | def process_item(self, item, spider): 20 | 21 | if type(item) != JustPublication: 22 | return item 23 | 24 | r = requests.post(API_PUBLICATIONS, json=dict(item), headers={'Authorization': 'Token %s' % (API_KEY,) } ) 25 | 26 | 27 | if r.status_code == 200 or r.status_code == '200': 28 | logging.log(msg=r.status_code, level=logging.INFO) 29 | else: 30 | logging.log(msg=r.status_code, level=logging.ERROR) 31 | logging.log(msg=r.content, level=logging.INFO) 32 | 33 | return item 34 | -------------------------------------------------------------------------------- /justitie/just/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for just project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'just' 13 | 14 | SPIDER_MODULES = ['just.spiders'] 15 | NEWSPIDER_MODULE = 'just.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | USER_AGENT = 'code4romania (+http://www.code4.ro)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | LOG_ENABLED = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 5 32 | # The download delay setting will honor only one of: 33 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'just.middlewares.JustSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'just.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.AutoThrottle': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'just.pipelines.JustPublicationsToApiPipeline': 100, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | # AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | # AUTOTHROTTLE_MAX_DELAY = 30 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0 82 | # Enable showing throttling stats for every response received: 83 | # AUTOTHROTTLE_DEBUG = True 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | HTTPCACHE_ENABLED = True 88 | HTTPCACHE_EXPIRATION_SECS = 30 89 | HTTPCACHE_DIR = 'httpcache' 90 | HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /justitie/just/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /justitie/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | asn1crypto==0.21.1 3 | attrs==16.3.0 4 | Automat==0.5.0 5 | cffi==1.9.1 6 | constantly==15.1.0 7 | convertdate==2.1.0 8 | cryptography==1.8.1 9 | cssselect==1.0.1 10 | ephem==3.7.6.0 11 | idna==2.5 12 | incremental==16.10.1 13 | jdatetime==1.8.2 14 | lxml==3.7.3 15 | packaging==16.8 16 | parsel==1.1.0 17 | pyasn1==0.2.3 18 | pyasn1-modules==0.0.8 19 | pycparser==2.17 20 | PyDispatcher==2.0.5 21 | pyOpenSSL==17.5.0 22 | pyparsing==2.2.0 23 | pytz==2016.10 24 | queuelib==1.4.2 25 | regex==2017.2.8 26 | ruamel.yaml==0.13.14 27 | Scrapy==1.3.3 28 | service-identity==16.0.0 29 | six==1.10.0 30 | Twisted==19.7.0 31 | tzlocal==1.3 32 | umalqurra==0.2 33 | Unidecode==0.4.20 34 | w3lib==1.17.0 35 | zope.interface==4.3.3 36 | -------------------------------------------------------------------------------- /justitie/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = just.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = just 12 | -------------------------------------------------------------------------------- /mediu/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | .DS_Store 4 | *.egg-info 5 | build 6 | *.pyc 7 | **/*.pyc 8 | dbs 9 | -------------------------------------------------------------------------------- /mediu/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Mediului 2 | 3 | ## Tehnologie 4 | 5 | *Python 2.7* 6 | [Scrapy 1.3.3](https://scrapy.org/) 7 | 8 | ## Instructiuni 9 | 10 | ``` 11 | pip install -r requirements.txt 12 | cd crawl_mediu 13 | scrapy crawl mmediu -a token=xxxx 14 | ``` 15 | 16 | 17 | ## Exceptii -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/mediu/crawl_mediu/crawl_mediu/__init__.py -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawlMediuItem(scrapy.Item): 12 | identifier = scrapy.Field() 13 | title = scrapy.Field() 14 | type = scrapy.Field() 15 | institution = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | tel = scrapy.Field() 22 | email = scrapy.Field() 23 
| documents = scrapy.Field() -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlMediuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import requests 8 | 9 | class CrawlMediuPipeline(object): 10 | def process_item(self, item, spider): 11 | doc = { 12 | 'identifier': item['identifier'], 13 | 'title': item['title'], 14 | 'institution': item['institution'], 15 | 'description': item['description'], 16 | 'type': item['type'], 17 | 'date': item['date'], 18 | 'documents': item['documents'], 19 | 'contact':item['contact'], 20 | 'feedback_days': item['feedback_days'] 21 | } 22 | 23 | response = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token ' + spider.token }, json=doc) 24 | # print '---------' 25 | # print response 26 | # print response.text 27 | # print '---------' 28 | return item 29 | 30 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl_mediu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl_mediu' 13 | 14 | SPIDER_MODULES = ['crawl_mediu.spiders'] 15 | NEWSPIDER_MODULE = 'crawl_mediu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'crawl_mediu (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl_mediu.middlewares.CrawlMediuSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | 
#DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl_mediu.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl_mediu.pipelines.CrawlMediuPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawl_mediu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawl_mediu 12 | -------------------------------------------------------------------------------- /mediu/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | appnope==0.1.0 3 | asn1crypto==0.21.1 4 | attrs==16.3.0 5 | Automat==0.5.0 6 | backports.shutil-get-terminal-size==1.0.0 7 | beautifulsoup4==4.5.3 8 | cffi==1.9.1 9 | constantly==15.1.0 10 | cryptography==1.8.1 11 | cssselect==1.0.1 12 | decorator==4.0.11 13 | enum34==1.1.6 14 | idna==2.5 15 | incremental==16.10.1 16 | ipaddress==1.0.18 17 | ipython==5.3.0 18 | ipython-genutils==0.1.0 19 | lxml==3.7.3 20 | packaging==16.8 21 | parsel==1.1.0 22 | pathlib2==2.2.1 23 | pexpect==4.2.1 24 | pickleshare==0.7.4 25 | prompt-toolkit==1.0.13 26 | ptyprocess==0.5.1 27 | pyasn1==0.2.3 28 | pyasn1-modules==0.0.8 29 | pycparser==2.17 30 | PyDispatcher==2.0.5 31 | Pygments==2.2.0 32 | pyOpenSSL==16.2.0 33 | pyparsing==2.2.0 34 | queuelib==1.4.2 35 | requests==2.13.0 36 | scandir==1.5 37 | Scrapy==1.3.3 38 | service-identity==16.0.0 39 | simplegeneric==0.8.1 40 | six==1.10.0 41 | slugify==0.0.1 42 | traitlets==4.3.2 43 | Twisted==17.1.0 44 | w3lib==1.17.0 45 | wcwidth==0.1.7 46 | zope.interface==4.3.3 47 | -------------------------------------------------------------------------------- /presedinte/README.md: -------------------------------------------------------------------------------- 1 | # Presedintia 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /pretutindeni/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | parseProject.js -------------------------------------------------------------------------------- /pretutindeni/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul pentru Românii de Pretutindeni 2 | 3 | 1. http://www.dprp.gov.ro/documente-in-consultare-publica/ 4 | 5 | ## Tehnologie 6 | 7 | 1. nodejs - https://nodejs.org/en/ 8 | 2. nightmarejs - https://github.com/segmentio/nightmare 9 | 10 | ## Instructiuni 11 | 12 | ## Exceptii 13 | 14 | Oamenii care updateaza chestia asta sunt exceptii. Paragrafe fara structura aruncate pur si simplu acolo. Foarte dificil de parsat. 
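Since the page is just unstructured paragraphs, a reasonable starting point is to dump its raw text and inspect it by hand. The sketch below is illustrative only — it is not the project's actual crawler — and assumes nothing beyond the `nightmare` dependency already listed above and the URL from this README.

```js
// Illustrative sketch: fetch the consultation page with Nightmare and print its
// raw text, since the paragraphs have no reliable structure to parse directly.
const Nightmare = require('nightmare');
const nightmare = Nightmare({ show: false });

nightmare
  .goto('http://www.dprp.gov.ro/documente-in-consultare-publica/')
  .evaluate(() => document.body.innerText)
  .end()
  .then((text) => console.log(text))
  .catch((err) => console.error('scrape failed:', err));
```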
-------------------------------------------------------------------------------- /pretutindeni/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "nightmare": "2.10.0" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pretutindeni/parseProject.example: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio') 2 | 3 | module.exports = function(project) { 4 | "use strict"; 5 | 6 | console.log(project); 7 | }; -------------------------------------------------------------------------------- /relparlament/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul pentru Relaţia cu Parlamentul 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /relparlament/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "relparlament", 3 | "version": "1.0.0", 4 | "description": "## Tehnologie", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "author": "Mihnea Beldescu", 11 | "license": "ISC", 12 | "dependencies": { 13 | "cheerio": "^0.22.0", 14 | "lokijs": "^1.4.3", 15 | "nightmare": "^2.10.0", 16 | "request": "^2.81.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /sanatate/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | -------------------------------------------------------------------------------- /sanatate/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Sănătăţii 2 | Crawler simplu, de la țară, făcut cu scrapy. Nu știe bine românește, dar înțelege oricum (face fuzzy matching pe titluri ca să scoată tipul de act normativ). 3 | ## Tehnologie 4 | - python3, pip 5 | - scrapy, fuzzywuzzy, urllib3 6 | - python-Levenshtein [opțional] 7 | 8 | ## Instructiuni 9 | Bagi chestii în _credentials.json_, după care un clasic _pip install -r requirements.txt_ și un clasic _scrapy crawl sanatate_. 10 | ## Exceptii 11 | Detectarea tipului de act normativ nu e perfectă, și nici a tipului de documente. Asta e o problemă mai mare, și nu are sens să o tratăm doar într-un singur crawler. 
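The fuzzy matching mentioned above boils down to scoring a few act-type keywords against the beginning of the title. The sketch below is illustrative only; the real logic (including Romanian-to-ASCII normalization) lives in `scrapy_proj/helpers/legal.py`.

```python
# Illustrative sketch of the fuzzy title -> act-type matching; the actual
# implementation is LegalHelper.get_type_from_title in scrapy_proj/helpers/legal.py.
import re
import fuzzywuzzy.fuzz as fuzz

TYPE_KEYWORDS = {
    'HG': 'hotarare',
    'OM': 'ordin',
    'LEGE': 'lege',
    'OG': 'ordonanta',
    'OUG': 'ordonanta de urgenta',
}

def guess_type(title):
    title = title.lower()
    # Only the part before "pentru"/"privind" usually names the act type.
    cut = re.search(r'(pentru|privind)', title)
    prefix = title[:cut.start()] if cut else title
    # Pick the keyword with the highest similarity score (0-100).
    return max(TYPE_KEYWORDS, key=lambda key: fuzz.ratio(TYPE_KEYWORDS[key], prefix))

# guess_type('hotarare privind aprobarea normelor ...')  ->  'HG' (typically)
```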
12 | -------------------------------------------------------------------------------- /sanatate/credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "endpoint": "http://czl-api.code4.ro/api/publications/", 3 | "authorization": "weeee" 4 | } -------------------------------------------------------------------------------- /sanatate/requirements.txt: -------------------------------------------------------------------------------- 1 | fuzzywuzzy==0.15.0 2 | python-Levenshtein==0.12.0 3 | Scrapy==1.3.3 4 | urllib3==1.20 5 | -------------------------------------------------------------------------------- /sanatate/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapy_proj.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapy_proj 12 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/sanatate/scrapy_proj/__init__.py -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.helpers.legal import * 4 | from scrapy_proj.helpers.romanian import * 5 | from scrapy_proj.helpers.text import * 6 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/legal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import fuzzywuzzy.fuzz as fuzz 5 | 6 | from scrapy_proj.helpers.romanian import * 7 | 8 | class LegalHelper(object): 9 | @staticmethod 10 | def get_type_from_title(title): 11 | engrol = RomanianHelper.englishize_romanian(title).lower() 12 | 13 | stop_pos = len(title) 14 | magic_keyword_search_result = re.search(r'(pentru|privind)', engrol) 15 | if magic_keyword_search_result != None: 16 | stop_pos = magic_keyword_search_result.start() 17 | 18 | search_space = engrol[:stop_pos] 19 | 20 | type_to_keywords = { 21 | 'HG': 'hotarare', 22 | 'OM': 'ordin', 23 | 'LEGE': 'lege', 24 | 'OG': 'ordonanta', 25 | 'OUG': 'ordonanta de urgenta' 26 | } 27 | 28 | final_type = None 29 | max_ratio = 0 30 | 31 | for key in type_to_keywords: 32 | ratio = fuzz.ratio(type_to_keywords[key], search_space) 33 | if ratio > max_ratio: 34 | max_ratio = ratio 35 | final_type = key 36 | 37 | return final_type 38 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/romanian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class RomanianHelper(object): 4 | @staticmethod 5 | def englishize_romanian(string): 6 | symbols = (u"țţȚŢșşȘŞăǎĂîÎâÂ", 7 | u"ttTTssSSaaAiIaA") 8 | 9 | tr = {ord(a):ord(b) for a, b in zip(*symbols)} 10 | 11 | return string.translate(tr) 12 | 13 | @staticmethod 14 | def beautify_romanian(string): 15 | symbols = (u"ǎţşŢŞ", 16 | u"ățșȚȘ") 17 | tr = {ord(a):ord(b) for a, b in 
zip(*symbols)} 18 | return string.translate(tr) 19 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | class TextHelper(object): 6 | 7 | @staticmethod 8 | def remove_non_ascii(string): 9 | return re.sub(r'[^\x00-\x7F]+', ' ', string) 10 | 11 | @staticmethod 12 | def remove_non_numeric(string): 13 | return re.sub('[^0-9]+', '', string) 14 | 15 | @staticmethod 16 | def rws(str): 17 | if str: 18 | return ' '.join(str.split()) 19 | else: 20 | return None 21 | 22 | @staticmethod 23 | def titleize(string): 24 | if string: 25 | return string.title() 26 | else: 27 | return None 28 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/items/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.items.act import * 4 | from scrapy_proj.items.contact import * 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/items/act.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | class ActItem(scrapy.Item): 6 | identifier = scrapy.Field() 7 | title = scrapy.Field(serializer=str) 8 | type = scrapy.Field() 9 | institution = scrapy.Field() 10 | date = scrapy.Field() 11 | description = scrapy.Field() 12 | feedback_days = scrapy.Field(serializer=int) 13 | contact = scrapy.Field() 14 | documents = scrapy.Field() 15 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/items/contact.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | class ContactItem(scrapy.Item): 6 | tel = scrapy.Field(serializer=str) 7 | email = scrapy.Field(serializer=str) 8 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.loaders.act import * 4 | from scrapy_proj.loaders.contact import * 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/loaders/act.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy_proj.helpers import * 5 | from scrapy.loader.processors import * 6 | from datetime import datetime as dt 7 | 8 | class ActLoader(ItemLoader): 9 | default_output_processor = TakeFirst() 10 | title_in = MapCompose(TextHelper.rws, RomanianHelper.beautify_romanian) 11 | contact_in = Compose(TakeFirst(), lambda x: dict(x)) 12 | date_in = MapCompose(lambda d: dt.strptime(d, '%d-%m-%Y').strftime('%Y-%m-%d')) 13 | feedback_days_in = MapCompose(int) 14 | documents_in = Identity() 15 | documents_out = Identity() 16 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/loaders/contact.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy_proj.helpers import * 5 | from 
scrapy.loader.processors import * 6 | 7 | class ContactLoader(ItemLoader): 8 | default_output_processor = TakeFirst() 9 | email_in = MapCompose(str.lower) 10 | tel_in = MapCompose(TextHelper.remove_non_numeric) 11 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.pipelines.extrameta import * 4 | from scrapy_proj.pipelines.post import * 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/pipelines/extrameta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import datetime 4 | import hashlib 5 | 6 | from scrapy_proj.helpers import * 7 | 8 | class SanatatePipelineExtraMeta(object): 9 | def process_item(self, item, spider): 10 | item['institution'] = spider.name 11 | act_type = LegalHelper.get_type_from_title(item['title']) 12 | if act_type == None: 13 | raise scrapy.exceptions.DropItem 14 | item['type'] = act_type 15 | engrol = RomanianHelper.englishize_romanian(item['title']).lower() 16 | engrolna = TextHelper.remove_non_ascii(engrol) 17 | identifier_text = '{0} {1}'.format(engrolna, item['date'] if 'date' in item else 'NA') 18 | identifier_text_hashed = hashlib.md5(identifier_text.encode()).hexdigest() 19 | item['identifier'] = '{0}-{1}-{2}'.format(item['institution'], item['type'], identifier_text_hashed) 20 | return item 21 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/pipelines/post.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import urllib3 5 | 6 | class SanatatePipelinePost(object): 7 | def open_spider(self, spider): 8 | with open('credentials.json') as credentials_file: 9 | self.credentials = json.load(credentials_file) 10 | def process_item(self, item, spider): 11 | http = urllib3.PoolManager() 12 | r = http.request( 13 | 'POST', 14 | self.credentials['endpoint'], 15 | headers={ 16 | 'Content-Type': 'application/json', 17 | 'Authorization': self.credentials['authorization'] 18 | }, 19 | body=json.dumps(dict(item)) 20 | ) 21 | return item 22 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for scrapy_proj project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'scrapy_proj' 13 | 14 | SPIDER_MODULES = ['scrapy_proj.spiders'] 15 | NEWSPIDER_MODULE = 'scrapy_proj.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'scrapy_proj (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'scrapy_proj.middlewares.ScrapyProjSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'scrapy_proj.middlewares.MyCustomDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'scrapy_proj.pipelines.SanatatePipelineExtraMeta': 298, 68 | 'scrapy_proj.pipelines.SanatatePipelinePost': 299, 69 | } 70 | 71 | LOG_LEVEL = 'WARNING' 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
-------------------------------------------------------------------------------- /sanatate/scrapy_proj/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/spiders/sanatate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | import scrapy_proj.items as items 5 | import scrapy_proj.loaders as loaders 6 | import re 7 | import sys 8 | 9 | class SanatateSpider(scrapy.Spider): 10 | name = 'sanatate' 11 | 12 | def start_requests(self): 13 | urls = [ 14 | 'http://www.ms.ro/acte-normative-in-transparenta/?vpage=2', 15 | ] 16 | 17 | for url in urls: 18 | yield scrapy.Request(url=url, callback=self.parse) 19 | 20 | def parse(self, response): 21 | date_regex = re.compile('de\s+la\s+(\d{1,2}[-/]\d{2}[-/]\d{4})') 22 | email_regex = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+') 23 | tel_regex = re.compile(r'[^0-9](0(?:[0-9].?){9})') 24 | feedback_days_regex = re.compile(r'termen.*limita.*[^[0-9]]*([0-9]{1,2}).*zi') 25 | 26 | for item in response.css('.panel'): 27 | heading = item.css('div.panel-heading') 28 | body = item.css('div.panel-body') 29 | body_text = ''.join(body.xpath('.//text()').extract()).lower() 30 | 31 | title = item.css('a.panel-title::text').extract_first() 32 | 33 | loader = loaders.ActLoader(items.ActItem()) 34 | loader.add_value('title', title) 35 | 36 | contact_loader = loaders.ContactLoader(items.ContactItem()) 37 | contact_loader.add_value('tel', tel_regex.findall(body_text)) 38 | contact_loader.add_value('email', email_regex.findall(body_text)) 39 | loader.add_value('contact', contact_loader.load_item()) 40 | loader.add_value('date', date_regex.findall(body_text)) 41 | loader.add_value('feedback_days', feedback_days_regex.findall(body_text)) 42 | 43 | keys = ['type', 'url'] 44 | types = body.xpath('.//a[contains(@href, ".pdf")]').xpath('text()').extract() 45 | urls = body.xpath('.//a[contains(@href, ".pdf")]').xpath('@href').extract() 46 | docs = [[types[i], urls[i]] for i in range(len(types))] 47 | loader.add_value('documents', [dict(zip(keys, doc)) for doc in docs]) 48 | 49 | yield loader.load_item() 50 | 51 | next_pages = response.css('.pt-cv-pagination a::attr(href)').extract() 52 | next_pages.reverse() 53 | for next_page in next_pages: 54 | next_page = response.urljoin(next_page) 55 | yield scrapy.Request(next_page, callback=self.parse) 56 | -------------------------------------------------------------------------------- /scrapy/.gitignore: -------------------------------------------------------------------------------- 1 | /.cache 2 | *.pyc 3 | -------------------------------------------------------------------------------- /scrapy/Readme.md: -------------------------------------------------------------------------------- 1 | # Scrapere scrise cu scrapy 2 | 3 | O colecție de scrapere implementate folosind [scrapy](https://scrapy.org). 4 | Fiecărei instituții îi corespunde un scraper care descarcă publicații de pe site. 5 | Mai departe, publicațiile sunt validate într-un pipeline comun, și trimise la 6 | [api](http://czl-api.code4.ro). 
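For reference, a publication corresponds to the fields declared in `czlscrape/items.py`; an illustrative payload (the values below are invented) looks roughly like this:

```python
# Illustrative only: field names come from the Publication item in czlscrape/items.py;
# the values are made up for the example.
publication = {
    "institution": "dialog",
    "identifier": "exemplu-123",
    "type": "HG",
    "date": "2017-03-20",
    "title": "Hotărâre privind ...",
    "description": "Scurtă descriere a proiectului de act normativ.",
    "documents": [{"type": "pdf", "url": "http://example.com/proiect.pdf"}],
    "contact": {"email": "contact@example.com"},
    "feedback_days": 10,
}
```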
7 | 8 | ## Spidere implementate 9 | * [`dialog`](czlscrape/spiders/dialog.py) - Ministerul Consultărilor Publice și 10 | Dialogului Social 11 | 12 | ## Instrucțiuni 13 | * Ai nevoie de python3, preferabil cu un 14 | [virtualenv](https://virtualenv.pypa.io). 15 | 16 | * Instalezi dependențele: 17 | ```sh 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | * Configurezi variabile de mediu: 22 | ```sh 23 | export API_TOKEN='the secret token' 24 | export SENTRY_DSN='the sentry dsn' # opțional 25 | ``` 26 | 27 | * Rulezi unul din spidere: 28 | ```sh 29 | scrapy crawl dialog 30 | ``` 31 | 32 | * După ce faci schimbări în cod, rulezi testele: 33 | ```sh 34 | pytest 35 | ``` 36 | -------------------------------------------------------------------------------- /scrapy/czlscrape/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import logging 4 | 5 | if 'SENTRY_DSN' in os.environ: 6 | import logging 7 | from raven.handlers.logging import SentryHandler 8 | from raven.conf import setup_logging 9 | setup_logging(SentryHandler(os.environ['SENTRY_DSN'], level=logging.WARN)) 10 | 11 | logging.Formatter.converter = time.gmtime 12 | -------------------------------------------------------------------------------- /scrapy/czlscrape/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Publication(scrapy.Item): 12 | institution = scrapy.Field() 13 | identifier = scrapy.Field() 14 | type = scrapy.Field() 15 | date = scrapy.Field() 16 | title = scrapy.Field() 17 | description = scrapy.Field() 18 | documents = scrapy.Field() 19 | contact = scrapy.Field() 20 | feedback_days = scrapy.Field() 21 | max_feedback_date = scrapy.Field() 22 | -------------------------------------------------------------------------------- /scrapy/czlscrape/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CzlScrapeSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy/czlscrape/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import re 10 | import logging 11 | from scrapy.exceptions import DropItem 12 | import requests 13 | 14 | API_URL = 'http://czl-api.code4.ro/api/publications/' 15 | API_TOKEN = os.environ.get('API_TOKEN') 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.WARN) 19 | 20 | 21 | class UploadPipeline(object): 22 | def process_item(self, item, spider): 23 | self.upload(item) 24 | return item 25 | 26 | def upload(self, item): 27 | if not API_TOKEN: 28 | print(item) 29 | return 30 | 31 | headers = {'Authorization': 'Token ' + API_TOKEN} 32 | resp = requests.post(API_URL, json=dict(item), headers=headers) 33 | if resp.status_code == 400: 34 | if re.search(r'Integrity Error: Key .* already exists', resp.text): 35 | return 36 | if resp.status_code != 201: 37 | msg = "Failed to upload publication: {!r}".format(resp) 38 | raise RuntimeError(msg) 39 | 40 | 41 | class PublicationValidatorPipeline(object): 42 | 43 | REQUIRED_FIELDS = [ 44 | 'identifier', 45 | 'title', 46 | 'institution', 47 | 'description', 48 | 'type', 49 | 'date', 50 | ] 51 | 52 | def process_item(self, item, spider): 53 | for field in self.REQUIRED_FIELDS: 54 | if not item.get(field): 55 | message = "Missing field {}".format(field) 56 | logger.warn(message) 57 | raise DropItem(message) 58 | return item 59 | -------------------------------------------------------------------------------- /scrapy/czlscrape/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for czlscrape project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'czlscrape' 13 | 14 | SPIDER_MODULES = ['czlscrape.spiders'] 15 | NEWSPIDER_MODULE = 'czlscrape.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'czlscrape (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'czlscrape.middlewares.CzlScrapeSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'czlscrape.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'czlscrape.pipelines.PublicationValidatorPipeline': 300, 69 | 'czlscrape.pipelines.UploadPipeline': 1000, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | LOG_LEVEL = 'DEBUG' 94 | LOG_FORMAT = '%(asctime)s srv="czl-scrape" [%(thread)d] %(levelname)s %(name)s 
%(funcName)s: %(message)s' 95 | LOG_DATEFORMAT = '%Y-%m-%dT%H:%M:%SZ' 96 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/afaceri.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | from ..items import Publication 4 | 5 | INDEX_URL = 'http://www.aippimm.ro/categorie/transparenta-decizionala---modificare-hg-96-2011/' 6 | 7 | def text_from(sel): 8 | return (sel.xpath('string(.)').extract_first() or "").strip() 9 | 10 | def guess_publication_type(text): 11 | text = text.lower() 12 | text = re.sub(r'[șş]', 's', text) 13 | text = re.sub(r'[țţ]', 't', text) 14 | text = re.sub(r'[ăâ]', 'a', text) 15 | text = re.sub(r'[î]', 'i', text) 16 | rules = [ 17 | ("lege", "LEGE"), 18 | ("hotarare de guvern", "HG"), 19 | ("hotarare a guvernului", "HG"), 20 | ("hg", "HG"), 21 | ("ordonanta de guvern", "OG"), 22 | ("oug", "OUG"), 23 | ("ordonanta de urgenta", "OUG"), 24 | ("ordin de ministru", "OM"), 25 | ("ordinul", "OM"), 26 | ] 27 | for substr, publication_type in rules: 28 | if substr in text: 29 | return publication_type 30 | else: 31 | return "OTHER" 32 | 33 | class AfaceriSpider(scrapy.Spider): 34 | 35 | name = 'afaceri' 36 | start_urls = [INDEX_URL] 37 | 38 | def parse(self, response): 39 | for article in response.css('.article_container'): 40 | link = article.css('a.lead_subcat') 41 | title = text_from(link) 42 | if not title: 43 | continue 44 | 45 | date_match = re.search( 46 | r'(?P<day>\d{2})\.(?P<month>\d{2})\.(?P<year>\d{4})$', 47 | text_from(article.css('ul.lead')), 48 | ) 49 | date = "{year}-{month}-{day}".format(**date_match.groupdict()) 50 | 51 | identifier = link.css('::attr(href)').extract_first().split('/')[-1] 52 | publication_type = guess_publication_type(title) 53 | 54 | documents = [ 55 | { 56 | 'type': href.split('.')[-1], 57 | 'url': href, 58 | } 59 | for href in article.css('a.files::attr(href)').extract() 60 | ] 61 | 62 | yield Publication( 63 | identifier=identifier, 64 | title=title, 65 | institution='afaceri', 66 | description=title, 67 | type=publication_type, 68 | date=date, 69 | documents=documents, 70 | ) 71 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/dialog.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | 4 | from czlscrape.utils import guess_initiative_type 5 | from ..items import Publication 6 | 7 | INDEX_URL = 'http://dialogsocial.gov.ro/categorie/proiecte-de-acte-normative/' 8 | 9 | DOC_EXTENSIONS = [ 10 | ".docs", ".doc", ".txt", ".crt", ".xls", 11 | ".xml", ".pdf", ".docx", ".xlsx", 12 | ] 13 | 14 | TYPE_RULES = [ 15 | ("lege", "LEGE"), 16 | ("hotarare de guvern", "HG"), 17 | ("hotarare a guvernului", "HG"), 18 | ("ordonanta de guvern", "OG"), 19 | ("ordonanta de urgenta", "OUG"), 20 | ("ordin de ministru", "OM"), 21 | ("ordinul", "OM"), 22 | ] 23 | 24 | 25 | def text_from(sel): 26 | return sel.xpath('string(.)').extract_first().strip() 27 | 28 | 29 | class DialogSpider(scrapy.Spider): 30 | 31 | name = 'dialog' 32 | start_urls = [INDEX_URL] 33 | 34
| def parse(self, response): 35 | for article in response.css('#content article.post'): 36 | href = article.css('.entry-title a::attr(href)').extract_first() 37 | yield scrapy.Request(response.urljoin(href), self.parse_article) 38 | 39 | def parse_article(self, response): 40 | title = text_from(response.css('h1')) 41 | publication_type = guess_initiative_type(title, TYPE_RULES) 42 | 43 | article = response.css('#content article.post')[0] 44 | 45 | id_value = article.css('::attr(id)').extract_first() 46 | identifier = re.match(r'post-(\d+)', id_value).group(1) 47 | 48 | date = ( 49 | article.css('time.entry-date::attr(datetime)') 50 | .extract_first()[:10] 51 | ) 52 | 53 | # remove
<div class="fb-comments"> and everything below 54 | to_remove = article.css('.fb-comments')[0].root 55 | while to_remove is not None: 56 | next_to_remove = to_remove.getnext() 57 | to_remove.getparent().remove(to_remove) 58 | to_remove = next_to_remove 59 | 60 | documents = [ 61 | { 62 | 'type': href.split('.')[-1], 63 | 'url': href, 64 | } 65 | for href in article.css('a::attr(href)').extract() 66 | if any(href.endswith(ext) for ext in DOC_EXTENSIONS) 67 | ] 68 | 69 | return Publication( 70 | identifier=identifier, 71 | title=title, 72 | institution='dialog', 73 | description=text_from(article), 74 | type=publication_type, 75 | date=date, 76 | documents=documents, 77 | ) 78 | 79 | 80 | def main(): 81 | from scrapy.crawler import CrawlerProcess 82 | process = CrawlerProcess() 83 | process.crawl(DialogSpider) 84 | process.start() 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/senat.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import datetime 4 | import re 5 | 6 | from scrapy import Spider, Request 7 | 8 | from ..items import Publication 9 | from ..utils import extract_documents 10 | 11 | INDEX_URL = 'https://www.senat.ro/LegiProiect.aspx' 12 | 13 | 14 | class SenatSpider(Spider): 15 | name = 'senat' 16 | start_urls = [INDEX_URL] 17 | 18 | def parse(self, response): 19 | for entry in response.css('#GridViewProiecte tr > td:nth-child(2) a'): 20 | href = entry.css('a::attr(href)').extract_first() 21 | yield Request(response.urljoin(href), self.parse_entry) 22 | 23 | def parse_entry(self, response): 24 | identifier = response.css( 25 | '#ctl00_B_Center_ctl06_viewFisa_lblNr::text').extract_first() 26 | description = response.css( 27 | '#ctl00_B_Center_ctl06_grdTitlu_ctl02_Label1::text').extract_first() 28 | title = description 29 | date_string = response.css( 30 | '#ctl00_B_Center_ctl06_grdDerulare_ctl02_Label1::text').extract_first() 31 | date_match = re.match( 32 | '^(?P<day>\d{1,2})\-(?P<month>\d{1,2})\-(?P<year>\d{4})$', 33 | date_string) 34 | if date_match: 35 | date = datetime.date( 36 | int(date_match.group('year')), 37 | int(date_match.group('month')), 38 | int(date_match.group('day')), 39 | ) 40 | else: 41 | date = datetime.date.today() 42 | 43 | documents = [ 44 | { 45 | 'type': re.sub('^[^a-zA-Z]+', '', doc['type'], 1), 46 | 'url': re.sub('\\\\', '/', response.urljoin(doc['url'])), 47 | } for doc in extract_documents(response.css( 48 | '#ctl00_B_Center_Accordion1 div.accrdContent a')) 49 | ] 50 | 51 | contact = { 52 | 'tel': '021 315 8942', 53 | 'email': 'infopub@senat.ro', 54 | } 55 | 56 | return Publication( 57 | identifier=identifier, 58 | title=title, 59 | institution='senat', 60 | description=description, 61 | type='LEGE', 62 | date=date.isoformat(), 63 | documents=documents, 64 | contact=contact 65 | ) 66 | -------------------------------------------------------------------------------- /scrapy/czlscrape/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrapy.selector import SelectorList 4 | 5 | DIACRITICS_RULES = [ 6 | (r'[șş]', 's'), 7 | (r'[ȘŞ]', 'S'), 8 | (r'[țţ]', 't'), 9 | (r'[ȚŢ]', 'T'), 10 | (r'[ăâ]', 'a'), 11 | (r'[ĂÂ]', 'A'), 12 | (r'[î]', 'i'), 13 | (r'[Î]', 'I'), 14 | ] 15 | 16 | ROMANIAN_MONTHS = { 17 | 'ianuarie': 1, 18 | 'februarie': 2, 19 | 'martie': 3, 20 | 'aprilie': 4, 21 | 'mai': 5, 22 | 'iunie': 6, 23 | 'iulie': 7, 24 | 'august': 8, 25 | 'septembrie': 9,
26 | 'octombrie': 10, 27 | 'noiembrie': 11, 28 | 'decembrie': 12, 29 | } 30 | 31 | DOC_EXTENSIONS = [".docs", ".doc", ".txt", ".crt", ".xls", ".xml", ".pdf", 32 | ".docx", ".xlsx", ] 33 | 34 | 35 | def guess_initiative_type(text: str, rules: list) -> str: 36 | """ 37 | Try to identify the type of a law initiative from its description. 38 | 39 | Use a best guess approach. The rules are provided by the caller as a list 40 | of tuples. Each tuple is composed of a search string and the initiative 41 | type it matches to. 42 | :param text: the description of the initiative 43 | :param rules: the rules of identification expressed as a list of tuples 44 | :return: the type of initiative if a rule matches; "OTHER" if no rule 45 | matches 46 | """ 47 | text = strip_diacritics(text) 48 | 49 | for search_string, initiative_type in rules: 50 | if search_string in text: 51 | return initiative_type 52 | else: 53 | return "OTHER" 54 | 55 | 56 | def strip_diacritics(text: str) -> str: 57 | """ 58 | Replace all diacritics in the given text with their regular counterparts. 59 | :param text: the text to look into 60 | :return: the text without diacritics 61 | """ 62 | result = text 63 | for search_pattern, replacement in DIACRITICS_RULES: 64 | result = re.sub(search_pattern, replacement, result) 65 | return result 66 | 67 | 68 | def romanian_month_number(text: str) -> int: 69 | """ 70 | Return the number of the given month identified by its Romanian name. 71 | :param text: the name of the month in Romanian 72 | :return: the number of the month if the month name is recognized, 73 | otherwise None 74 | """ 75 | return ROMANIAN_MONTHS.get(text.lower()) 76 | 77 | 78 | def extract_documents(selector_list: SelectorList): 79 | """ 80 | Extract white-listed documents from CSS selectors. 81 | 82 | Generator function. Search for links to white-listed document types and 83 | return all matching ones. Each entry has two properties. "type" contains 84 | the link text, "url" contains the link URL. 
85 | 86 | :param selector_list: a SelectorList 87 | :return: a generator 88 | """ 89 | for link_selector in selector_list: 90 | url = link_selector.css('::attr(href)').extract_first() 91 | if any(url.endswith(ext) for ext in DOC_EXTENSIONS): 92 | yield { 93 | 'type': link_selector.css('::text').extract_first(), 94 | 'url': url, 95 | } 96 | -------------------------------------------------------------------------------- /scrapy/requirements.in: -------------------------------------------------------------------------------- 1 | scrapy 2 | requests 3 | raven 4 | pytest 5 | -------------------------------------------------------------------------------- /scrapy/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | appdirs==1.4.3 # via setuptools 8 | asn1crypto==0.21.1 # via cryptography 9 | attrs==16.3.0 # via automat, service-identity 10 | automat==0.5.0 # via twisted 11 | cffi==1.9.1 # via cryptography 12 | constantly==15.1.0 # via twisted 13 | contextlib2==0.5.4 # via raven 14 | cryptography==1.8.1 # via pyopenssl 15 | cssselect==1.0.1 # via parsel, scrapy 16 | idna==2.5 # via cryptography 17 | incremental==16.10.1 # via twisted 18 | lxml==3.7.3 # via parsel, scrapy 19 | packaging==16.8 # via cryptography, setuptools 20 | parsel==1.1.0 # via scrapy 21 | py==1.4.33 # via pytest 22 | pyasn1-modules==0.0.8 # via service-identity 23 | pyasn1==0.2.3 # via pyasn1-modules, service-identity 24 | pycparser==2.17 # via cffi 25 | pydispatcher==2.0.5 # via scrapy 26 | pyopenssl==16.2.0 # via scrapy, service-identity 27 | pyparsing==2.2.0 # via packaging 28 | pytest==3.0.7 29 | queuelib==1.4.2 # via scrapy 30 | raven==6.0.0 31 | requests==2.13.0 32 | scrapy==1.3.3 33 | service-identity==16.0.0 # via scrapy 34 | six==1.10.0 # via automat, cryptography, packaging, parsel, pyopenssl, scrapy, setuptools, w3lib 35 | twisted==17.1.0 # via scrapy 36 | w3lib==1.17.0 # via parsel, scrapy 37 | zope.interface==4.3.3 # via twisted 38 | 39 | # The following packages are considered to be unsafe in a requirements file: 40 | # setuptools # via cryptography, pytest, zope.interface 41 | -------------------------------------------------------------------------------- /scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = czlscrape.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = czlscrape 12 | -------------------------------------------------------------------------------- /scrapy/testsuite/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 5 | -------------------------------------------------------------------------------- /scrapy/testsuite/test_validator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from scrapy.exceptions import DropItem 3 | from czlscrape.items import Publication 4 | from czlscrape.pipelines import PublicationValidatorPipeline 5 | 6 | def create_publication(): 7 | return Publication( 8 | identifier='aa', 9 | 
title="the good publication", 10 | institution='foo', 11 | description="this is a publication that has all required fields", 12 | type='HG', 13 | date='2017-04-03', 14 | documents=[ 15 | {'type': 'something', 'url': 'http://example.com/something.pdf'}, 16 | ], 17 | ) 18 | 19 | def test_ok(): 20 | pipeline = PublicationValidatorPipeline() 21 | pipeline.process_item(create_publication(), None) 22 | 23 | @pytest.mark.parametrize('field', [ 24 | 'identifier', 25 | 'title', 26 | 'institution', 27 | 'description', 28 | 'type', 29 | 'date', 30 | ]) 31 | def test_missing_field(field): 32 | publication = create_publication() 33 | del publication[field] 34 | pipeline = PublicationValidatorPipeline() 35 | with pytest.raises(DropItem) as err: 36 | pipeline.process_item(publication, None) 37 | -------------------------------------------------------------------------------- /sgg/README.md: -------------------------------------------------------------------------------- 1 | # Secretariatul General al Guvernului 2 | 3 | ## Tehnologie 4 | Python3, virtualenv, scrapy 5 | 6 | 7 | ## Instructiuni 8 | 9 | Install `Python3` and `virtualenv` 10 | 11 | virtualenv -p python3 venv 12 | source venv/bin/activate 13 | pip install -r requirements.txt 14 | cd sgg 15 | SGG_AUTH_TOKEN=sgg-very-secret-key python3 run.py 16 | 17 | ## Exceptii -------------------------------------------------------------------------------- /sgg/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | asn1crypto==0.21.1 3 | attrs==16.3.0 4 | Automat==0.5.0 5 | cffi==1.9.1 6 | constantly==15.1.0 7 | cryptography==1.8.1 8 | cssselect==1.0.1 9 | idna==2.5 10 | incremental==16.10.1 11 | lxml==3.7.3 12 | packaging==16.8 13 | parsel==1.1.0 14 | pyasn1==0.2.3 15 | pyasn1-modules==0.0.8 16 | pycparser==2.17 17 | PyDispatcher==2.0.5 18 | pyOpenSSL==16.2.0 19 | pyparsing==2.2.0 20 | queuelib==1.4.2 21 | requests==2.13.0 22 | Scrapy==1.3.3 23 | service-identity==16.0.0 24 | six==1.10.0 25 | Twisted==17.1.0 26 | w3lib==1.17.0 27 | zope.interface==4.3.3 28 | -------------------------------------------------------------------------------- /sgg/sgg/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import subprocess 5 | import json 6 | import requests 7 | 8 | POST_URL = "http://czl-api.code4.ro/api/publications/" 9 | # POST_URL_DEV = "http://10.231.234.10:8000/api/publications/" 10 | 11 | AUTH_TOKEN = os.getenv('SGG_AUTH_TOKEN', "sgg-very-secret-key") 12 | 13 | headers = { 14 | 'Authorization': " ".join(['Token',AUTH_TOKEN]) 15 | } 16 | 17 | if os.path.exists("sgg.json"): 18 | os.remove("sgg.json") 19 | 20 | subprocess.call(['scrapy','crawl', 'sgg_spider', '-o', 'sgg.json']) 21 | 22 | with open("sgg.json") as fp: 23 | items = json.load(fp) 24 | for item in items: 25 | r = requests.post(POST_URL, data=item, headers=headers) 26 | if r.status_code >= 400: 27 | print(json.dumps(r.json())) 28 | 29 | print("DONE!") -------------------------------------------------------------------------------- /sgg/sgg/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sgg.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sgg 12 | 
-------------------------------------------------------------------------------- /sgg/sgg/sgg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/sgg/sgg/sgg/__init__.py -------------------------------------------------------------------------------- /sgg/sgg/sgg/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SggItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SggSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SggPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sgg project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'sgg' 13 | 14 | SPIDER_MODULES = ['sgg.spiders'] 15 | NEWSPIDER_MODULE = 'sgg.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'sgg (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'sgg.middlewares.SggSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'sgg.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'sgg.pipelines.SggPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | 
#AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/spiders/sgg_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from scrapy.crawler import CrawlerProcess 5 | import logging 6 | import json 7 | import hashlib 8 | 9 | base_url = "http://www.sgg.ro" 10 | 11 | 12 | 13 | class Item(scrapy.Item): 14 | identifier = scrapy.Field() 15 | title = scrapy.Field() 16 | type = scrapy.Field() 17 | institution = scrapy.Field() 18 | date = scrapy.Field() 19 | description = scrapy.Field() 20 | feedback_days = scrapy.Field() 21 | contact = scrapy.Field() 22 | documents = scrapy.Field() 23 | 24 | def xtract(obj, sel): 25 | ret = obj.xpath(sel).extract_first() 26 | 27 | if ret: 28 | ret = " ".join(map(lambda s : s.strip(), ret.splitlines())) 29 | return ret 30 | return "" 31 | 32 | def identify(institution, titlu): 33 | 34 | return " : ".join([hashlib.md5(titlu.encode('utf-8')).hexdigest(), institution]) 35 | 36 | class SggSpider(scrapy.Spider): 37 | name = "sgg_spider" 38 | allowed_domains = ["www.sgg.ro"] 39 | start_urls = ['http://www.sgg.ro/legislativ/index.php/'] 40 | 41 | def parse(self, response): 42 | links = response.css('a::attr(href)').extract() 43 | links = list(set([response.urljoin(link) for link in links if "domeniu.php" in link])) 44 | # yield scrapy.Request(response.urljoin('/legislativ/domeniu.php?id=84'), callback=self.parse_details) 45 | 46 | for link in links: 47 | yield scrapy.Request(response.urljoin(link), callback=self.parse_details) 48 | 49 | 50 | def parse_details(self, response): 51 | # response = get(response.url) 52 | 53 | institution = response.xpath('//h2/text()').extract()[0].strip() 54 | logging.warn("scrapping: %s - %s"%(response.url, institution)) 55 | 56 | for tr in response.xpath('//table[@class="fancy"]/tr'): 57 | 58 | if tr.xpath('td[1]'): 59 | item = Item() 60 | titlu = xtract(tr, 'td[1]//div/text()') 61 | type_ = xtract(tr, 'td[2]//div//strong/text()') 62 | consult = xtract(tr, 'td[3]//div/text()') 63 | avizare = xtract(tr, 'td[4]//div/text()') 64 | avizori = xtract(tr, 'td[5]//div/text()') 65 | termen_avize = xtract(tr, 'td[6]//div/text()') 66 | mfp_mj = xtract(tr, 'td[7]//div/text()') 67 | reavizare = xtract(tr, 'td[8]//div/text()') 68 | init_1 = xtract(tr, 'td[9]//a/@href') 69 | init_2 
= xtract(tr, 'td[10]//a/@href') 70 | final_1 = xtract(tr, 'td[11]//a/@href') 71 | final_2 = xtract(tr, 'td[12]//a/@href') 72 | 73 | docs = [{"type": "nota", "url": response.urljoin(f)} for f in [init_1, init_2, final_1, final_2] if f] 74 | 75 | item['identifier'] = identify(institution, titlu) 76 | item['title'] = titlu 77 | item['type'] = type_ 78 | item['institution'] = "sgg" 79 | item['date'] = consult 80 | item['description'] = "" 81 | item['feedback_days'] = None 82 | item['contact'] = None 83 | item['documents'] = docs 84 | 85 | yield item 86 | 87 | if __name__ == '__main__': 88 | process = CrawlerProcess({ 89 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)', 90 | 'LOG_LEVEL' : 'WARNING' 91 | }) 92 | 93 | process.crawl(SggSpider) 94 | process.start() -------------------------------------------------------------------------------- /tineret/.gitignore: -------------------------------------------------------------------------------- 1 | .scrapy 2 | **/__pycache__ 3 | -------------------------------------------------------------------------------- /tineret/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Justiţiei 2 | 3 | ## Tehnologie 4 | 5 | - *Python 3.6*, [virtualenv](https://virtualenv.pypa.io/) is a good friend 6 | - [Scrapy](https://scrapy.org/) 7 | 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | ## Instructiuni 13 | 14 | ``` 15 | scrapy crawl tineret 16 | ``` 17 | 18 | ## Altele 19 | 20 | -------------------------------------------------------------------------------- /tineret/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | asn1crypto==0.21.1 3 | attrs==16.3.0 4 | Automat==0.5.0 5 | cffi==1.9.1 6 | constantly==15.1.0 7 | convertdate==2.1.0 8 | cryptography==1.8.1 9 | cssselect==1.0.1 10 | ephem==3.7.6.0 11 | idna==2.5 12 | incremental==16.10.1 13 | jdatetime==1.8.2 14 | lxml==3.7.3 15 | packaging==16.8 16 | parsel==1.1.0 17 | pyasn1==0.2.3 18 | pyasn1-modules==0.0.8 19 | pycparser==2.17 20 | PyDispatcher==2.0.5 21 | pyOpenSSL==16.2.0 22 | pyparsing==2.2.0 23 | pytz==2016.10 24 | queuelib==1.4.2 25 | regex==2017.2.8 26 | ruamel.yaml==0.13.14 27 | Scrapy==1.3.3 28 | service-identity==16.0.0 29 | six==1.10.0 30 | Twisted==17.1.0 31 | tzlocal==1.3 32 | umalqurra==0.2 33 | Unidecode==0.4.20 34 | w3lib==1.17.0 35 | zope.interface==4.3.3 36 | -------------------------------------------------------------------------------- /tineret/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tineret.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tineret 12 | -------------------------------------------------------------------------------- /tineret/tineret/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/tineret/tineret/__init__.py -------------------------------------------------------------------------------- /tineret/tineret/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See 
documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class Publication(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | identifier = scrapy.Field() 14 | title = scrapy.Field() 15 | type = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | documents = scrapy.Field() 22 | 23 | pass 24 | 25 | -------------------------------------------------------------------------------- /tineret/tineret/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import requests 9 | import json 10 | import logging 11 | 12 | from tineret.items import Publication 13 | import logging 14 | 15 | API_KEY = 'tineret-very-secret-key' 16 | API_PUBLICATIONS = 'http://czl-api.code4.ro/api/publications/' 17 | 18 | class PublicationsToApiPipeline(object): 19 | def process_item(self, item, spider): 20 | 21 | if type(item) != Publication: 22 | return item 23 | 24 | r = requests.post(API_PUBLICATIONS, json=dict(item), headers={'Authorization': 'Token %s' % (API_KEY,) } ) 25 | 26 | 27 | if r.status_code == 200 or r.status_code == '200': 28 | logging.log(msg=r.status_code, level=logging.INFO) 29 | else: 30 | logging.log(msg=r.status_code, level=logging.ERROR) 31 | logging.log(msg=r.content, level=logging.INFO) 32 | 33 | return item 34 | -------------------------------------------------------------------------------- /tineret/tineret/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tineret project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tineret' 13 | 14 | SPIDER_MODULES = ['tineret.spiders'] 15 | NEWSPIDER_MODULE = 'tineret.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | USER_AGENT = 'code4romania (+http://www.code4.ro)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | LOG_ENABLED = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 5 32 | # The download delay setting will honor only one of: 33 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'tineret.middlewares.JustSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'tineret.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.AutoThrottle': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'tineret.pipelines.PublicationsToApiPipeline': 100, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | # AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | # AUTOTHROTTLE_MAX_DELAY = 30 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0 82 | # Enable showing throttling stats for every response received: 83 | # AUTOTHROTTLE_DEBUG = True 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | HTTPCACHE_ENABLED = True 88 | HTTPCACHE_EXPIRATION_SECS = 30 89 | HTTPCACHE_DIR = 'httpcache' 90 | HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /tineret/tineret/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /transport/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Transporturilor 2 | 3 | ## Tehnologie 4 | 5 | Node.js, [nightmare](http://www.nightmarejs.org/) 6 | 7 | ## Instrucțiuni 8 | 9 | ``` 10 | npm install 11 | ``` 12 | 13 | edit config.js, change API token (can also be specified on the command line) and other config vars 14 | 15 | ``` 16 | [API_TOKEN=foobar] npm start 17 | ``` 18 | 19 | ## Excepții 20 | -------------------------------------------------------------------------------- /transport/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | api: { 3 | url: 'http://czl-api.code4.ro/api/publications/', 4 | token: 'educatie-very-secret-key' 5 | }, 6 | scrape: { 7 | //url of the proposals listing page 8 | baseUrl: 'http://mt.gov.ro/web14/transparenta-decizionala/consultare-publica/acte-normative-in-avizare', 9 | //number of listing pages to scrape 10 | pages: 2 11 | } 12 | }; 13 | -------------------------------------------------------------------------------- /transport/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mt-scraper", 3 | "version": "1.0.0", 4 | "description": "Data scraper pentru Ministerul Transporturilor", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/lbogdan/czl-scrape" 13 | }, 14 | "author": { 15 | "name": "Bogdan Luca", 16 | "email": "luca.bogdan@gmail.com" 17 | }, 18 | "license": "MIT", 19 | "dependencies": { 20 | "diacritics": "^1.3.0", 21 | "jsonfile": "^2.4.0", 22 | "moment": "^2.17.1", 23 | "nightmare": "^2.10.0", 24 | "request-promise": "^4.1.1" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /turism/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Turismului 2 | 3 | ## Tehnologie 4 | Java 5 | 6 | ## Instructiuni 7 | Rulati scraper.jar, va crea output-ul astfel: 8 | 9 | Anexele le salveaza in out_files/Anexe 10 | Proiectele le salveaza in out_files/Proiecte 11 | 12 | ## Exceptii 13 | Din cauza faptului ca orice link care nu ducea la un document PDF de pe site redirectiona pe pagina principala, scraper-ul downloadeaza toate documentele PDF si le organizeaza in Anexe si Proiecte. 
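The turism README above explains that, because every non-PDF link on the ministry site redirects to the homepage, the Java scraper (scraper.jar) simply downloads all PDFs and files them under out_files/Anexe and out_files/Proiecte. A rough Python sketch of that filename-based split follows, inferred from the output listing below; the actual logic lives in the Java scraper and may differ:

```
# illustrative sketch only; the real implementation is the Java scraper.jar
import os
import shutil

def sort_pdf(path, out_dir="out_files"):
    # assumption: files named "Anexa*" are annexes, everything else
    # (e.g. "Proiect-...", "Ordin-...") is treated as a draft act
    name = os.path.basename(path)
    subdir = "Anexe" if name.startswith("Anexa") else "Proiecte"
    target_dir = os.path.join(out_dir, subdir)
    os.makedirs(target_dir, exist_ok=True)
    shutil.move(path, os.path.join(target_dir, name))

# e.g. sort_pdf("Anexa1.pdf"); sort_pdf("Proiect-ordin-modificare-Ordin-65.pdf")
```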
-------------------------------------------------------------------------------- /turism/out/production/scraper/com/company/Main.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out/production/scraper/com/company/Main.class -------------------------------------------------------------------------------- /turism/out/production/scraper/com/company/Scraper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out/production/scraper/com/company/Scraper.class -------------------------------------------------------------------------------- /turism/out/scraper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out/scraper.jar -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.1.1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.2.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.3.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.4.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.5.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.5.1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.5.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.6.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.6.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.7.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.8.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa10.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa11.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa12.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa13.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa14.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa14.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa15.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa15.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa2.pdf 
-------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa3.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa4.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa5.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa6.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa7.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa8.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa9.2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa9.2.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa9.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/AnexaAP.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/AnexaAP.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexabrevet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexabrevet.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexacazare.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexacazare.pdf -------------------------------------------------------------------------------- /turism/out_files/Proiecte/Ordin-criterii-participare-targuri-externe.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Proiecte/Ordin-criterii-participare-targuri-externe.pdf -------------------------------------------------------------------------------- /turism/out_files/Proiecte/Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Proiecte/Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf -------------------------------------------------------------------------------- /turism/out_files/Proiecte/Proiect-ordin-modificare-Ordin-65.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Proiecte/Proiect-ordin-modificare-Ordin-65.pdf --------------------------------------------------------------------------------
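Taken together, the scrapers above (the czlscrape pipelines, sgg/run.py, the tineret pipeline) all post items of the same shape to http://czl-api.code4.ro/api/publications/ with token authentication. The sketch below shows that common payload, assembled from the fields required by PublicationValidatorPipeline and declared in the item classes; the concrete values are invented for illustration:

```
# example payload; field names come from the item definitions above,
# the values are made up
publication = {
    'identifier': 'example-123',            # unique id per institution
    'title': 'Proiect de Hotarare a Guvernului ...',
    'institution': 'sgg',                   # scraper/ministry slug
    'description': 'short summary of the draft act',
    'type': 'HG',                           # LEGE / HG / OG / OUG / OM / OTHER
    'date': '2017-04-03',                   # ISO date, as in the test fixture
    'documents': [
        {'type': 'pdf', 'url': 'http://example.com/doc.pdf'},
    ],
    'contact': {'email': 'contact@example.com'},  # optional, see senat.py
    'feedback_days': None,                        # optional, see the sgg items
}
# uploaded as the pipelines above do:
# requests.post('http://czl-api.code4.ro/api/publications/', json=publication,
#               headers={'Authorization': 'Token <api token>'})
```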