├── .gitattributes ├── .gitignore ├── README.md ├── _commons-java ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── ro │ └── code4 │ └── czl │ └── scrape │ ├── client │ ├── ApiClient.java │ ├── ApiInvoker.java │ ├── AuthenticationStrategy.java │ ├── BaseRequest.java │ ├── BaseRequestBuilder.java │ ├── Credential.java │ ├── CzlApiUploadPipeline.java │ ├── CzlApiV1.java │ ├── CzlClient.java │ ├── CzlClientConfig.java │ ├── Request.java │ ├── RequestBuilder.java │ ├── Response.java │ ├── authentication │ │ └── TokenAuthenticationStrategy.java │ ├── core │ │ ├── CloseIdleConnectionsTask.java │ │ ├── IdleConnectionMonitor.java │ │ ├── JaxRsJacksonConfigurator.java │ │ ├── JaxRsResponse.java │ │ ├── JaxRsResponseDeserializationStrategy.java │ │ ├── JerseyClientApiInvoker.java │ │ └── LoggingFilter.java │ ├── model │ │ └── CreatePublicationRequest.java │ ├── representation │ │ ├── ContactRepresentation.java │ │ ├── DocumentRepresentation.java │ │ └── PublicationRepresentation.java │ └── samples │ │ └── CzlClientSample.java │ └── text │ ├── ProposalType.java │ └── RomanianMonth.java ├── _config.yml ├── afaceri ├── README.md ├── package.json └── server │ ├── boot │ ├── authentication.js │ └── root.js │ ├── component-config.json │ ├── config.json │ ├── config │ └── keywords.js │ ├── controllers │ └── contentParser.js │ ├── datasources.json │ ├── middleware.development.json │ ├── middleware.json │ ├── model-config.json │ └── server.js ├── agricultura ├── .gitignore ├── README.md ├── index.js └── package.json ├── aparare ├── README.md └── mapn_plugin.php ├── apepaduri └── README.md ├── cdep ├── README.md ├── requirements.in ├── requirements.txt └── scraper.py ├── cercetare ├── .editorconfig ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.js └── secrets.json.txt ├── dezvoltare ├── .gitignore ├── README.md ├── crawl_dezvoltare │ ├── crawl_dezvoltare │ │ ├── __init__.py │ │ ├── exporters.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── mdrap.py │ │ │ └── testing.py │ └── scrapy.cfg └── requirements.txt ├── economie ├── .editorconfig ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.js ├── secrets.json.txt └── yarn.lock ├── educatie ├── README.md ├── config.js ├── index.js └── package.json ├── energie ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── java │ └── Main.java │ └── resources │ └── logback.xml ├── externe ├── README.md ├── __init__.py ├── eusebiu.py ├── scraper │ ├── __init__.py │ ├── article.py │ ├── article_serializer.py │ └── extractor.py ├── setup.py └── utils │ ├── __init__.py │ ├── api_client.py │ ├── lang.py │ └── settings.py ├── finantepub ├── .gitignore ├── README.md ├── index.js └── package.json ├── interne ├── .editorconfig ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.js ├── secrets.json.txt └── yarn.lock ├── justitie ├── .gitignore ├── README.md ├── doc │ └── scraping.md ├── just │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── publications.py ├── requirements.txt └── scrapy.cfg ├── license ├── mediu ├── .gitignore ├── README.md ├── crawl_mediu │ ├── crawl_mediu │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── mmediu.py │ └── scrapy.cfg └── requirements.txt ├── pom.xml ├── presedinte └── README.md ├── 
pretutindeni ├── .gitignore ├── README.md ├── app.js ├── package.json ├── parseProject.example └── yarn.lock ├── relparlament ├── README.md ├── index.js └── package.json ├── sanatate ├── .gitignore ├── README.md ├── credentials.json ├── requirements.txt ├── scrapy.cfg └── scrapy_proj │ ├── __init__.py │ ├── helpers │ ├── __init__.py │ ├── legal.py │ ├── romanian.py │ └── text.py │ ├── items │ ├── __init__.py │ ├── act.py │ └── contact.py │ ├── loaders │ ├── __init__.py │ ├── act.py │ └── contact.py │ ├── pipelines │ ├── __init__.py │ ├── extrameta.py │ └── post.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── sanatate.py ├── scrapy ├── .gitignore ├── Readme.md ├── czlscrape │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── afaceri.py │ │ ├── comunicatii.py │ │ ├── cultura.py │ │ ├── dialog.py │ │ ├── munca.py │ │ └── senat.py │ └── utils.py ├── requirements.in ├── requirements.txt ├── scrapy.cfg └── testsuite │ ├── conftest.py │ └── test_validator.py ├── sgg ├── README.md ├── requirements.txt └── sgg │ ├── run.py │ ├── scrapy.cfg │ └── sgg │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── sgg_spider.py ├── tineret ├── .gitignore ├── README.md ├── requirements.txt ├── scrapy.cfg └── tineret │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── tineret.py ├── transport ├── README.md ├── config.js ├── index.js └── package.json └── turism ├── README.md ├── out ├── production │ └── scraper │ │ └── com │ │ └── company │ │ ├── Main.class │ │ └── Scraper.class └── scraper.jar ├── out_files ├── Anexe │ ├── Anexa1.1.1.pdf │ ├── Anexa1.1.pdf │ ├── Anexa1.2.pdf │ ├── Anexa1.3.pdf │ ├── Anexa1.4.pdf │ ├── Anexa1.5.1.pdf │ ├── Anexa1.5.pdf │ ├── Anexa1.6.pdf │ ├── Anexa1.7.pdf │ ├── Anexa1.8.pdf │ ├── Anexa1.pdf │ ├── Anexa10.pdf │ ├── Anexa11.pdf │ ├── Anexa12.pdf │ ├── Anexa13.pdf │ ├── Anexa14.pdf │ ├── Anexa15.pdf │ ├── Anexa2.pdf │ ├── Anexa3.pdf │ ├── Anexa4.pdf │ ├── Anexa5.pdf │ ├── Anexa6.pdf │ ├── Anexa7.pdf │ ├── Anexa8.pdf │ ├── Anexa9.2.pdf │ ├── Anexa9.pdf │ ├── AnexaAP.pdf │ ├── Anexabrevet.pdf │ └── Anexacazare.pdf └── Proiecte │ ├── Ordin-criterii-participare-targuri-externe.pdf │ ├── Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf │ └── Proiect-ordin-modificare-Ordin-65.pdf └── src └── com └── company └── Main.java /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS files 2 | .DS_Store 3 | 4 | # Java files 5 | *.class 6 | 7 | # Log files 8 | *.log 9 | logs 10 | 11 | # Maven 12 | target 13 | pom.xml.versionsBackup 14 | 15 | # Mobile Tools for Java (J2ME) 16 | .mtj.tmp/ 17 | 18 | # Package Files 19 | *.jar 20 | *.war 21 | *.ear 22 | 23 | # IntelliJ IDEA 24 | *.iml 25 | .idea 26 | 27 | # Eclipse 28 | .project 29 | .settings 30 | .classpath 31 | test-output 32 | 33 | # Vim 34 | *.swp 35 | 36 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 37 | hs_err_pid* 38 | 39 | # Misc 40 | *git.properties 41 | 42 | # Python 43 | *.pyc 44 | 45 | # pyenv 46 | .python-version 47 | 48 | # Node 49 | node_modules/ 50 | -------------------------------------------------------------------------------- /_commons-java/.gitignore: -------------------------------------------------------------------------------- 1 | # OS files 2 | .DS_Store 3 | 4 | # Java files 5 | *.class 6 | 7 | # Log files 8 | *.log 9 | logs 10 | 11 | # Maven 12 | target 13 | pom.xml.versionsBackup 14 | 15 | # Dropwizard 16 | dependency-reduced-pom.xml 17 | 18 | # Mobile Tools for Java (J2ME) 19 | .mtj.tmp/ 20 | 21 | # Package Files 22 | *.jar 23 | *.war 24 | *.ear 25 | 26 | # IntelliJ IDEA 27 | *.iml 28 | .idea 29 | 30 | # Eclipse 31 | .project 32 | .settings 33 | .classpath 34 | test-output 35 | 36 | # Vim 37 | *.swp 38 | 39 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 40 | hs_err_pid* 41 | 42 | # Misc 43 | *git.properties 44 | 45 | # Asciidoc 46 | .asciidoctor 47 | diag-*.png 48 | -------------------------------------------------------------------------------- /_commons-java/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | ro.code4.czl 7 | czl-scrape 8 | 0.0.1-SNAPSHOT 9 | ../ 10 | 11 | 12 | czl-scrape-commons 13 | jar 14 | 15 | Ce Zice Legea :: Scraper :: Common Libraries 16 | 17 | 18 | 19 | 20 | us.codecraft 21 | webmagic-core 22 | 23 | 24 | org.slf4j 25 | slf4j-log4j12 26 | 27 | 28 | 29 | 30 | 31 | 32 | org.slf4j 33 | slf4j-api 34 | 35 | 36 | 37 | 38 | org.glassfish.jersey.core 39 | jersey-client 40 | 41 | 42 | org.glassfish.jersey.connectors 43 | jersey-apache-connector 44 | 45 | 46 | org.glassfish.jersey.media 47 | jersey-media-json-jackson 48 | 49 | 50 | 51 | org.apache.commons 52 | commons-lang3 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/ApiClient.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 
8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import ro.code4.czl.scrape.client.core.JerseyClientApiInvoker; 23 | 24 | /** 25 | * {@link ApiClient} instances are heavyweight objects that should be created sparingly. A {@link ApiClient} object is 26 | * thread-safe and should be reused when targeting the same service endpoint. 27 | * 28 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 29 | */ 30 | public abstract class ApiClient implements AutoCloseable { 31 | 32 | protected final ApiInvoker apiInvoker; 33 | 34 | /** 35 | * Creates a new client instance using all the settings specified by the given configuration object. 36 | * 37 | * @param config a client configuration object 38 | */ 39 | protected ApiClient(CzlClientConfig config) { 40 | this(config, new JerseyClientApiInvoker(config)); 41 | } 42 | 43 | /** 44 | * Creates a new client instance using all the settings specified by the given configuration object and a custom {@link ApiInvoker} instance. 45 | * 46 | * @param config a client configuration object 47 | * @param apiInvoker a custom API invoker object 48 | */ 49 | private ApiClient(CzlClientConfig config, ApiInvoker apiInvoker) { 50 | this.apiInvoker = apiInvoker; 51 | } 52 | 53 | /** 54 | * Retrieves the API invoker object used by this client. 55 | * 56 | * @return a {@link ApiInvoker} instance 57 | */ 58 | public ApiInvoker getApiInvoker() { 59 | return apiInvoker; 60 | } 61 | 62 | @Override 63 | public void close() throws Exception { 64 | this.shutdown(); 65 | } 66 | 67 | /** 68 | * Shuts down the connection manager used by this client and releases allocated resources. This includes closing all connections, whether they are 69 | * currently used or not. 70 | */ 71 | private void shutdown() { 72 | apiInvoker.shutdown(); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/ApiInvoker.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 
18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Basic API invoker contract. 24 | * 25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 26 | */ 27 | public interface ApiInvoker extends AutoCloseable { 28 | 29 | /** 30 | * Configures a request header that should be added to every request made via this API invoker. 31 | * 32 | * @param key request header name 33 | * @param value request header value 34 | */ 35 | void addDefaultHeader(String key, String value); 36 | 37 | /** 38 | * Executes a request. 39 | * 40 | * @param request the request to execute 41 | * @param the type that the response should be deserialized into 42 | * @return a {@link Response} instance containing the response body deserialized into the desired type 43 | */ 44 | Response invokeAPI(Request request); 45 | 46 | /** 47 | * Shuts down the connection manager used by this API invoker and releases allocated resources. This includes closing all connections, whether they 48 | * are currently used or not. 49 | */ 50 | void shutdown(); 51 | } 52 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/AuthenticationStrategy.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Contract for an authentication strategy. 24 | * 25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 26 | */ 27 | public interface AuthenticationStrategy { 28 | 29 | /** 30 | * Processes the request with the goal of applying the authentication strategy. This is called before the request is executed. 31 | * 32 | * @param request the request. 33 | * @param the expected type of the response body 34 | */ 35 | void process(Request request); 36 | 37 | } 38 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/BaseRequestBuilder.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. 
The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | 25 | /** 26 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 27 | */ 28 | public abstract class BaseRequestBuilder, U> implements RequestBuilder { 29 | 30 | String ifNoneMatch; 31 | String ifMatch; 32 | Map customHeaders = new HashMap<>(); 33 | boolean head; 34 | Boolean followRedirects; 35 | Credential credential; 36 | 37 | @Override 38 | public RequestBuilder ifNoneMatch(String ifNoneMatch) { 39 | this.ifNoneMatch = ifNoneMatch; 40 | return this; 41 | } 42 | 43 | @Override 44 | public RequestBuilder ifMatch(String ifMatch) { 45 | this.ifMatch = ifMatch; 46 | return this; 47 | } 48 | 49 | @Override 50 | public RequestBuilder headersOnly() { 51 | this.head = true; 52 | return this; 53 | } 54 | 55 | @Override 56 | public RequestBuilder followRedirects(boolean followRedirects) { 57 | this.followRedirects = followRedirects; 58 | return this; 59 | } 60 | 61 | @Override 62 | public RequestBuilder credential(Credential credential) { 63 | this.credential = credential; 64 | return this; 65 | } 66 | 67 | @Override 68 | public RequestBuilder header(String headerName, String headerValue) { 69 | this.customHeaders.put(headerName, headerValue); 70 | return this; 71 | } 72 | 73 | @Override 74 | public Response execute() { 75 | return build().execute(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/Credential.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Marker interface for credential used during authentication. Used by {@linkplain AuthenticationStrategy} implementations. 
24 | * 25 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 26 | */ 27 | public interface Credential { 28 | 29 | } 30 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlApiUploadPipeline.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client; 2 | 3 | import static ro.code4.czl.scrape.client.representation.PublicationRepresentation.PublicationRepresentationBuilder.aPublicationRepresentation; 4 | 5 | import ro.code4.czl.scrape.client.representation.DocumentRepresentation; 6 | import us.codecraft.webmagic.ResultItems; 7 | import us.codecraft.webmagic.Task; 8 | import us.codecraft.webmagic.pipeline.Pipeline; 9 | 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | /** 14 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 15 | */ 16 | public class CzlApiUploadPipeline implements Pipeline { 17 | 18 | private final CzlClient czlClient; 19 | 20 | public CzlApiUploadPipeline(CzlClient czlClient) { 21 | this.czlClient = czlClient; 22 | } 23 | 24 | @SuppressWarnings("unchecked") 25 | @Override 26 | public void process(ResultItems resultItems, Task task) { 27 | Map extractedFields = resultItems.getAll(); 28 | 29 | czlClient.apiV1() 30 | .createPublication(aPublicationRepresentation() 31 | .withDate((String) extractedFields.get("date")) 32 | .withInstitution((String) extractedFields.get("institution")) 33 | .withIdentifier((String) extractedFields.get("identifier")) 34 | .withDescription((String) extractedFields.get("description")) 35 | .withDocuments((List) extractedFields.get("documents")) 36 | .withTitle((String) extractedFields.get("title")) 37 | .withType((String) extractedFields.get("type")) 38 | //.withFeedback_days((int) extractedFields.get("feedbackDays")) 39 | .withContact((Map) extractedFields.get("contact")) 40 | .build()) 41 | .execute(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlApiV1.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import ro.code4.czl.scrape.client.model.CreatePublicationRequest; 23 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation; 24 | 25 | /** 26 | * A class describing the API for Ce Zice Legea. Uses a fluent builder interface to create requests. 
27 | * 28 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 29 | */ 30 | public class CzlApiV1 { 31 | 32 | private final ApiInvoker apiInvoker; 33 | 34 | /** 35 | * Creates a new request builder. 36 | * 37 | * @param apiInvoker the {@linkplain ApiInvoker} implementation to use for every request built via this class. 38 | * @see ApiInvoker 39 | */ 40 | CzlApiV1(ApiInvoker apiInvoker) { 41 | this.apiInvoker = apiInvoker; 42 | } 43 | 44 | /** 45 | * Starts preparing a new request for creating a publication. 46 | * 47 | * @param publicationRepresentation the representation of the publication to create. 48 | * @return a request builder. 49 | */ 50 | public CreatePublicationRequest.Builder createPublication(PublicationRepresentation publicationRepresentation) { 51 | return CreatePublicationRequest.builder(publicationRepresentation, apiInvoker); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/CzlClient.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * An REST client object. {@link CzlClient} instances are heavyweight objects that should be created sparingly. A {@link CzlClient} object is 24 | * thread-safe and should be reused when targeting the same service endpoint. 25 | * 26 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 27 | */ 28 | public class CzlClient extends ApiClient { 29 | 30 | /** 31 | * Build a new client instance using all the settings specified by the given configuration object. {@link CzlClient} instances are heavyweight objects 32 | * that should be created sparingly. A {@link CzlClient} object is thread-safe and should be reused when targeting the same service endpoint. 33 | * 34 | * @param czlClientConfig a client configuration object 35 | * @return a new SDK client instance 36 | */ 37 | public static CzlClient newClient(CzlClientConfig czlClientConfig) { 38 | return new CzlClient(czlClientConfig); 39 | } 40 | 41 | private CzlClient(CzlClientConfig czlClientConfig) { 42 | super(czlClientConfig); 43 | } 44 | 45 | 46 | /** 47 | * Access the API. 48 | * 49 | * @return an object describing the API. 
50 | */ 51 | public CzlApiV1 apiV1() { 52 | return new CzlApiV1(apiInvoker); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/Request.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import java.util.Map; 23 | 24 | /** 25 | * Contract for a request made by the client. 26 | * 27 | * @param the expected type of the response body 28 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 29 | */ 30 | public interface Request { 31 | 32 | /** 33 | * Executes the request and returns the response. 34 | * 35 | * @return the result of the execution. If the response contains a body, it will be automatically deserialized and ready for use. 36 | */ 37 | Response execute(); 38 | 39 | /** 40 | * Returns the type of response body, if any; null otherwise. 41 | * 42 | * @return the response type 43 | */ 44 | Class getResponseType(); 45 | 46 | /** 47 | * Returns the absolute path of the target of this request. 48 | * 49 | * @return the absolute path. 50 | */ 51 | String getPath(); 52 | 53 | /** 54 | * Returns the HTTP method used by this request. 55 | * 56 | * @return the method. 57 | */ 58 | String getMethod(); 59 | 60 | /** 61 | * Returns the path parameters used by this request. 62 | * 63 | * @return the path parameters. 64 | */ 65 | Map getPathParams(); 66 | 67 | /** 68 | * Returns the query parameters used by this request. 69 | * 70 | * @return the query parameters. 71 | */ 72 | Map getQueryParams(); 73 | 74 | /** 75 | * Returns the matrix parameters used by this request. 76 | * 77 | * @return the matrix parameters. 78 | */ 79 | Map getMatrixParams(); 80 | 81 | /** 82 | * Returns the header parameters used by this request. 83 | * 84 | * @return the header parameters. 85 | */ 86 | Map getHeaderParams(); 87 | 88 | /** 89 | * Returns the body used by this request, if any. 90 | * 91 | * @return the body if one has been specified, null otherwise. 92 | */ 93 | Object getBody(); 94 | 95 | /** 96 | * Returns the value of the Accept used by this request. 97 | * 98 | * @return the value of the Accept header. 99 | */ 100 | String getAcceptHeader(); 101 | 102 | /** 103 | * Indicates whether this request is supposed to follow redirects or not. 104 | * 105 | * @return true if the request is supposed to follow redirects, false otherwise. 106 | */ 107 | Boolean isFollowRedirectsEnabled(); 108 | 109 | /** 110 | * Returns the value of the credential used by this request. 111 | * 112 | * @return the credential, if any. 
113 | */ 114 | Credential getCredential(); 115 | } 116 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/RequestBuilder.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | /** 23 | * Contract for builders of {@linkplain Request} instances. 24 | * 25 | * @param the request type 26 | * @param the expected type of the response body 27 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 28 | */ 29 | public interface RequestBuilder, U> { 30 | 31 | /** 32 | * Sets the If-None-Match header to the given value. Useful when making conditional requests. 33 | * 34 | * @param ifNoneMatch the value of the header. 35 | * @return the request builder. 36 | */ 37 | RequestBuilder ifNoneMatch(String ifNoneMatch); 38 | 39 | /** 40 | * Sets the If-Match header to the given value. Useful when making conditional requests. 41 | * 42 | * @param ifMatch the value of the header. 43 | * @return the request builder. 44 | */ 45 | RequestBuilder ifMatch(String ifMatch); 46 | 47 | /** 48 | * Make the request to only ask for headers. Only applies when the original request is using GET. 49 | * 50 | * @return the request builder. 51 | */ 52 | RequestBuilder headersOnly(); 53 | 54 | /** 55 | * Enables or disables following redirects. 56 | * 57 | * @param followRedirects set to true to enable following redirects, otherwise to false. 58 | * @return the request builder. 59 | */ 60 | RequestBuilder followRedirects(boolean followRedirects); 61 | 62 | /** 63 | * Use the given credential for this request. 64 | * 65 | * @param credential the credential to use for this request. 66 | * @return the request builder. 67 | */ 68 | RequestBuilder credential(Credential credential); 69 | 70 | /** 71 | * Adds a custom header to this request. 72 | * 73 | * @param headerName the header name for this request. 74 | * @param headerValue the header value for this request. 75 | * @return the request builder. 76 | */ 77 | RequestBuilder header(String headerName, String headerValue); 78 | 79 | /** 80 | * Build the request. Does not execute it. 81 | * 82 | * @return the request. 83 | */ 84 | T build(); 85 | 86 | /** 87 | * Builds and executes the request. 88 | * 89 | * @return the result of the execution of the request. 
90 | */ 91 | Response execute(); 92 | 93 | } 94 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/Response.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client; 21 | 22 | import java.util.Date; 23 | import java.util.Map; 24 | 25 | /** 26 | * Contract for a response to a request made by the client. 27 | * 28 | * @param the expected type of the response body 29 | * @author Ionut -Maxim Margelatu (ionut.margelatu@gmail.com) 30 | */ 31 | public interface Response { 32 | 33 | /** 34 | * Returns the status code of the response. 35 | * 36 | * @return the status code. 37 | */ 38 | int getStatusCode(); 39 | 40 | /** 41 | * Returns the entity in the response. 42 | * 43 | * @return the entity. 44 | */ 45 | T getEntity(); 46 | 47 | /** 48 | * Returns the content type of the response. 49 | * 50 | * @return the content type. 51 | */ 52 | String getContentType(); 53 | 54 | /** 55 | * Returns the content length of the response. 56 | * 57 | * @return the content length. 58 | */ 59 | long getContentLength(); 60 | 61 | /** 62 | * Returns the ETag header value, if any. 63 | * 64 | * @return the ETag header value. 65 | */ 66 | String getETag(); 67 | 68 | /** 69 | * Returns the date of the response. 70 | * 71 | * @return the date. 72 | */ 73 | Date getDate(); 74 | 75 | /** 76 | * Returns the value of a given response header. 77 | * 78 | * @param headerName the header name. 79 | * @return the header value. 80 | */ 81 | String getHeaderString(String headerName); 82 | 83 | /** 84 | * Returns all the response headers. 85 | * 86 | * @return the response headers. 
87 | */ 88 | Map getHeaders(); 89 | } 90 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/authentication/TokenAuthenticationStrategy.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.authentication; 2 | 3 | import ro.code4.czl.scrape.client.AuthenticationStrategy; 4 | import ro.code4.czl.scrape.client.Request; 5 | 6 | /** 7 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 8 | */ 9 | public class TokenAuthenticationStrategy implements AuthenticationStrategy { 10 | 11 | private final String tokenValue = System.getProperty("czl.scrape.token"); 12 | 13 | @Override 14 | public void process(Request request) { 15 | request.getHeaderParams().put("Authorization", "Token " + tokenValue); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/CloseIdleConnectionsTask.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import org.apache.http.conn.HttpClientConnectionManager; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | import java.util.concurrent.TimeUnit; 27 | 28 | /** 29 | * Closes idle or expired connections created by a specific connection manager. 30 | * 31 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 32 | */ 33 | class CloseIdleConnectionsTask implements Runnable { 34 | 35 | private static final Logger logger = LoggerFactory.getLogger(CloseIdleConnectionsTask.class); 36 | 37 | private final HttpClientConnectionManager connectionManager; 38 | private final int idleTime; 39 | 40 | /** 41 | * Creates a new task. 
42 | * 43 | * @param connectionManager the connection manager that will be periodically checked 44 | * @param idleTime the inactivity time in milliseconds after which connections are considered to be idle 45 | */ 46 | CloseIdleConnectionsTask(HttpClientConnectionManager connectionManager, int idleTime) { 47 | this.connectionManager = connectionManager; 48 | this.idleTime = idleTime; 49 | } 50 | 51 | @Override 52 | public void run() { 53 | try { 54 | connectionManager.closeExpiredConnections(); 55 | connectionManager.closeIdleConnections(idleTime, TimeUnit.MILLISECONDS); 56 | } catch (Exception t) { 57 | logger.warn("Unable to close idle connections", t); 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsJacksonConfigurator.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import com.fasterxml.jackson.databind.DeserializationFeature; 23 | import com.fasterxml.jackson.databind.ObjectMapper; 24 | 25 | import javax.ws.rs.ext.ContextResolver; 26 | import javax.ws.rs.ext.Provider; 27 | 28 | /** 29 | * Provides custom configuration for Jackson. 30 | * 31 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 32 | */ 33 | @Provider 34 | public class JaxRsJacksonConfigurator implements ContextResolver { 35 | 36 | private final ObjectMapper mapper; 37 | 38 | public JaxRsJacksonConfigurator() { 39 | mapper = new ObjectMapper(); 40 | mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); 41 | } 42 | 43 | @Override 44 | public ObjectMapper getContext(Class type) { 45 | return mapper; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsResponse.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 
15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import jersey.repackaged.com.google.common.collect.Maps; 23 | import ro.code4.czl.scrape.client.Response; 24 | 25 | import java.util.Collections; 26 | import java.util.Date; 27 | import java.util.List; 28 | import java.util.Map; 29 | 30 | /** 31 | * Wrapper over {@linkplain javax.ws.rs.core.Response} that provides a safe body deserialization mechanism along with some syntactic sugar. 32 | * 33 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 34 | */ 35 | class JaxRsResponse implements Response { 36 | 37 | private final javax.ws.rs.core.Response originalResponse; 38 | private final Map simplifiedHeaders; 39 | private final T entity; 40 | 41 | JaxRsResponse(javax.ws.rs.core.Response originalResponse, Class expectedType) { 42 | this.originalResponse = originalResponse; 43 | this.entity = new JaxRsResponseDeserializationStrategy().read(originalResponse, expectedType); 44 | this.simplifiedHeaders = Collections.unmodifiableMap( 45 | Maps.transformEntries(originalResponse.getStringHeaders(), new StringListToStringEntryTransformer())); 46 | } 47 | 48 | @Override 49 | public int getStatusCode() { 50 | return originalResponse.getStatus(); 51 | } 52 | 53 | @Override 54 | public T getEntity() { 55 | return entity; 56 | } 57 | 58 | @Override 59 | public String getContentType() { 60 | return originalResponse.getMediaType().toString(); 61 | } 62 | 63 | @Override 64 | public long getContentLength() { 65 | return originalResponse.getLength(); 66 | } 67 | 68 | @Override 69 | public String getETag() { 70 | return originalResponse.getEntityTag().getValue(); 71 | } 72 | 73 | @Override 74 | public Date getDate() { 75 | return originalResponse.getDate(); 76 | } 77 | 78 | @Override 79 | public String getHeaderString(String headerName) { 80 | return originalResponse.getHeaderString(headerName); 81 | } 82 | 83 | @Override 84 | public Map getHeaders() { 85 | return simplifiedHeaders; 86 | } 87 | 88 | private static class StringListToStringEntryTransformer implements Maps.EntryTransformer, String> { 89 | 90 | @Override 91 | public String transformEntry(String s, List strings) { 92 | if (strings == null || strings.isEmpty()) { 93 | return null; 94 | } 95 | return strings.get(0); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/core/JaxRsResponseDeserializationStrategy.java: -------------------------------------------------------------------------------- 1 | /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 | * 3 | * ADOBE CONFIDENTIAL 4 | * ___________________ 5 | * 6 | * Copyright 2016 Adobe Systems Incorporated 7 | * All Rights Reserved. 8 | * 9 | * NOTICE: All information contained herein is, and remains 10 | * the property of Adobe Systems Incorporated and its suppliers, 11 | * if any. The intellectual and technical concepts contained 12 | * herein are proprietary to Adobe Systems Incorporated and its 13 | * suppliers and are protected by all applicable intellectual property 14 | * laws, including trade secret and copyright laws. 
15 | * Dissemination of this information or reproduction of this material 16 | * is strictly forbidden unless prior written permission is obtained 17 | * from Adobe Systems Incorporated. 18 | * 19 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **/ 20 | package ro.code4.czl.scrape.client.core; 21 | 22 | import java.io.InputStream; 23 | 24 | import javax.ws.rs.core.Response; 25 | 26 | /** 27 | * Deserialization strategy that ensures the response body is safely deserialized and that the input stream is properly closed. 28 | * 29 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 30 | */ 31 | class JaxRsResponseDeserializationStrategy { 32 | 33 | @SuppressWarnings("unchecked") 34 | T read(Response response, Class expectedType) { 35 | if (!response.hasEntity()) { 36 | response.close(); 37 | return null; 38 | } 39 | 40 | if (InputStream.class.isAssignableFrom(expectedType)) { 41 | return (T) response.getEntity(); 42 | } else { 43 | if (response.getStatusInfo().getFamily() == Response.Status.Family.SUCCESSFUL) { 44 | try { 45 | return response.readEntity(expectedType); 46 | } finally { 47 | response.close(); 48 | } 49 | } 50 | } 51 | 52 | response.close(); 53 | return null; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/model/CreatePublicationRequest.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.model; 2 | 3 | import ro.code4.czl.scrape.client.ApiInvoker; 4 | import ro.code4.czl.scrape.client.BaseRequest; 5 | import ro.code4.czl.scrape.client.BaseRequestBuilder; 6 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation; 7 | 8 | import javax.ws.rs.HttpMethod; 9 | import javax.ws.rs.core.MediaType; 10 | 11 | /** 12 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 13 | */ 14 | public class CreatePublicationRequest extends BaseRequest { 15 | 16 | private CreatePublicationRequest(CreatePublicationRequest.Builder builder) { 17 | super(builder, "publications/", HttpMethod.POST, MediaType.APPLICATION_JSON, builder.apiInvoker); 18 | 19 | setBody(builder.spaceRepresentation); 20 | } 21 | 22 | public static CreatePublicationRequest.Builder builder(PublicationRepresentation spaceRepresentation, ApiInvoker apiInvoker) { 23 | return new CreatePublicationRequest.Builder(spaceRepresentation, apiInvoker); 24 | } 25 | 26 | @Override 27 | public Class getResponseType() { 28 | return PublicationRepresentation.class; 29 | } 30 | 31 | public static class Builder extends BaseRequestBuilder { 32 | 33 | private final ApiInvoker apiInvoker; 34 | private final PublicationRepresentation spaceRepresentation; 35 | 36 | Builder(PublicationRepresentation spaceRepresentation, ApiInvoker apiInvoker) { 37 | this.apiInvoker = apiInvoker; 38 | this.spaceRepresentation = spaceRepresentation; 39 | } 40 | 41 | @Override 42 | public CreatePublicationRequest build() { 43 | return new CreatePublicationRequest(this); 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/representation/ContactRepresentation.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.representation; 2 | 3 | import com.fasterxml.jackson.annotation.JsonInclude; 4 | import com.fasterxml.jackson.annotation.JsonInclude.Include; 5 | 
6 | /** 7 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 8 | */ 9 | @JsonInclude(Include.NON_NULL) 10 | public class ContactRepresentation { 11 | 12 | private String tel; 13 | private String email; 14 | 15 | public ContactRepresentation() { 16 | } 17 | 18 | public ContactRepresentation(String tel, String email) { 19 | this.tel = tel; 20 | this.email = email; 21 | } 22 | 23 | public String getTel() { 24 | return tel; 25 | } 26 | 27 | public void setTel(String tel) { 28 | this.tel = tel; 29 | } 30 | 31 | public String getEmail() { 32 | return email; 33 | } 34 | 35 | public void setEmail(String email) { 36 | this.email = email; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/representation/DocumentRepresentation.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.representation; 2 | 3 | import com.fasterxml.jackson.annotation.JsonInclude; 4 | import com.fasterxml.jackson.annotation.JsonInclude.Include; 5 | 6 | import org.apache.commons.lang3.builder.ToStringBuilder; 7 | import org.apache.commons.lang3.builder.ToStringStyle; 8 | 9 | /** 10 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 11 | */ 12 | @JsonInclude(Include.NON_NULL) 13 | public class DocumentRepresentation { 14 | 15 | private String type; 16 | private String url; 17 | 18 | public DocumentRepresentation() { 19 | } 20 | 21 | public DocumentRepresentation(String type, String url) { 22 | this.type = type; 23 | this.url = url; 24 | } 25 | 26 | public String getType() { 27 | return type; 28 | } 29 | 30 | public void setType(String type) { 31 | this.type = type; 32 | } 33 | 34 | public String getUrl() { 35 | return url; 36 | } 37 | 38 | public void setUrl(String url) { 39 | this.url = url; 40 | } 41 | 42 | @Override 43 | public String toString() { 44 | return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) 45 | .append("type", type) 46 | .append("url", url) 47 | .toString(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/client/samples/CzlClientSample.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.client.samples; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | import ro.code4.czl.scrape.client.CzlClient; 7 | import ro.code4.czl.scrape.client.CzlClientConfig; 8 | import ro.code4.czl.scrape.client.authentication.TokenAuthenticationStrategy; 9 | import ro.code4.czl.scrape.client.representation.PublicationRepresentation.PublicationRepresentationBuilder; 10 | 11 | /** 12 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 13 | */ 14 | public class CzlClientSample { 15 | 16 | private static final Logger logger = LoggerFactory.getLogger(CzlClientSample.class); 17 | 18 | public static void main(String[] args) { 19 | 20 | CzlClientConfig clientConfig = CzlClientConfig.builder() 21 | .endpointURI("http://czl-api.code4.ro/api/") 22 | .connectionRequestTimeout(500) 23 | .connectTimeout(500) 24 | .socketTimeout(3000) 25 | .authenticationStrategy(new TokenAuthenticationStrategy()) 26 | .build(); 27 | 28 | try (CzlClient czlClient = CzlClient.newClient(clientConfig)) { 29 | czlClient.apiV1().createPublication(PublicationRepresentationBuilder 30 | .aPublicationRepresentation() 31 | .withIdentifier("1") 32 | 
.withInstitution("finantepub") 33 | .withType("HG") 34 | .withDate("2017-03-08") 35 | .build()) 36 | .execute(); 37 | } catch (Exception e) { 38 | logger.error("Met an error.", e); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/text/ProposalType.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.text; 2 | 3 | /** 4 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 5 | */ 6 | public enum ProposalType { 7 | 8 | HG, LEGE, OM, OG, OUG, OTHER; 9 | 10 | public static ProposalType fromLabel(String label) { 11 | switch (label.toLowerCase()) { 12 | case "hg": 13 | case "hotarare": { 14 | return HG; 15 | } 16 | case "lege": { 17 | return LEGE; 18 | } 19 | case "om": 20 | case "ordin": { 21 | return OM; 22 | } 23 | case "og": { 24 | return OG; 25 | } 26 | case "oug": { 27 | return OUG; 28 | } 29 | default: { 30 | return OTHER; 31 | } 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /_commons-java/src/main/java/ro/code4/czl/scrape/text/RomanianMonth.java: -------------------------------------------------------------------------------- 1 | package ro.code4.czl.scrape.text; 2 | 3 | /** 4 | * @author Ionut-Maxim Margelatu (ionut.margelatu@gmail.com) 5 | */ 6 | public enum RomanianMonth { 7 | 8 | IANUARIE(1), 9 | FEBRUARIE(2), 10 | MARTIE(3), 11 | APRILIE(4), 12 | MAI(5), 13 | IUNIE(6), 14 | IULIE(7), 15 | AUGUST(8), 16 | SEPTEMBRIE(9), 17 | OCTOMBRIE(10), 18 | NOIEMBRIE(11), 19 | DECEMBRIE(12); 20 | 21 | private final int number; 22 | 23 | RomanianMonth(int number) { 24 | this.number = number; 25 | } 26 | 27 | public int getNumber() { 28 | return number; 29 | } 30 | 31 | public static RomanianMonth fromLabel(String value) { 32 | switch (value.toLowerCase()) { 33 | case "ianuarie": { 34 | return IANUARIE; 35 | } 36 | case "februarie": { 37 | return FEBRUARIE; 38 | } 39 | case "martie": { 40 | return MARTIE; 41 | } 42 | case "aprilie": { 43 | return APRILIE; 44 | } 45 | case "mai": { 46 | return MAI; 47 | } 48 | case "iunie": { 49 | return IUNIE; 50 | } 51 | case "iulie": { 52 | return IULIE; 53 | } 54 | case "august": { 55 | return AUGUST; 56 | } 57 | case "septembrie": { 58 | return SEPTEMBRIE; 59 | } 60 | case "octombrie": { 61 | return OCTOMBRIE; 62 | } 63 | case "noiembrie": { 64 | return NOIEMBRIE; 65 | } 66 | case "decembrie": { 67 | return DECEMBRIE; 68 | } 69 | default: { 70 | throw new RuntimeException("Unrecognized month label " + value); 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /afaceri/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul pentru Mediul de Afaceri, Comerț și Antreprenoriat 2 | 3 | Surse de documente este http://www.antreprenoriat.gov.ro/categorie/transparenta-decizionala/proiecte-in-dezbatere-publica/ . 4 | 5 | ### Tehnologie 6 | *NodeJS* - serverul se conecteaza la URL-ul setat in fisierul din config, descarca fisierele PDF, parseaza continutul lor, trimite obiectele generate la API si sterge fisierele PDF de pe disc. 
7 | 8 | ### Instructiuni 9 | Token-ul de autentificare la API trebuie setat in fisierul *config.json*. 10 | 11 | Continutul PDF-urilor se proceseaza in paragrafe. Serverul obtine datele necesare din paragraful relevant. Paragraful relevant reprezinta primul paragraf cu un numar total mai mare de 8 cuvinte si 50 de litere (configurabil in *config.json*) 12 | ``` 13 | npm install 14 | node server/server.js 15 | ``` 16 | 17 | ### Exceptii 18 | Datele documentelor nu exista intr-un format standardizat. Date interpretabile exista in URL-urile fisierelor si in numele acestora. 19 | 20 | La fiecare rulare a server-ului, sunt (re)procesate fisierele din URL-ul principal. 21 | -------------------------------------------------------------------------------- /afaceri/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "afaceri", 3 | "version": "1.0.0", 4 | "main": "server/server.js", 5 | "scripts": { 6 | "lint": "eslint .", 7 | "start": "node .", 8 | "posttest": "npm run lint && nsp check" 9 | }, 10 | "dependencies": { 11 | "async": "^2.1.5", 12 | "cheerio": "^0.22.0", 13 | "compression": "^1.0.3", 14 | "cors": "^2.5.2", 15 | "helmet": "^1.3.0", 16 | "loopback": "^2.22.0", 17 | "loopback-boot": "^2.6.5", 18 | "loopback-component-explorer": "^2.4.0", 19 | "loopback-datasource-juggler": "^2.39.0", 20 | "pdf2json": "^1.1.7", 21 | "serve-favicon": "^2.0.1", 22 | "string": "^3.3.3", 23 | "strong-error-handler": "^1.0.1" 24 | }, 25 | "devDependencies": { 26 | "eslint": "^2.13.1", 27 | "eslint-config-loopback": "^4.0.0", 28 | "nsp": "^2.1.0" 29 | }, 30 | "repository": { 31 | "type": "", 32 | "url": "" 33 | }, 34 | "license": "UNLICENSED", 35 | "description": "afaceri" 36 | } 37 | -------------------------------------------------------------------------------- /afaceri/server/boot/authentication.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = function enableAuthentication(server) { 4 | // enable authentication 5 | server.enableAuth(); 6 | }; 7 | -------------------------------------------------------------------------------- /afaceri/server/boot/root.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = function(server) { 4 | // Install a `/` route that returns server status 5 | var router = server.loopback.Router(); 6 | router.get('/', server.loopback.status()); 7 | server.use(router); 8 | }; 9 | -------------------------------------------------------------------------------- /afaceri/server/component-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "loopback-component-explorer": { 3 | "mountPath": "/explorer" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /afaceri/server/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "restApiRoot": "/api", 3 | "host": "0.0.0.0", 4 | "port": 3000, 5 | "remoting": { 6 | "context": false, 7 | "rest": { 8 | "handleErrors": false, 9 | "normalizeHttpPath": false, 10 | "xml": false 11 | }, 12 | "json": { 13 | "strict": false, 14 | "limit": "100kb" 15 | }, 16 | "urlencoded": { 17 | "extended": true, 18 | "limit": "100kb" 19 | }, 20 | "cors": false 21 | }, 22 | "legacyExplorer": false, 23 | "logoutSessionsOnSensitiveChanges": true, 24 | "userAgent": "jesus", 25 | "downloadsFolder": "downloads", 26 | 
"firstParagraphMinWords": 8, 27 | "firstParagraphMinLetters": 50, 28 | "APIKey": "Token dummy", 29 | "mainURL": "http://www.antreprenoriat.gov.ro/categorie/transparenta-decizionala/proiecte-in-dezbatere-publica/" 30 | } 31 | -------------------------------------------------------------------------------- /afaceri/server/config/keywords.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Andrei on 3/13/2017. 3 | */ 4 | 5 | var docType = [ 6 | { 7 | type: "LEGE", 8 | regex: new RegExp("proiect ([a-zA-Z]+\s?){1,3} ordonan", "i") 9 | }, 10 | { 11 | type: "OUG", 12 | regex: new RegExp("ordonan\\S{1,2} de urgen\\S{1,2}", "i") 13 | }, 14 | { 15 | type: "HG", 16 | regex: new RegExp("hot\S{1}r\S{1}re", "i") 17 | } 18 | ]; 19 | 20 | var titleStartMarkStrings = [ 21 | "privind ", 22 | "pentru " 23 | ]; 24 | 25 | var titleEndMarkStrings = [ 26 | "\n", 27 | "\r\n" 28 | ]; 29 | 30 | var titleEndMarkRegex = [ 31 | new RegExp("sec\\S{1}iune", "i") 32 | ]; 33 | 34 | module.exports = { 35 | docType: docType, 36 | titleStartMarkStrings: titleStartMarkStrings, 37 | titleEndMarkStrings: titleEndMarkStrings, 38 | titleEndMarkRegex: titleEndMarkRegex 39 | }; -------------------------------------------------------------------------------- /afaceri/server/datasources.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": { 3 | "name": "db", 4 | "connector": "memory" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /afaceri/server/middleware.development.json: -------------------------------------------------------------------------------- 1 | { 2 | "final:after": { 3 | "strong-error-handler": { 4 | "params": { 5 | "debug": true, 6 | "log": true 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /afaceri/server/middleware.json: -------------------------------------------------------------------------------- 1 | { 2 | "initial:before": { 3 | "loopback#favicon": {} 4 | }, 5 | "initial": { 6 | "compression": {}, 7 | "cors": { 8 | "params": { 9 | "origin": true, 10 | "credentials": true, 11 | "maxAge": 86400 12 | } 13 | }, 14 | "helmet#xssFilter": {}, 15 | "helmet#frameguard": { 16 | "params": [ 17 | "deny" 18 | ] 19 | }, 20 | "helmet#hsts": { 21 | "params": { 22 | "maxAge": 0, 23 | "includeSubdomains": true 24 | } 25 | }, 26 | "helmet#hidePoweredBy": {}, 27 | "helmet#ieNoOpen": {}, 28 | "helmet#noSniff": {}, 29 | "helmet#noCache": { 30 | "enabled": false 31 | } 32 | }, 33 | "session": {}, 34 | "auth": {}, 35 | "parse": {}, 36 | "routes": { 37 | "loopback#rest": { 38 | "paths": [ 39 | "${restApiRoot}" 40 | ] 41 | } 42 | }, 43 | "files": {}, 44 | "final": { 45 | "loopback#urlNotFound": {} 46 | }, 47 | "final:after": { 48 | "strong-error-handler": {} 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /afaceri/server/model-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "sources": [ 4 | "loopback/common/models", 5 | "loopback/server/models", 6 | "../common/models", 7 | "./models" 8 | ], 9 | "mixins": [ 10 | "loopback/common/mixins", 11 | "loopback/server/mixins", 12 | "../common/mixins", 13 | "./mixins" 14 | ] 15 | }, 16 | "User": { 17 | "dataSource": "db" 18 | }, 19 | "AccessToken": { 20 | "dataSource": "db", 21 | "public": false 22 | }, 23 | "ACL": { 24 | "dataSource": "db", 25 | "public": 
false 26 | }, 27 | "RoleMapping": { 28 | "dataSource": "db", 29 | "public": false, 30 | "options": { 31 | "strictObjectIDCoercion": true 32 | } 33 | }, 34 | "Role": { 35 | "dataSource": "db", 36 | "public": false 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /afaceri/server/server.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var loopback = require('loopback'); 4 | var boot = require('loopback-boot'); 5 | var contentParser = require('./controllers/contentParser'); 6 | 7 | var app = module.exports = loopback(); 8 | 9 | app.start = function() { 10 | // start the web server 11 | return app.listen(function() { 12 | app.emit('started'); 13 | var baseUrl = app.get('url').replace(/\/$/, ''); 14 | console.log('Web server listening at: %s', baseUrl); 15 | if (app.get('loopback-component-explorer')) { 16 | var explorerPath = app.get('loopback-component-explorer').mountPath; 17 | console.log('Browse your REST API at %s%s', baseUrl, explorerPath); 18 | } 19 | contentParser.init(); 20 | }); 21 | }; 22 | 23 | // Bootstrap the application, configure models, datasources and middleware. 24 | // Sub-apps like REST API are mounted via boot scripts. 25 | boot(app, __dirname, function(err) { 26 | if (err) throw err; 27 | 28 | // start the server if `$ node server.js` 29 | if (require.main === module) 30 | app.start(); 31 | }); 32 | -------------------------------------------------------------------------------- /agricultura/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules/ -------------------------------------------------------------------------------- /agricultura/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Agriculturii Şi Dezvoltării Rurale 2 | 3 | ## Tehnologie 4 | NodeJS, [Nightmare](http://www.nightmarejs.org) 5 | 6 | ## Instructiuni 7 | ``` 8 | npm install 9 | API_TOKEN=the_secret_api_token npm start 10 | ``` 11 | 12 | ## Exceptii 13 | -------------------------------------------------------------------------------- /agricultura/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "agricultura", 3 | "version": "1.0.0", 4 | "description": "scraper pentru agricultura", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "todo: add tests", 8 | "start": "node index.js" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/ciprian-chichirita/czl-scrape.git" 13 | }, 14 | "keywords": [ 15 | "code4romania", 16 | "ce", 17 | "zice", 18 | "legea", 19 | "agricultura" 20 | ], 21 | "author": "ciprian chichirita, alex morega", 22 | "license": "MIT", 23 | "bugs": { 24 | "url": "https://github.com/ciprian-chichirita/czl-scrape/issues" 25 | }, 26 | "homepage": "https://github.com/ciprian-chichirita/czl-scrape#readme", 27 | "devDependencies": { 28 | "moment": "^2.17.1", 29 | "nightmare": "^2.10.0", 30 | "request": "^2.81.0", 31 | "request-promise": "^4.1.1", 32 | "sha256": "^0.2.0" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /aparare/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Apărării Naţionale 2 | Sursa documente: http://dlaj.mapn.ro/ 3 | ## Tehnologie 4 | *PHP* - Script simplu old-school 5 | ## Instructiuni 6 | Nu are instructiuni speciale. 
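
The only input the script needs is the czl API token, which it uses, like the other scrapers in this repository, to authenticate its POSTs to the publications endpoint. A minimal sketch of that call, shown in Python purely for illustration (the real scraper is the PHP script `mapn_plugin.php`, and `publication` stands for whatever the script extracted):

```python
import requests

API_URL = "http://czl-api.code4.ro/api/publications/"

def upload(publication, token):
    # `token` is the value received on the command line (see below);
    # it is sent using the repository-wide "Token <value>" header convention.
    response = requests.post(API_URL, json=publication,
                             headers={"Authorization": "Token " + token})
    response.raise_for_status()
```
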
7 | 8 | Tokenul va fi transmis ca argument: 9 | ```bash 10 | $ php mapn_plugin.php TOKEN 11 | ``` 12 | ## Exceptii 13 | Din cauza faptului ca pagina html nu e consistenta, au fost folosite RegExuri pentru a lua informatiile. 14 | 15 | O problema a constat in faptul ca o intrare este constituita pe site-ul acesta din 2 elemente practic, mai exact 16 | titlul proiectului si documentele aferente, dar ele nu pot fi legate una de cealalta logic. De aceea, scriptul 17 | va functiona doar in cazul in care gaseste acelasi numar de titluri si grupuri de documente. 18 | 19 | Scriptul va intoarce false in urmatoarele situatii: 20 | * pagina este down 21 | * unul din elementele cheie de content este schimbat (titlurile nu mai au *, calea spre documente este schimbata) 22 | * numarul de titluri si numarul de grupuri de documente nu este acelasi -------------------------------------------------------------------------------- /apepaduri/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Apelor și Pădurilor 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /cdep/README.md: -------------------------------------------------------------------------------- 1 | # Camera Deputatilor 2 | 3 | ## Tehnologie 4 | python, scrapy 5 | 6 | ## Instructiuni 7 | ``` 8 | pip install -r requirements.txt 9 | API_TOKEN='the secret token' python scraper.py 10 | ``` 11 | 12 | ## Exceptii 13 | -------------------------------------------------------------------------------- /cdep/requirements.in: -------------------------------------------------------------------------------- 1 | scrapy 2 | requests 3 | -------------------------------------------------------------------------------- /cdep/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | asn1crypto==0.21.1 # via cryptography 8 | attrs==19.1.0 # via automat, service-identity, twisted 9 | automat==0.5.0 # via twisted 10 | certifi==2019.9.11 # via requests 11 | cffi==1.9.1 # via cryptography 12 | chardet==3.0.4 # via requests 13 | constantly==15.1.0 # via twisted 14 | cryptography==2.7 # via pyopenssl 15 | cssselect==1.0.1 # via parsel, scrapy 16 | hyperlink==19.0.0 # via twisted 17 | idna==2.7 # via hyperlink, requests 18 | incremental==16.10.1 # via twisted 19 | lxml==3.7.3 # via parsel, scrapy 20 | parsel==1.1.0 # via scrapy 21 | pyasn1-modules==0.0.8 # via service-identity 22 | pyasn1==0.2.3 # via pyasn1-modules, service-identity 23 | pycparser==2.17 # via cffi 24 | pydispatcher==2.0.5 # via scrapy 25 | pyhamcrest==1.9.0 # via twisted 26 | pyopenssl==17.5.0 # via scrapy, service-identity 27 | queuelib==1.4.2 # via scrapy 28 | requests==2.20.0 29 | scrapy==1.3.3 30 | service-identity==16.0.0 # via scrapy 31 | six==1.10.0 # via automat, cryptography, parsel, pyhamcrest, pyopenssl, scrapy, w3lib 32 | twisted==19.7.0 # via scrapy 33 | urllib3==1.24.3 # via requests 34 | w3lib==1.17.0 # via parsel, scrapy 35 | zope.interface==4.6.0 # via twisted 36 | 37 | # The following packages are considered to be unsafe in a requirements file: 38 | # setuptools==41.2.0 # via pyhamcrest, zope.interface 39 | -------------------------------------------------------------------------------- /cdep/scraper.py: 
-------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | import requests 4 | import os 5 | 6 | API_URL = 'http://czl-api.code4.ro/api/publications/' 7 | API_TOKEN = os.environ['API_TOKEN'] 8 | 9 | INDEX_URL = 'http://www.cdep.ro/pls/proiecte/upl_pck2015.lista?cam=2&anp=2017' 10 | 11 | def upload(doc): 12 | headers = {'Authorization': 'Token ' + API_TOKEN} 13 | resp = requests.post(API_URL, json=doc, headers=headers) 14 | if resp.status_code == 400: 15 | if re.search(r'Integrity Error: Key .* already exists', resp.text): 16 | return 17 | assert resp.status_code == 201 18 | 19 | class EducatieSpider(scrapy.Spider): 20 | 21 | name = 'cdep' 22 | start_urls = [INDEX_URL] 23 | 24 | def parse(self, response): 25 | for tr in response.css('.grup-parlamentar-list > table > tbody > tr'): 26 | href = tr.css('a::attr(href)').extract_first() 27 | url = response.urljoin(href) 28 | yield scrapy.Request(url, self.parse_proposal) 29 | 30 | def parse_proposal(self, response): 31 | cale_txt = ' '.join(t.extract() for t in response.css('.cale *::text')) 32 | plx_code = 'pl-x ' + re.search(r'pl-x\s+(\S+)', cale_txt.lower()).group(1) 33 | title = response.css('.detalii-initiativa h4::text').extract_first() 34 | 35 | table = response.css('#olddiv > table')[-1] 36 | for td in table.css('td'): 37 | td_text = (td.css('::text').extract_first() or '').strip() 38 | m = re.match(r'^(\d{2})\.(\d{2})\.(\d{4})$', td_text) 39 | if m: 40 | date = '{}-{}-{}'.format(m.group(3), m.group(2), m.group(1)) 41 | break 42 | 43 | documents = [] 44 | 45 | for pdf_link in response.css('.program-lucru-detalii a'): 46 | target = pdf_link.css('::attr(target)').extract_first() or '' 47 | if target.lower() != 'pdf': 48 | continue 49 | pdf_href = pdf_link.css('::attr(href)').extract_first() 50 | pdf_url = response.urljoin(pdf_href) 51 | label_tds = pdf_link.xpath('../../td') 52 | pdf_label = ' '.join( 53 | td.css('::text').extract_first() 54 | for td in label_tds[1:] 55 | ).strip() 56 | documents.append({ 57 | 'type': pdf_label, 58 | 'url': pdf_url, 59 | }) 60 | 61 | doc = { 62 | 'identifier': plx_code, 63 | 'title': title, 64 | 'institution': 'cdep', 65 | 'description': '', 66 | 'type': 'LEGE', 67 | 'date': date, 68 | 'documents': documents, 69 | } 70 | upload(doc) 71 | 72 | def main(): 73 | from scrapy.crawler import CrawlerProcess, Crawler 74 | process = CrawlerProcess() 75 | process.crawl(EducatieSpider) 76 | process.start() 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /cercetare/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset=utf-8 3 | end_of_line=crlf 4 | insert_final_newline=false 5 | indent_style=space 6 | indent_size=4 7 | 8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}] 9 | indent_style=space 10 | indent_size=2 11 | 12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}] 13 | indent_style=space 14 | indent_size=2 15 | 16 | [{*.applejs,*.js}] 17 | indent_style=space 18 | indent_size=4 19 | 20 | [{.analysis_options,*.yml,*.yaml}] 21 | indent_style=space 22 | indent_size=2 23 | 24 | -------------------------------------------------------------------------------- /cercetare/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | secrets.json 4 | data.json 
-------------------------------------------------------------------------------- /cercetare/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Cercetării și Inovării 2 | 3 | ## Tehnologie 4 | 5 | 6 | JavaScript EcmaScript2015 (ES6) 7 | 8 | 1. nodejs - https://nodejs.org/en/ 9 | 1. nightmare - https://github.com/segmentio/nightmare 10 | 1. cheerio - https://github.com/cheeriojs/cheerio 11 | 1. jsonfile - https://github.com/jprichardson/node-jsonfile 12 | 1. request - https://github.com/request/request 13 | 1. argv - https://github.com/yargs/yargs 14 | 1. diacritics - https://github.com/andrewrk/node-diacritics 15 | 16 | ## Instructiuni 17 | 18 | 1. install nodejs 19 | 1. run `npm update` 20 | 1. run `node app.js`, passing param `--post` will upload to api and also generate a file `data.json` to view data. 21 | 22 | ## Exceptii 23 | -------------------------------------------------------------------------------- /cercetare/app.js: -------------------------------------------------------------------------------- 1 | let nightmareConfig = {show: false}, 2 | cheerio = require('cheerio'), 3 | request = require('request'), 4 | parseProject = require('./parseProject'), 5 | jsonfile = require('jsonfile'), 6 | argv = require('yargs').argv, 7 | secrets = require('./secrets.json') || {}; 8 | 9 | const URL = 'http://www.research.gov.ro/ro/articol/1029/despre-ancs-legislatie-proiecte-de-acte-normative', 10 | BASE = 'http://www.research.gov.ro'; 11 | 12 | const FILE = 'data.json'; 13 | 14 | /** ====== MAIN ====== */ 15 | 16 | getNightmareInstance() 17 | .goto(URL) 18 | .wait('body') 19 | .evaluate(getHTMLContent) 20 | .end() 21 | .then(processHTMLContent) 22 | .then(parseListItems) 23 | .then(postParsedResults) 24 | .catch(handleErrors); 25 | 26 | 27 | /** ====== page ====== */ 28 | 29 | function getHTMLContent() { 30 | return document.querySelector('.icr_main .special_edit').innerHTML; 31 | } 32 | 33 | function processHTMLContent(result) { 34 | console.log('processing html page...'); 35 | 36 | return { 37 | feedback_days_element: cheerio.load(result)('p').children('a[href^=mailto]').parent()[0], 38 | items: cheerio.load(result)('table tbody tr') //.not(function(item) {return cheerio.load(item).text() && cheerio.load(item).text().indexOf('Data publicarii') === -1}) 39 | }; 40 | } 41 | 42 | 43 | /** ====== list items ====== */ 44 | 45 | function parseListItems(resultObject) { 46 | let items = resultObject.items, 47 | parseResults = []; 48 | 49 | items.each(function (i, item) { 50 | let $ = cheerio.load(item), 51 | content = $.text().replace(/\n/g, '').replace(/\t/g, ''); 52 | 53 | if(content && content.indexOf('Data publicarii') != 0) { 54 | parseResults.push(parseItem(resultObject.feedback_days_element, item)); 55 | } 56 | }); 57 | 58 | return parseResults; 59 | } 60 | 61 | function parseItem(feedback_days, item) { 62 | return parseProject(cheerio.load(item), BASE, cheerio.load(feedback_days)); 63 | } 64 | 65 | 66 | /** ====== post ====== */ 67 | 68 | function postParsedResults(parsedResultsArr) { 69 | 70 | console.log('saving data to file...'); 71 | 72 | jsonfile.writeFileSync(FILE, parsedResultsArr, {spaces: 4}); 73 | 74 | if (argv.post) { 75 | if (!(secrets.API_URL && secrets.TOKEN)) { 76 | throw new Error('Share your secrets with me. 
Pretty please :)'); 77 | } 78 | 79 | console.log('posting data to api...'); 80 | 81 | let requestsArr = []; 82 | 83 | parsedResultsArr.forEach(function (result, i) { 84 | let promise = new Promise(function (resolve, reject) { 85 | request({ 86 | uri: secrets.API_URL, 87 | method: 'POST', 88 | headers: { 89 | 'Authorization': 'Token ' + secrets.TOKEN, 90 | 'Content-Type': 'application/json' 91 | }, 92 | json: result 93 | }, function (error, response, body) { 94 | if (error || response.statusCode !== 200) { 95 | console.error('request failed: ', error) 96 | } 97 | 98 | resolve(body); 99 | }) 100 | }); 101 | 102 | requestsArr.push(promise); 103 | }); 104 | 105 | Promise.all(requestsArr).then(function (response) { 106 | console.log('done!'); 107 | process.exit(0); 108 | }).catch(function (err) { 109 | throw new Error(err); 110 | }); 111 | } else { 112 | console.log('done!'); 113 | process.exit(0); 114 | } 115 | } 116 | 117 | 118 | /** ====== utils ====== */ 119 | 120 | function getNightmareInstance() { 121 | return require('nightmare')(nightmareConfig); 122 | } 123 | 124 | function handleErrors(error) { 125 | throw new Error(error); 126 | } -------------------------------------------------------------------------------- /cercetare/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "diacritics": "1.3.0", 14 | "jsonfile": "2.4.0", 15 | "nightmare": "2.10.0", 16 | "nodemon": "1.11.0", 17 | "q": "1.4.1", 18 | "request": "^2.81.0", 19 | "yargs": "7.0.2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /cercetare/secrets.json.txt: -------------------------------------------------------------------------------- 1 | { 2 | "TOKEN": "something something", 3 | "API_URL": "http://something.com/api/post-parsed-results" 4 | } -------------------------------------------------------------------------------- /dezvoltare/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | .DS_Store 4 | *.egg-info 5 | build 6 | *.pyc 7 | **/*.pyc 8 | dbs 9 | -------------------------------------------------------------------------------- /dezvoltare/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Dezvoltării Regionale, Administrației Publice și Fondurilor Europene 2 | 3 | http://www.mdrap.gov.ro/transparenta/consultari-publice/ 4 | 5 | ## Tehnologie 6 | 7 | *Python 2.7* 8 | [Scrapy 1.3.3](https://scrapy.org/) 9 | 10 | ## Instructiuni 11 | 12 | ``` 13 | pip install -r requirements.txt 14 | cd crawl_dezvoltare 15 | scrapy crawl mdrap -a token=xxxx 16 | ``` 17 | 18 | ## Exceptii -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/dezvoltare/crawl_dezvoltare/crawl_dezvoltare/__init__.py -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/exporters.py: -------------------------------------------------------------------------------- 1 | 
from scrapy.exporters import BaseItemExporter -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawlDezvoltareItem(scrapy.Item): 12 | identifier = scrapy.Field() 13 | title = scrapy.Field() 14 | type = scrapy.Field() 15 | institution = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | tel = scrapy.Field() 22 | email = scrapy.Field() 23 | documents = scrapy.Field() -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlDezvoltareSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import requests 8 | 9 | class CrawlDezvoltarePipeline(object): 10 | def process_item(self, item, spider): 11 | doc = { 12 | 'identifier': item['identifier'], 13 | 'title': item['title'], 14 | 'institution': item['institution'], 15 | 'description': item['description'], 16 | 'type': item['type'], 17 | 'date': item['date'], 18 | 'documents': item['documents'], 19 | 'contact':item['contact'], 20 | 'feedback_days': item['feedback_days'] 21 | } 22 | 23 | response = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token ' + spider.token }, json=doc) 24 | # print '---------' 25 | # print response 26 | # print response.text 27 | # print '---------' 28 | return item 29 | 30 | 31 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl_dezvoltare project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl_dezvoltare' 13 | 14 | SPIDER_MODULES = ['crawl_dezvoltare.spiders'] 15 | NEWSPIDER_MODULE = 'crawl_dezvoltare.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'code4romania (http://code4.ro)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl_dezvoltare.middlewares.CrawlDezvoltareSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl_dezvoltare.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl_dezvoltare.pipelines.CrawlDezvoltarePipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/crawl_dezvoltare/spiders/testing.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | item = {'contact': {'addr': u'Apolodor, nr. 17, sector 5', 5 | 'email': u'iulia.matei@mdrap.ro', 6 | 'fax': u'0372.114.569.'}, 7 | 'date': u'22-02-2017', 8 | 'description': u'\xcen temeiul art. 7 din Legea nr. 52/2003 privind transparen\u0163a decizional\u0103 \xeen administra\u0163ia public\u0103, republicat\u0103, Ministerul Dezvolt\u0103rii Regionale, Administra\u0163iei Publice si Fondurilor Europene aduce la cuno\u015ftin\u0163a publicului textul urm\u0103torului proiect de act normativ \u2013 Ordin al viceprim-ministrului, ministrul dezvolt\u0103rii regionale, administra\u0163iei publice \u0219i fondurilor europene pentru aplicarea prevederilor art. III, alin. (11) din Ordonan\u0163a de urgen\u0163\u0103 a Guvernului nr. 63/2010 pentru modificarea \u015fi completarea Legii nr. 
273/2006 privind finan\u0163ele publice locale, precum \u015fi pentru stabilirea unor m\u0103suri financiare.', 9 | 'documents': [{'type': u'Referat de aprobare', 10 | 'url': '/userfiles/referat_ordin_oug63.doc'}], 11 | 'feedback_days': u'10', 12 | 'identifier': u'proiect-de-omdrapfe-pentru-aplicarea-prevederilor-art-iii-alin-11-din-ordonanta-de-urgenta-a-guvernului-nr-632010-pentru-modificarea-si-completarea-legii-nr-2732006-privind-finantele-publice-locale-precum-si-pentru-stabilirea-unor-masuri-financiare-22-02-2017', 13 | 'institution': 'dezvoltare', 14 | 'title': u'Proiect de OMDRAPFE pentru aplicarea prevederilor art. III, alin. (11) din Ordonan\u0163a de urgen\u0163\u0103 a Guvernului nr. 63/2010 pentru modificarea \u015fi completarea Legii nr. 273/2006 privind finan\u0163ele publice locale, precum \u015fi pentru stabilirea unor m\u0103suri financiare ', 15 | 'type': 'OMDRAPFE'} 16 | 17 | r = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token dezvoltare-very-secret-token'}, data=item) 18 | -------------------------------------------------------------------------------- /dezvoltare/crawl_dezvoltare/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawl_dezvoltare.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawl_dezvoltare 12 | -------------------------------------------------------------------------------- /dezvoltare/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | appnope==0.1.0 3 | asn1crypto==0.21.1 4 | attrs==16.3.0 5 | Automat==0.5.0 6 | backports.shutil-get-terminal-size==1.0.0 7 | beautifulsoup4==4.5.3 8 | cffi==1.9.1 9 | constantly==15.1.0 10 | cryptography==1.8.1 11 | cssselect==1.0.1 12 | decorator==4.0.11 13 | enum34==1.1.6 14 | idna==2.5 15 | incremental==16.10.1 16 | ipaddress==1.0.18 17 | ipython==5.3.0 18 | ipython-genutils==0.1.0 19 | lxml==3.7.3 20 | packaging==16.8 21 | parsel==1.1.0 22 | pathlib2==2.2.1 23 | pexpect==4.2.1 24 | pickleshare==0.7.4 25 | prompt-toolkit==1.0.13 26 | ptyprocess==0.5.1 27 | pyasn1==0.2.3 28 | pyasn1-modules==0.0.8 29 | pycparser==2.17 30 | PyDispatcher==2.0.5 31 | Pygments==2.2.0 32 | pyOpenSSL==17.5.0 33 | pyparsing==2.2.0 34 | queuelib==1.4.2 35 | requests==2.20.0 36 | scandir==1.5 37 | Scrapy==1.3.3 38 | service-identity==16.0.0 39 | simplegeneric==0.8.1 40 | six==1.10.0 41 | slugify==0.0.1 42 | traitlets==4.3.2 43 | Twisted==19.7.0 44 | w3lib==1.17.0 45 | wcwidth==0.1.7 46 | zope.interface==4.3.3 47 | -------------------------------------------------------------------------------- /economie/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset=utf-8 3 | end_of_line=crlf 4 | insert_final_newline=false 5 | indent_style=space 6 | indent_size=4 7 | 8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}] 9 | indent_style=space 10 | indent_size=2 11 | 12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}] 13 | indent_style=space 14 | indent_size=2 15 | 16 | [{*.applejs,*.js}] 17 | indent_style=space 18 | indent_size=4 19 | 20 | [{.analysis_options,*.yml,*.yaml}] 21 | indent_style=space 22 | indent_size=2 23 | 24 | 
-------------------------------------------------------------------------------- /economie/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | secrets.json 4 | data.json -------------------------------------------------------------------------------- /economie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Economiei, Comerțului și Relațiilor cu Mediul de Afaceri 2 | 3 | ## Tehnologie 4 | 5 | JavaScript EcmaScript2015 (ES6) 6 | 7 | 1. nodejs - https://nodejs.org/en/ 8 | 1. nightmare - https://github.com/segmentio/nightmare 9 | 1. cheerio - https://github.com/cheeriojs/cheerio 10 | 1. jsonfile - https://github.com/jprichardson/node-jsonfile 11 | 1. request - https://github.com/request/request 12 | 1. argv - https://github.com/yargs/yargs 13 | 1. diacritics - https://github.com/andrewrk/node-diacritics 14 | 15 | ## Instructiuni 16 | 17 | 1. install nodejs 18 | 1. run `npm update` 19 | 1. run `node app.js`, passing param `--post` will upload to api and also generate a file `data.json` to view data. 20 | 21 | ## Exceptii 22 | 23 | -------------------------------------------------------------------------------- /economie/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "diacritics": "1.3.0", 14 | "jsonfile": "2.4.0", 15 | "nightmare": "2.10.0", 16 | "nodemon": "1.11.0", 17 | "q": "1.4.1", 18 | "request": "^2.81.0", 19 | "yargs": "7.0.2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /economie/secrets.json.txt: -------------------------------------------------------------------------------- 1 | { 2 | "TOKEN": "something something", 3 | "API_URL": "http://something.com/api/post-parsed-results" 4 | } -------------------------------------------------------------------------------- /educatie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Educaţiei Naţionale și Cercetării Științifice 2 | 3 | ## Tehnologie 4 | 5 | Node.js, [nightmare](http://www.nightmarejs.org/) 6 | 7 | ## Instrucțiuni 8 | 9 | ``` 10 | npm install 11 | ``` 12 | 13 | edit config.js, change API token (can also be specified on the command line) and other config vars 14 | 15 | ``` 16 | [API_TOKEN=foobar] npm start 17 | ``` 18 | 19 | ## Excepții 20 | -------------------------------------------------------------------------------- /educatie/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | api: { 3 | url: 'http://czl-api.code4.ro/api/publications/', 4 | token: 'educatie-very-secret-key' 5 | }, 6 | scrape: { 7 | //url of the proposals listing page 8 | baseUrl: 'https://www.edu.ro/proiecte-acte-normative-0', 9 | //how many proposals to consider 10 | proposals: 20, 11 | defaultEmail: 'dgis@edu.gov.ro' 12 | } 13 | }; 14 | -------------------------------------------------------------------------------- /educatie/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "edu-scraper", 3 | "version": "1.0.0", 4 | "description": "Data scraper pentru Ministerul Educatiei", 5 | "main": 
"index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/lbogdan/czl-scrape" 13 | }, 14 | "author": { 15 | "name": "Bogdan Luca", 16 | "email": "luca.bogdan@gmail.com" 17 | }, 18 | "license": "MIT", 19 | "dependencies": { 20 | "diacritics": "^1.3.0", 21 | "jsonfile": "^2.4.0", 22 | "moment": "^2.17.1", 23 | "nightmare": "^2.10.0", 24 | "request-promise": "^4.1.1" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /energie/.gitignore: -------------------------------------------------------------------------------- 1 | # OS files 2 | .DS_Store 3 | 4 | # Java files 5 | *.class 6 | 7 | # Log files 8 | *.log 9 | logs 10 | 11 | # Maven 12 | target 13 | pom.xml.versionsBackup 14 | 15 | # Dropwizard 16 | dependency-reduced-pom.xml 17 | 18 | # Mobile Tools for Java (J2ME) 19 | .mtj.tmp/ 20 | 21 | # Package Files 22 | *.jar 23 | *.war 24 | *.ear 25 | 26 | # IntelliJ IDEA 27 | *.iml 28 | .idea 29 | 30 | # Eclipse 31 | .project 32 | .settings 33 | .classpath 34 | test-output 35 | 36 | # Vim 37 | *.swp 38 | 39 | # Virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 40 | hs_err_pid* 41 | 42 | # Misc 43 | *git.properties 44 | 45 | # Asciidoc 46 | .asciidoctor 47 | diag-*.png 48 | -------------------------------------------------------------------------------- /energie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Energiei 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /energie/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | ro.code4.czl 7 | czl-scrape 8 | 0.0.1-SNAPSHOT 9 | ../ 10 | 11 | 12 | czl-scrape-energie 13 | jar 14 | Ce Zice Legea :: Scraper :: Energie 15 | 16 | 17 | 18 | ${project.groupId} 19 | czl-scrape-commons 20 | ${project.version} 21 | 22 | 23 | 24 | org.jsoup 25 | jsoup 26 | 27 | 28 | 29 | 30 | ch.qos.logback 31 | logback-classic 32 | runtime 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /energie/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | scraper-energie.log 6 | 7 | 8 | scraper-energie.%i.log.zip 9 | 1 10 | 3 11 | 12 | 13 | 14 | 500MB 15 | 16 | 17 | 18 | %date{"yyyy-MM-dd'T'HH:mm:ss,SSSXXX", UTC} [%t] %-5level %c{1.} %msg%n 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 2048 28 | 29 | 30 | 31 | 0 32 | 33 | 34 | 35 | false 36 | 37 | 38 | 39 | 40 | %date{"yyyy-MM-dd'T'HH:mm:ss,SSSXXX", UTC} [%t] %-5level %c{1.} %msg%n 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /externe/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Afacerilor Externe 2 | 3 | ## Tehnologie 4 | - Python 3 (developed and tested on 3.5.2) 5 | - BeautifulSoup 4 6 | - Requests 7 | - Click 8 | - **E**xtraordinarily **U**nderwhelming but also **S**uper **E**levated **B**inary **I**nformation **U**nit. 9 | 10 | A.K.A Eusebiu. 
11 | 12 | ## Instructiuni 13 | Pentru a-l convinge pe Eusebiu să ia la mână articolele de pe site-ul MAE, trebuie să: 14 | - Instalezi `python3` si `pip` 15 | - Rulezi `python3 setup.py install` sau cu `sudo` in fata, daca nu ai un virtualenv 16 | - Ca să aflii ce poate Eusebiu să facă pentru umanitate: `python eusebiu.py --help`: 17 | ``` 18 | Options: 19 | --page TEXT Selects the page to scrape. Available options are: 20 | 21 | scrapes the latest articles and falls back to 22 | observer mode 23 | ____________________________________________________ 24 | 25 | scrape the 2016 archive and switch to 26 | observer mode 27 | ____________________________________________________ 28 | 29 | scrape the 2014-2015 archive and switch 30 | to observer mode 31 | ____________________________________________________ 32 | --log_level TEXT Sets the logging level. Available values: ERROR, 33 | WARNING, INFO, DEBUG, 34 | --delay FLOAT Number of hours to wait before checking for changes. 35 | Default=1 36 | --observer Periodically checks for changes and scrapes them if 37 | available. 38 | --help Show this message and exit. 39 | ``` 40 | ## Exceptii 41 | Eusebiu se bazeaza in mare parte pe regex-uri pentru a extrage (silit, sau nu) informatii 42 | de la MAE. 43 | 44 | In cazul in care persoanele responsabil pentru introducerea articolelor in sistem 45 | se decid subit sa foloseasca alte pattern-uri decat cele pe le intelege Eusebiu, scraperul va 46 | genera articole invalide. Daca un articol nu contine toate detalii obligatorii, Eusebiu nu-i va 47 | face POST la API. 48 | -------------------------------------------------------------------------------- /externe/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = '17.03.12' 2 | -------------------------------------------------------------------------------- /externe/eusebiu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import click 3 | import inspect 4 | import logging 5 | import os 6 | 7 | from scraper.article_serializer import ArticleSerializer 8 | from scraper.extractor import * 9 | from utils.api_client import post_data 10 | from utils.settings import * 11 | 12 | 13 | @click.command() 14 | @click.option('--page', default='feed', help=CLICK_HELPER['page']) 15 | @click.option('--log_level', default='INFO', help=CLICK_HELPER['log-level']) 16 | @click.option('--delay', default=1, type=float, help=CLICK_HELPER['delay']) 17 | @click.option('--observer', is_flag=True, default=False, 18 | help=CLICK_HELPER['observer']) 19 | def get_to_work(page, delay, observer, log_level): 20 | # init logging 21 | if log_level not in LOG_LEVELS: 22 | logging.warning('Unrecognized log_level: %s. Defaulting to INFO') 23 | log_level = 'INFO' 24 | 25 | current_dir = os.path.dirname( 26 | os.path.abspath(inspect.getfile(inspect.currentframe())) 27 | ) 28 | logs_dir = current_dir + LOGS_DIR 29 | if not os.path.exists(logs_dir): 30 | os.makedirs(logs_dir) 31 | 32 | logging.basicConfig(filename=LOG_FILE, level=LOG_LEVELS[log_level], 33 | format='%(asctime)s %(levelname)s %(message)s') 34 | 35 | # if observer flag is set, ignore everything else and start eavesdropping 36 | if observer: 37 | shut_up_and_listen(delay) 38 | 39 | # validate page selection 40 | if page not in SCRAPER_PAGES: 41 | logging.error('Page name: %s not recognized. 
See help for available pages', page) 42 | exit() 43 | 44 | # scrape all articles on this page, and dump them on the API 45 | dump_one_of_these(page) 46 | 47 | # then get back to eavesdropping 48 | shut_up_and_listen(delay) 49 | 50 | 51 | def shut_up_and_listen(delay): 52 | """ Eusebiu skillfully lurks in the shadows, waiting for a new article to be posted. 53 | :param delay: int: number of hours to wait before the next tactical strike. 54 | :return: None 55 | """ 56 | current_latest = [] 57 | while True: 58 | feed_extractor = Extractor(settings.URLS.get('feed')) 59 | latest_entries = feed_extractor.get_identifier_list() 60 | logging.debug('latest_entries: %s', latest_entries) 61 | 62 | if not current_latest: 63 | logging.info('Assuming current state of feed is the latest ...') 64 | current_latest = latest_entries[:] 65 | 66 | diff = set(current_latest) - set(latest_entries) 67 | for identifier in diff: 68 | # be polite to the MAE website 69 | time.sleep(0.5) 70 | logging.info('Found new article: %s', identifier) 71 | article = feed_extractor.get_article_by_id(identifier) 72 | diff.remove(article.identifier) 73 | post_article(article) 74 | 75 | logging.info('ETA until next scrape: %s hour(s)', delay) 76 | time.sleep(hours_to_sec(delay)) 77 | 78 | 79 | def dump_one_of_these(page): 80 | """ 81 | Eusebiu masterfully extracts all the articles on a given page, and swiftly dumps 82 | them onto the API. 83 | :param page: the page to eviscerate. 84 | :return: None 85 | """ 86 | extractor = Extractor(settings.URLS.get(page)) 87 | articles = extractor.get_all_articles() 88 | for article in articles: 89 | # be polite to the API 90 | time.sleep(0.5) 91 | post_article(article) 92 | 93 | 94 | def post_article(article): 95 | """Attempts to POST and article to the API. 96 | :param article: the object to POST. 97 | :return: True if successful, False otherwise. 98 | """ 99 | if not ArticleSerializer().is_valid(article): 100 | logging.error('Invalid article: %s \n WILL NOT POST TO API', article) 101 | return False 102 | data = ArticleSerializer().serialize(article) 103 | return post_data(data) 104 | 105 | 106 | if __name__ == '__main__': 107 | get_to_work() 108 | -------------------------------------------------------------------------------- /externe/scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/externe/scraper/__init__.py -------------------------------------------------------------------------------- /externe/scraper/article_serializer.py: -------------------------------------------------------------------------------- 1 | from utils import settings 2 | 3 | 4 | class ArticleSerializer: 5 | @staticmethod 6 | def serialize(article): 7 | return dict( 8 | # TODO 9 | identifier=article.identifier, 10 | title=article.title, 11 | type=article.article_type, 12 | institution=settings.INSTITUTION, 13 | date=article.published_at.isoformat(), 14 | description='N\A', 15 | feedback_days=article.feedback_days, 16 | contact=article.contact, 17 | documents=article.documents, 18 | ) 19 | 20 | @staticmethod 21 | def is_valid(article): 22 | """Checks if an Article is valid, according to the API specs. 
23 | :param article: The Article instance to validate 24 | :return: True or False 25 | """ 26 | for field in settings.MANDATORY_FIELDS: 27 | if not getattr(article, field): 28 | return False 29 | return True 30 | -------------------------------------------------------------------------------- /externe/scraper/extractor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup as beautiful_soup 3 | 4 | import utils.settings as settings 5 | from scraper.article import Article 6 | 7 | 8 | class Extractor: 9 | """Extractor object, responsible for fetching data from the MAE website. 10 | """ 11 | url = None 12 | content = None 13 | articles = None 14 | 15 | def __init__(self, url): 16 | self.url = url 17 | self.content = self._fetch_page() 18 | 19 | def get_all_articles(self): 20 | """Generates a list of all Article objects fetched from MAE. 21 | :return: the list of Articles 22 | """ 23 | self.articles = [Article(table) for table in self._get_tables()] 24 | return self.articles 25 | 26 | def get_article_by_id(self, identifier): 27 | """Returns the article matching the given identifier. 28 | :param identifier: the id 29 | :return: the matching Article, or None 30 | """ 31 | if not self.articles: 32 | self.get_all_articles() 33 | 34 | for a in self.articles: 35 | if a.identifier == identifier: 36 | return a 37 | 38 | def get_identifier_list(self): 39 | """Extracts a list of identifiers of the latest articles. 40 | :return: list 41 | """ 42 | latest = [] 43 | for table in self._get_tables(): 44 | tr = table.select('tr') 45 | article = Article() 46 | article._extract_article_type(tr) 47 | article._extract_title(tr) 48 | article._generate_id() 49 | latest.append(article.identifier) 50 | return latest 51 | 52 | def _fetch_page(self): 53 | page = requests.get(self.url, headers=settings.HEADERS) 54 | return beautiful_soup(page.text, 'html.parser') 55 | 56 | def _get_tables(self): 57 | return self.content.select_one('div.art').select('table') 58 | -------------------------------------------------------------------------------- /externe/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import re 3 | from setuptools import setup 4 | 5 | install_requires = [ 6 | 'beautifulsoup4', 7 | 'requests', 8 | 'click', 9 | 'lxml' 10 | ] 11 | 12 | version_regex = re.compile("VERSION\s*=\s*'(.*?)'$") 13 | 14 | with open('__init__.py') as stream: 15 | VERSION = version_regex.search(stream.read()).group(1) 16 | 17 | setup( 18 | version=VERSION, 19 | name='mae-scraper', 20 | url='https://github.com/code4romania/czl-scrape/tree/master/externe', 21 | author='Rares Urdea, Alexandru Hodorogea', 22 | author_email='contact@code4.ro', 23 | description='Scraper pentru site-ul Ministerului de Afaceri Externe', 24 | install_requires=install_requires, 25 | ) 26 | -------------------------------------------------------------------------------- /externe/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/externe/utils/__init__.py -------------------------------------------------------------------------------- /externe/utils/api_client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | import time 4 | 5 | from utils.settings import * 6 | 7 | 8 | def post_data(data): 9 
| attempts = 5 10 | success = False 11 | 12 | while not success and attempts > 0: 13 | attempts -= 1 14 | response = requests.post(URLS['api-publications'], data, headers=HEADERS) 15 | 16 | if _already_exists(response): 17 | logging.warning( 18 | 'Object: %s \nalready exists, according to API. Skipping.', data 19 | ) 20 | break 21 | 22 | success = response.status_code == STATUS_CREATED 23 | if success: 24 | break 25 | time.sleep(30) 26 | 27 | if not success: 28 | logging.error('Failed to POST data to API: %s', data) 29 | 30 | return success 31 | 32 | 33 | def _already_exists(response): 34 | return response.status_code == STATUS_BAD_REQUEST \ 35 | and ALREADY_EXISTS in response.text.lower() 36 | -------------------------------------------------------------------------------- /externe/utils/lang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | class LangHelper(object): 3 | FUCK_NO = [ 4 | # new line 5 | '\n', 6 | # tab 7 | '\t', 8 | # non-breaking space 9 | '\xa0', 10 | # 0 width space 11 | '\u200b' 12 | ] 13 | 14 | @staticmethod 15 | def englishize_romanian(string): 16 | symbols = (u"țţȚŢșşȘŞăĂîÎâÂ", 17 | u"ttTTssSSaAiIaA") 18 | 19 | tr = {ord(a): ord(b) for a, b in zip(*symbols)} 20 | return string.translate(tr) 21 | 22 | @staticmethod 23 | def beautify_romanian(string): 24 | symbols = (u"ţşŢŞ", 25 | u"țșȚȘ") 26 | tr = {ord(a): ord(b) for a, b in zip(*symbols)} 27 | return string.translate(tr) 28 | 29 | @staticmethod 30 | def sanitize(string): 31 | """Sanitize a string. 32 | Removes new lines and 0 width spaces, because fuck those. 33 | 34 | :param string: The string to sanitize. 35 | :return: A clean string. 36 | """ 37 | if string: 38 | for this_little_shit in LangHelper.FUCK_NO: 39 | string = string.replace(this_little_shit, '') 40 | return string 41 | -------------------------------------------------------------------------------- /externe/utils/settings.py: -------------------------------------------------------------------------------- 1 | WAIT = { 2 | '1_sec': 1, 3 | '0.5_sec': 0.5 4 | } 5 | 6 | HEADERS = { 7 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) ' 8 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 9 | 'Chrome/39.0.2171.95 Safari/537.36', 10 | 'Authorization': 'Token externe-very-secret-key' 11 | } 12 | 13 | SCRAPER_PAGES = [ 14 | 'arhiva-1415', 15 | 'arhiva-2016', 16 | 'feed' 17 | ] 18 | 19 | # The keys linking to MAE pages need to match the items in SCRAPER_PAGES 20 | URLS = { 21 | 'mae_base': 'http://www.mae.ro', 22 | 'feed': 'https://www.mae.ro/node/2011#null', 23 | 'arhiva-2016': 'http://www.mae.ro/node/40248', 24 | 'arhiva-1415': 'http://www.mae.ro/node/35609', 25 | 'api-publications': 'http://czl-api.code4.ro/api/publications/' 26 | } 27 | 28 | STATUS_CREATED = 201 29 | STATUS_BAD_REQUEST = 400 30 | ALREADY_EXISTS = 'already exists' 31 | 32 | TYPES = { 33 | 'HOTARARE': 'HG', 34 | 'ORDONANTA': 'OG', 35 | 'ORDONANTA DE URGENTA': 'OUG', 36 | 'ORDINUL MINISTRULUI AFACERILOR EXTERNE': 'OM', 37 | 'ORDIN': 'OM', 38 | 'PROIECT DE LEGE': 'LEGE', 39 | 'LEGE': 'LEGE', 40 | 'OTHER': 'OTHER' 41 | } 42 | 43 | MONTHS = dict( 44 | ianuarie='01', 45 | februarie='02', 46 | martie='03', 47 | aprilie='04', 48 | mai='05', 49 | iunie='06', 50 | iulie='07', 51 | august='08', 52 | septembrie='09', 53 | octombrie='10', 54 | noiembrie='11', 55 | decembrie='12' 56 | ) 57 | 58 | CLICK_HELPER = { 59 | 60 | 'log-level': '\b Sets the logging level. 
Available values: ERROR, WARNING, INFO, DEBUG,', 61 | 'page': """ 62 | \b Selects the page to scrape. Available options are: 63 | \b scrapes the latest articles and falls back to observer mode 64 | ____________________________________________________ 65 | \b scrape the 2016 archive and switch to observer mode 66 | ____________________________________________________ 67 | \b scrape the 2014-2015 archive and switch 68 | to observer mode 69 | ____________________________________________________ 70 | """, 71 | 'observer': 'Periodically checks for changes and scrapes them if available. ' 72 | 'NOTE: in observer mode, any argument is ignored.', 73 | 'delay': 'Number of hours to wait before checking for changes. Default=1' 74 | } 75 | 76 | LOG_LEVELS = { 77 | 'ERROR': 40, 78 | 'WARNING': 30, 79 | 'INFO': 20, 80 | 'DEBUG': 10 81 | } 82 | 83 | LOG_FILE = 'logs/scraper.log' 84 | LOGS_DIR = '/logs' 85 | 86 | INSTITUTION = 'externe' 87 | 88 | MANDATORY_FIELDS = ['identifier', 'title', 'published_at', 'article_type'] 89 | 90 | DATE_FMT = '%Y-%m-%d' 91 | 92 | 93 | def hours_to_sec(hours): 94 | return hours * 3600 95 | -------------------------------------------------------------------------------- /finantepub/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | -------------------------------------------------------------------------------- /finantepub/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Finanţelor Publice 2 | 3 | ## Tehnologie 4 | NodeJS, [Nightmare](http://www.nightmarejs.org) 5 | 6 | ## Instructiuni 7 | ``` 8 | npm install 9 | API_TOKEN=the_secret_api_token npm start 10 | ``` 11 | 12 | ## Exceptii 13 | -------------------------------------------------------------------------------- /finantepub/index.js: -------------------------------------------------------------------------------- 1 | const sha256 = require('sha256'); 2 | const rp = require('request-promise'); 3 | const Nightmare = require('nightmare'); 4 | const nightmare = Nightmare({ show: false, typeInterval: 2, waitTimeout: 5000 }); 5 | 6 | const YEAR_THRESHOLD = 2017; 7 | 8 | const API_TOKEN = process.env['API_TOKEN']; 9 | 10 | function guessType(text) { 11 | text = text.toLowerCase().trim(); 12 | text = text.replace(/^proiect\s*/, ''); 13 | if(text.match(/^ordonanță de urgență/)) return 'OUG'; 14 | if(text.match(/^lege/)) return 'LEGE'; 15 | if(text.match(/^ordin/)) return 'OG'; 16 | if(text.match(/^hotărâre/)) return 'HG'; 17 | throw new Error(`failz: ${text}`); 18 | } 19 | 20 | function parsePage(page = 1) { 21 | nightmare 22 | .cookies.clear() 23 | .useragent(`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.${Math.round(Math.random()*100)}`) 24 | .goto(`http://www.mfinante.gov.ro/transparent.html?method=transparenta&pagina=acasa&locale=ro&d-6834775-p=${page}`) 25 | .wait("#transparentaList") 26 | .evaluate(()=> { 27 | if(document.querySelector('#transparentaList').innerText.trim() == '') return; 28 | let itemsList = [], items = [... document.querySelectorAll('#transparentaList > tbody > tr ')]; 29 | for (let item of items) { 30 | let text = item.innerText; 31 | let match = text.replace(/\s+/g, ' ').match( 32 | /(.*?)\s*- publicat în data de\s*(\d{2})\.(\d{2})\.(\d{4})/); 33 | 34 | if(! 
match) { 35 | throw new Error(`Can't match title and date in text: "${text}"`); 36 | } 37 | 38 | let documents = [] 39 | let links = item.querySelectorAll('a.downlPDF'); 40 | for (let doc of links) { 41 | documents.push({ 42 | type: 'act', 43 | url: doc.href 44 | }); 45 | } 46 | 47 | let returnObj = { 48 | title: match[1], 49 | date: `${match[4]}-${match[3]}-${match[2]}`, 50 | documents: documents, 51 | label: links[0].innerText 52 | }; 53 | 54 | itemsList.push(returnObj); 55 | } 56 | return itemsList; 57 | }) 58 | .then((result) => { 59 | 60 | if(! result) { 61 | console.log("halt!"); 62 | nightmare.halt(); 63 | return; 64 | } 65 | 66 | let itemsList = []; 67 | 68 | for(let val of result) { 69 | let year = val.date.split('-')[0] 70 | if (year < YEAR_THRESHOLD) { 71 | console.log("halt!"); 72 | nightmare.halt(); 73 | return; 74 | } 75 | 76 | val.identifier = sha256(val.documents[0].url); 77 | val.institution = 'finantepub'; 78 | val.description = ''; 79 | val.type = guessType(val.label); 80 | delete val.label; 81 | itemsList.push(val); 82 | } 83 | 84 | function postAllItems(remaining) { 85 | if(! remaining.length) return; 86 | let val = remaining[0]; 87 | return rp.post({ 88 | url: 'http://czl-api.code4.ro/api/publications/', 89 | headers: {Authorization: `Token ${API_TOKEN}`}, 90 | json: val 91 | }) 92 | .then(() => { 93 | console.log('posted item: ', val.identifier); 94 | return postAllItems(remaining.slice(1)); 95 | }); 96 | } 97 | 98 | return postAllItems(itemsList); 99 | 100 | }) 101 | .then(() => { 102 | parsePage(page + 1); 103 | }) 104 | .catch((error) => { 105 | console.error('error:', error); 106 | nightmare.halt(); 107 | }); 108 | } 109 | 110 | parsePage(); 111 | -------------------------------------------------------------------------------- /finantepub/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "finantepub", 3 | "version": "1.0.0", 4 | "description": "## Tehnologie", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/mgax/czl-scrape.git" 12 | }, 13 | "author": "ciprian chichirita, alex morega", 14 | "license": "MIT", 15 | "devDependencies": { 16 | "nightmare": "^2.10.0", 17 | "request-promise": "^4.1.1", 18 | "sha256": "^0.2.0" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /interne/.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset=utf-8 3 | end_of_line=crlf 4 | insert_final_newline=false 5 | indent_style=space 6 | indent_size=4 7 | 8 | [{*.jhm,*.xslt,*.xul,*.rng,*.xsl,*.xsd,*.ant,*.svg,*.tld,*.fxml,*.jrxml,*.xml,*.jnlp,*.wsdl}] 9 | indent_style=space 10 | indent_size=2 11 | 12 | [{.eslintrc,.babelrc,.stylelintrc,*.json,*.jsb3,*.jsb2,*.bowerrc}] 13 | indent_style=space 14 | indent_size=2 15 | 16 | [{*.applejs,*.js}] 17 | indent_style=space 18 | indent_size=4 19 | 20 | [{.analysis_options,*.yml,*.yaml}] 21 | indent_style=space 22 | indent_size=2 23 | 24 | -------------------------------------------------------------------------------- /interne/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | secrets.json 4 | data.json -------------------------------------------------------------------------------- /interne/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul 
Afacerilor Interne 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /interne/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "diacritics": "1.3.0", 14 | "jsonfile": "2.4.0", 15 | "nightmare": "2.10.0", 16 | "nodemon": "1.11.0", 17 | "q": "1.4.1", 18 | "request": "2.81.0", 19 | "yargs": "7.0.2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /interne/secrets.json.txt: -------------------------------------------------------------------------------- 1 | { 2 | "TOKEN": "something something", 3 | "API_URL": "http://something.com/api/post-parsed-results" 4 | } -------------------------------------------------------------------------------- /justitie/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .scrapy 3 | 4 | *.pyc 5 | __pycache__ 6 | **/__pycache__ 7 | -------------------------------------------------------------------------------- /justitie/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Justiţiei 2 | 3 | ## Tehnologie 4 | 5 | *Python 3.6*, [virtualenv](https://virtualenv.pypa.io/) e un prieten bun 6 | [Scrapy](https://scrapy.org/) 7 | 8 | ``` 9 | pip install -r requirements.txt 10 | 11 | # on windows: 12 | pip install win32api 13 | ``` 14 | 15 | ## Instructiuni 16 | 17 | ``` 18 | scrapy crawl publication 19 | ``` 20 | 21 | ## Altele 22 | 23 | Data understading & values 24 | * [online](https://etherpad.net/p/hackajust) 25 | * see doc folder 26 | -------------------------------------------------------------------------------- /justitie/just/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/justitie/just/__init__.py -------------------------------------------------------------------------------- /justitie/just/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class JustPublication(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | identifier = scrapy.Field() 14 | title = scrapy.Field() 15 | type = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | documents = scrapy.Field() 22 | 23 | pass 24 | 25 | -------------------------------------------------------------------------------- /justitie/just/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JustSpiderMiddleware(object): 12 | # Not all methods 
need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /justitie/just/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import requests 9 | import json 10 | import logging 11 | 12 | from just.items import JustPublication 13 | import logging 14 | 15 | API_KEY = 'justitie-very-secret-key' 16 | API_PUBLICATIONS = 'http://czl-api.code4.ro/api/publications/' 17 | 18 | class JustPublicationsToApiPipeline(object): 19 | def process_item(self, item, spider): 20 | 21 | if type(item) != JustPublication: 22 | return item 23 | 24 | r = requests.post(API_PUBLICATIONS, json=dict(item), headers={'Authorization': 'Token %s' % (API_KEY,) } ) 25 | 26 | 27 | if r.status_code == 200 or r.status_code == '200': 28 | logging.log(msg=r.status_code, level=logging.INFO) 29 | else: 30 | logging.log(msg=r.status_code, level=logging.ERROR) 31 | logging.log(msg=r.content, level=logging.INFO) 32 | 33 | return item 34 | -------------------------------------------------------------------------------- /justitie/just/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for just project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'just' 13 | 14 | SPIDER_MODULES = ['just.spiders'] 15 | NEWSPIDER_MODULE = 'just.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | USER_AGENT = 'code4romania (+http://www.code4.ro)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | LOG_ENABLED = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 5 32 | # The download delay setting will honor only one of: 33 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'just.middlewares.JustSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'just.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.AutoThrottle': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'just.pipelines.JustPublicationsToApiPipeline': 100, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | # AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | # AUTOTHROTTLE_MAX_DELAY = 30 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0 82 | # Enable showing throttling stats for every response received: 83 | # AUTOTHROTTLE_DEBUG = True 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | HTTPCACHE_ENABLED = True 88 | HTTPCACHE_EXPIRATION_SECS = 30 89 | HTTPCACHE_DIR = 'httpcache' 90 | HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /justitie/just/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /justitie/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | asn1crypto==0.21.1 3 | attrs==16.3.0 4 | Automat==0.5.0 5 | cffi==1.9.1 6 | constantly==15.1.0 7 | convertdate==2.1.0 8 | cryptography==1.8.1 9 | cssselect==1.0.1 10 | ephem==3.7.6.0 11 | idna==2.5 12 | incremental==16.10.1 13 | jdatetime==1.8.2 14 | lxml==3.7.3 15 | packaging==16.8 16 | parsel==1.1.0 17 | pyasn1==0.2.3 18 | pyasn1-modules==0.0.8 19 | pycparser==2.17 20 | PyDispatcher==2.0.5 21 | pyOpenSSL==17.5.0 22 | pyparsing==2.2.0 23 | pytz==2016.10 24 | queuelib==1.4.2 25 | regex==2017.2.8 26 | ruamel.yaml==0.13.14 27 | Scrapy==1.3.3 28 | service-identity==16.0.0 29 | six==1.10.0 30 | Twisted==19.7.0 31 | tzlocal==1.3 32 | umalqurra==0.2 33 | Unidecode==0.4.20 34 | w3lib==1.17.0 35 | zope.interface==4.3.3 36 | -------------------------------------------------------------------------------- /justitie/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = just.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = just 12 | -------------------------------------------------------------------------------- /mediu/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | .DS_Store 4 | *.egg-info 5 | build 6 | *.pyc 7 | **/*.pyc 8 | dbs 9 | -------------------------------------------------------------------------------- /mediu/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Mediului 2 | 3 | ## Tehnologie 4 | 5 | *Python 2.7* 6 | [Scrapy 1.3.3](https://scrapy.org/) 7 | 8 | ## Instructiuni 9 | 10 | ``` 11 | pip install -r requirements.txt 12 | cd crawl_mediu 13 | scrapy crawl mmediu -a token=xxxx 14 | ``` 15 | 16 | 17 | ## Exceptii -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/mediu/crawl_mediu/crawl_mediu/__init__.py -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawlMediuItem(scrapy.Item): 12 | identifier = scrapy.Field() 13 | title = scrapy.Field() 14 | type = scrapy.Field() 15 | institution = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | tel = scrapy.Field() 22 | email = scrapy.Field() 23 
| documents = scrapy.Field() -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlMediuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import requests 8 | 9 | class CrawlMediuPipeline(object): 10 | def process_item(self, item, spider): 11 | doc = { 12 | 'identifier': item['identifier'], 13 | 'title': item['title'], 14 | 'institution': item['institution'], 15 | 'description': item['description'], 16 | 'type': item['type'], 17 | 'date': item['date'], 18 | 'documents': item['documents'], 19 | 'contact':item['contact'], 20 | 'feedback_days': item['feedback_days'] 21 | } 22 | 23 | response = requests.post('http://czl-api.code4.ro/api/publications/', headers={'Authorization': 'Token ' + spider.token }, json=doc) 24 | # print '---------' 25 | # print response 26 | # print response.text 27 | # print '---------' 28 | return item 29 | 30 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl_mediu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl_mediu' 13 | 14 | SPIDER_MODULES = ['crawl_mediu.spiders'] 15 | NEWSPIDER_MODULE = 'crawl_mediu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'crawl_mediu (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl_mediu.middlewares.CrawlMediuSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | 
#DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl_mediu.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl_mediu.pipelines.CrawlMediuPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/crawl_mediu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /mediu/crawl_mediu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawl_mediu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawl_mediu 12 | -------------------------------------------------------------------------------- /mediu/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | appnope==0.1.0 3 | asn1crypto==0.21.1 4 | attrs==16.3.0 5 | Automat==0.5.0 6 | backports.shutil-get-terminal-size==1.0.0 7 | beautifulsoup4==4.5.3 8 | cffi==1.9.1 9 | constantly==15.1.0 10 | cryptography==1.8.1 11 | cssselect==1.0.1 12 | decorator==4.0.11 13 | enum34==1.1.6 14 | idna==2.5 15 | incremental==16.10.1 16 | ipaddress==1.0.18 17 | ipython==5.3.0 18 | ipython-genutils==0.1.0 19 | lxml==3.7.3 20 | packaging==16.8 21 | parsel==1.1.0 22 | pathlib2==2.2.1 23 | pexpect==4.2.1 24 | pickleshare==0.7.4 25 | prompt-toolkit==1.0.13 26 | ptyprocess==0.5.1 27 | pyasn1==0.2.3 28 | pyasn1-modules==0.0.8 29 | pycparser==2.17 30 | PyDispatcher==2.0.5 31 | Pygments==2.2.0 32 | pyOpenSSL==16.2.0 33 | pyparsing==2.2.0 34 | queuelib==1.4.2 35 | requests==2.13.0 36 | scandir==1.5 37 | Scrapy==1.3.3 38 | service-identity==16.0.0 39 | simplegeneric==0.8.1 40 | six==1.10.0 41 | slugify==0.0.1 42 | traitlets==4.3.2 43 | Twisted==17.1.0 44 | w3lib==1.17.0 45 | wcwidth==0.1.7 46 | zope.interface==4.3.3 47 | -------------------------------------------------------------------------------- /presedinte/README.md: -------------------------------------------------------------------------------- 1 | # Presedintia 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /pretutindeni/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | node_modules 3 | parseProject.js -------------------------------------------------------------------------------- /pretutindeni/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul pentru Românii de Pretutindeni 2 | 3 | 1. http://www.dprp.gov.ro/documente-in-consultare-publica/ 4 | 5 | ## Tehnologie 6 | 7 | 1. nodejs - https://nodejs.org/en/ 8 | 2. nightmarejs - https://github.com/segmentio/nightmare 9 | 10 | ## Instructiuni 11 | 12 | ## Exceptii 13 | 14 | Oamenii care updateaza chestia asta sunt exceptii. Paragrafe fara structura aruncate pur si simplu acolo. Foarte dificil de parsat. 
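Since the page is just unstructured paragraphs, a reasonable starting point is to dump its raw text and inspect it by hand. The sketch below is illustrative only — it is not the project's actual crawler — and assumes nothing beyond the `nightmare` dependency already listed above and the URL from this README.

```js
// Illustrative sketch: fetch the consultation page with Nightmare and print its
// raw text, since the paragraphs have no reliable structure to parse directly.
const Nightmare = require('nightmare');
const nightmare = Nightmare({ show: false });

nightmare
  .goto('http://www.dprp.gov.ro/documente-in-consultare-publica/')
  .evaluate(() => document.body.innerText)
  .end()
  .then((text) => console.log(text))
  .catch((err) => console.error('scrape failed:', err));
```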
-------------------------------------------------------------------------------- /pretutindeni/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pretutindeni", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "crawl": "node app.js" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "0.22.0", 13 | "nightmare": "2.10.0" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pretutindeni/parseProject.example: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio') 2 | 3 | module.exports = function(project) { 4 | "use strict"; 5 | 6 | console.log(project); 7 | }; -------------------------------------------------------------------------------- /relparlament/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul pentru Relaţia cu Parlamentul 2 | 3 | ## Tehnologie 4 | 5 | ## Instructiuni 6 | 7 | ## Exceptii -------------------------------------------------------------------------------- /relparlament/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "relparlament", 3 | "version": "1.0.0", 4 | "description": "## Tehnologie", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "author": "Mihnea Beldescu", 11 | "license": "ISC", 12 | "dependencies": { 13 | "cheerio": "^0.22.0", 14 | "lokijs": "^1.4.3", 15 | "nightmare": "^2.10.0", 16 | "request": "^2.81.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /sanatate/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | -------------------------------------------------------------------------------- /sanatate/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Sănătăţii 2 | Crawler simplu, de la țară, făcut cu scrapy. Nu știe bine românește, dar înțelege oricum (face fuzzy matching pe titluri ca să scoată tipul de act normativ). 3 | ## Tehnologie 4 | - python3, pip 5 | - scrapy, fuzzywuzzy, urllib3 6 | - python-Levenshtein [opțional] 7 | 8 | ## Instructiuni 9 | Bagi chestii în _credentials.json_, după care un clasic _pip install -r requirements.txt_ și un clasic _scrapy crawl sanatate_. 10 | ## Exceptii 11 | Detectarea tipului de act normativ nu e perfectă, și nici a tipului de documente. Asta e o problemă mai mare, și nu are sens să o tratăm doar într-un singur crawler. 
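The fuzzy matching mentioned above boils down to scoring a few act-type keywords against the beginning of the title. The sketch below is illustrative only; the real logic (including Romanian-to-ASCII normalization) lives in `scrapy_proj/helpers/legal.py`.

```python
# Illustrative sketch of the fuzzy title -> act-type matching; the actual
# implementation is LegalHelper.get_type_from_title in scrapy_proj/helpers/legal.py.
import re
import fuzzywuzzy.fuzz as fuzz

TYPE_KEYWORDS = {
    'HG': 'hotarare',
    'OM': 'ordin',
    'LEGE': 'lege',
    'OG': 'ordonanta',
    'OUG': 'ordonanta de urgenta',
}

def guess_type(title):
    title = title.lower()
    # Only the part before "pentru"/"privind" usually names the act type.
    cut = re.search(r'(pentru|privind)', title)
    prefix = title[:cut.start()] if cut else title
    # Pick the keyword with the highest similarity score (0-100).
    return max(TYPE_KEYWORDS, key=lambda key: fuzz.ratio(TYPE_KEYWORDS[key], prefix))

# guess_type('hotarare privind aprobarea normelor ...')  ->  'HG' (typically)
```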
12 | -------------------------------------------------------------------------------- /sanatate/credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "endpoint": "http://czl-api.code4.ro/api/publications/", 3 | "authorization": "weeee" 4 | } -------------------------------------------------------------------------------- /sanatate/requirements.txt: -------------------------------------------------------------------------------- 1 | fuzzywuzzy==0.15.0 2 | python-Levenshtein==0.12.0 3 | Scrapy==1.3.3 4 | urllib3==1.20 5 | -------------------------------------------------------------------------------- /sanatate/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapy_proj.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapy_proj 12 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/sanatate/scrapy_proj/__init__.py -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.helpers.legal import * 4 | from scrapy_proj.helpers.romanian import * 5 | from scrapy_proj.helpers.text import * 6 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/legal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import fuzzywuzzy.fuzz as fuzz 5 | 6 | from scrapy_proj.helpers.romanian import * 7 | 8 | class LegalHelper(object): 9 | @staticmethod 10 | def get_type_from_title(title): 11 | engrol = RomanianHelper.englishize_romanian(title).lower() 12 | 13 | stop_pos = len(title) 14 | magic_keyword_search_result = re.search(r'(pentru|privind)', engrol) 15 | if magic_keyword_search_result != None: 16 | stop_pos = magic_keyword_search_result.start() 17 | 18 | search_space = engrol[:stop_pos] 19 | 20 | type_to_keywords = { 21 | 'HG': 'hotarare', 22 | 'OM': 'ordin', 23 | 'LEGE': 'lege', 24 | 'OG': 'ordonanta', 25 | 'OUG': 'ordonanta de urgenta' 26 | } 27 | 28 | final_type = None 29 | max_ratio = 0 30 | 31 | for key in type_to_keywords: 32 | ratio = fuzz.ratio(type_to_keywords[key], search_space) 33 | if ratio > max_ratio: 34 | max_ratio = ratio 35 | final_type = key 36 | 37 | return final_type 38 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/romanian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class RomanianHelper(object): 4 | @staticmethod 5 | def englishize_romanian(string): 6 | symbols = (u"țţȚŢșşȘŞăǎĂîÎâÂ", 7 | u"ttTTssSSaaAiIaA") 8 | 9 | tr = {ord(a):ord(b) for a, b in zip(*symbols)} 10 | 11 | return string.translate(tr) 12 | 13 | @staticmethod 14 | def beautify_romanian(string): 15 | symbols = (u"ǎţşŢŞ", 16 | u"ățșȚȘ") 17 | tr = {ord(a):ord(b) for a, b in 
zip(*symbols)} 18 | return string.translate(tr) 19 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/helpers/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | class TextHelper(object): 6 | 7 | @staticmethod 8 | def remove_non_ascii(string): 9 | return re.sub(r'[^\x00-\x7F]+', ' ', string) 10 | 11 | @staticmethod 12 | def remove_non_numeric(string): 13 | return re.sub('[^0-9]+', '', string) 14 | 15 | @staticmethod 16 | def rws(str): 17 | if str: 18 | return ' '.join(str.split()) 19 | else: 20 | return None 21 | 22 | @staticmethod 23 | def titleize(string): 24 | if string: 25 | return string.title() 26 | else: 27 | return None 28 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/items/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.items.act import * 4 | from scrapy_proj.items.contact import * 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/items/act.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | class ActItem(scrapy.Item): 6 | identifier = scrapy.Field() 7 | title = scrapy.Field(serializer=str) 8 | type = scrapy.Field() 9 | institution = scrapy.Field() 10 | date = scrapy.Field() 11 | description = scrapy.Field() 12 | feedback_days = scrapy.Field(serializer=int) 13 | contact = scrapy.Field() 14 | documents = scrapy.Field() 15 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/items/contact.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | class ContactItem(scrapy.Item): 6 | tel = scrapy.Field(serializer=str) 7 | email = scrapy.Field(serializer=str) 8 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.loaders.act import * 4 | from scrapy_proj.loaders.contact import * 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/loaders/act.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy_proj.helpers import * 5 | from scrapy.loader.processors import * 6 | from datetime import datetime as dt 7 | 8 | class ActLoader(ItemLoader): 9 | default_output_processor = TakeFirst() 10 | title_in = MapCompose(TextHelper.rws, RomanianHelper.beautify_romanian) 11 | contact_in = Compose(TakeFirst(), lambda x: dict(x)) 12 | date_in = MapCompose(lambda d: dt.strptime(d, '%d-%m-%Y').strftime('%Y-%m-%d')) 13 | feedback_days_in = MapCompose(int) 14 | documents_in = Identity() 15 | documents_out = Identity() 16 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/loaders/contact.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy_proj.helpers import * 5 | from 
scrapy.loader.processors import * 6 | 7 | class ContactLoader(ItemLoader): 8 | default_output_processor = TakeFirst() 9 | email_in = MapCompose(str.lower) 10 | tel_in = MapCompose(TextHelper.remove_non_numeric) 11 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy_proj.pipelines.extrameta import * 4 | from scrapy_proj.pipelines.post import * 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/pipelines/extrameta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import datetime 4 | import hashlib 5 | 6 | from scrapy_proj.helpers import * 7 | 8 | class SanatatePipelineExtraMeta(object): 9 | def process_item(self, item, spider): 10 | item['institution'] = spider.name 11 | act_type = LegalHelper.get_type_from_title(item['title']) 12 | if act_type == None: 13 | raise scrapy.exceptions.DropItem 14 | item['type'] = act_type 15 | engrol = RomanianHelper.englishize_romanian(item['title']).lower() 16 | engrolna = TextHelper.remove_non_ascii(engrol) 17 | identifier_text = '{0} {1}'.format(engrolna, item['date'] if 'date' in item else 'NA') 18 | identifier_text_hashed = hashlib.md5(identifier_text.encode()).hexdigest() 19 | item['identifier'] = '{0}-{1}-{2}'.format(item['institution'], item['type'], identifier_text_hashed) 20 | return item 21 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/pipelines/post.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import urllib3 5 | 6 | class SanatatePipelinePost(object): 7 | def open_spider(self, spider): 8 | with open('credentials.json') as credentials_file: 9 | self.credentials = json.load(credentials_file) 10 | def process_item(self, item, spider): 11 | http = urllib3.PoolManager() 12 | r = http.request( 13 | 'POST', 14 | self.credentials['endpoint'], 15 | headers={ 16 | 'Content-Type': 'application/json', 17 | 'Authorization': self.credentials['authorization'] 18 | }, 19 | body=json.dumps(dict(item)) 20 | ) 21 | return item 22 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for scrapy_proj project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'scrapy_proj' 13 | 14 | SPIDER_MODULES = ['scrapy_proj.spiders'] 15 | NEWSPIDER_MODULE = 'scrapy_proj.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'scrapy_proj (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'scrapy_proj.middlewares.ScrapyProjSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'scrapy_proj.middlewares.MyCustomDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'scrapy_proj.pipelines.SanatatePipelineExtraMeta': 298, 68 | 'scrapy_proj.pipelines.SanatatePipelinePost': 299, 69 | } 70 | 71 | LOG_LEVEL = 'WARNING' 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
-------------------------------------------------------------------------------- /sanatate/scrapy_proj/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sanatate/scrapy_proj/spiders/sanatate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | import scrapy_proj.items as items 5 | import scrapy_proj.loaders as loaders 6 | import re 7 | import sys 8 | 9 | class SanatateSpider(scrapy.Spider): 10 | name = 'sanatate' 11 | 12 | def start_requests(self): 13 | urls = [ 14 | 'http://www.ms.ro/acte-normative-in-transparenta/?vpage=2', 15 | ] 16 | 17 | for url in urls: 18 | yield scrapy.Request(url=url, callback=self.parse) 19 | 20 | def parse(self, response): 21 | date_regex = re.compile('de\s+la\s+(\d{1,2}[-/]\d{2}[-/]\d{4})') 22 | email_regex = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+') 23 | tel_regex = re.compile(r'[^0-9](0(?:[0-9].?){9})') 24 | feedback_days_regex = re.compile(r'termen.*limita.*[^[0-9]]*([0-9]{1,2}).*zi') 25 | 26 | for item in response.css('.panel'): 27 | heading = item.css('div.panel-heading') 28 | body = item.css('div.panel-body') 29 | body_text = ''.join(body.xpath('.//text()').extract()).lower() 30 | 31 | title = item.css('a.panel-title::text').extract_first() 32 | 33 | loader = loaders.ActLoader(items.ActItem()) 34 | loader.add_value('title', title) 35 | 36 | contact_loader = loaders.ContactLoader(items.ContactItem()) 37 | contact_loader.add_value('tel', tel_regex.findall(body_text)) 38 | contact_loader.add_value('email', email_regex.findall(body_text)) 39 | loader.add_value('contact', contact_loader.load_item()) 40 | loader.add_value('date', date_regex.findall(body_text)) 41 | loader.add_value('feedback_days', feedback_days_regex.findall(body_text)) 42 | 43 | keys = ['type', 'url'] 44 | types = body.xpath('.//a[contains(@href, ".pdf")]').xpath('text()').extract() 45 | urls = body.xpath('.//a[contains(@href, ".pdf")]').xpath('@href').extract() 46 | docs = [[types[i], urls[i]] for i in range(len(types))] 47 | loader.add_value('documents', [dict(zip(keys, doc)) for doc in docs]) 48 | 49 | yield loader.load_item() 50 | 51 | next_pages = response.css('.pt-cv-pagination a::attr(href)').extract() 52 | next_pages.reverse() 53 | for next_page in next_pages: 54 | next_page = response.urljoin(next_page) 55 | yield scrapy.Request(next_page, callback=self.parse) 56 | -------------------------------------------------------------------------------- /scrapy/.gitignore: -------------------------------------------------------------------------------- 1 | /.cache 2 | *.pyc 3 | -------------------------------------------------------------------------------- /scrapy/Readme.md: -------------------------------------------------------------------------------- 1 | # Scrapere scrise cu scrapy 2 | 3 | O colecție de scrapere implementate folosind [scrapy](https://scrapy.org). 4 | Fiecărei instituții îi corespunde un scraper care descarcă publicații de pe site. 5 | Mai departe, publicațiile sunt validate într-un pipeline comun, și trimise la 6 | [api](http://czl-api.code4.ro). 
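For reference, a publication corresponds to the fields declared in `czlscrape/items.py`; an illustrative payload (the values below are invented) looks roughly like this:

```python
# Illustrative only: field names come from the Publication item in czlscrape/items.py;
# the values are made up for the example.
publication = {
    "institution": "dialog",
    "identifier": "exemplu-123",
    "type": "HG",
    "date": "2017-03-20",
    "title": "Hotărâre privind ...",
    "description": "Scurtă descriere a proiectului de act normativ.",
    "documents": [{"type": "pdf", "url": "http://example.com/proiect.pdf"}],
    "contact": {"email": "contact@example.com"},
    "feedback_days": 10,
}
```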
7 | 8 | ## Spidere implementate 9 | * [`dialog`](czlscrape/spiders/dialog.py) - Ministerul Consultărilor Publice și 10 | Dialogului Social 11 | 12 | ## Instrucțiuni 13 | * Ai nevoie de python3, preferabil cu un 14 | [virtualenv](https://virtualenv.pypa.io). 15 | 16 | * Instalezi dependențele: 17 | ```sh 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | * Configurezi variabile de mediu: 22 | ```sh 23 | export API_TOKEN='the secret token' 24 | export SENTRY_DSN='the sentry dsn' # opțional 25 | ``` 26 | 27 | * Rulezi unul din spidere: 28 | ```sh 29 | scrapy crawl dialog 30 | ``` 31 | 32 | * După ce faci schimbări în cod, rulezi testele: 33 | ```sh 34 | pytest 35 | ``` 36 | -------------------------------------------------------------------------------- /scrapy/czlscrape/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import logging 4 | 5 | if 'SENTRY_DSN' in os.environ: 6 | import logging 7 | from raven.handlers.logging import SentryHandler 8 | from raven.conf import setup_logging 9 | setup_logging(SentryHandler(os.environ['SENTRY_DSN'], level=logging.WARN)) 10 | 11 | logging.Formatter.converter = time.gmtime 12 | -------------------------------------------------------------------------------- /scrapy/czlscrape/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Publication(scrapy.Item): 12 | institution = scrapy.Field() 13 | identifier = scrapy.Field() 14 | type = scrapy.Field() 15 | date = scrapy.Field() 16 | title = scrapy.Field() 17 | description = scrapy.Field() 18 | documents = scrapy.Field() 19 | contact = scrapy.Field() 20 | feedback_days = scrapy.Field() 21 | max_feedback_date = scrapy.Field() 22 | -------------------------------------------------------------------------------- /scrapy/czlscrape/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CzlScrapeSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy/czlscrape/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import re 10 | import logging 11 | from scrapy.exceptions import DropItem 12 | import requests 13 | 14 | API_URL = 'http://czl-api.code4.ro/api/publications/' 15 | API_TOKEN = os.environ.get('API_TOKEN') 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.WARN) 19 | 20 | 21 | class UploadPipeline(object): 22 | def process_item(self, item, spider): 23 | self.upload(item) 24 | return item 25 | 26 | def upload(self, item): 27 | if not API_TOKEN: 28 | print(item) 29 | return 30 | 31 | headers = {'Authorization': 'Token ' + API_TOKEN} 32 | resp = requests.post(API_URL, json=dict(item), headers=headers) 33 | if resp.status_code == 400: 34 | if re.search(r'Integrity Error: Key .* already exists', resp.text): 35 | return 36 | if resp.status_code != 201: 37 | msg = "Failed to upload publication: {!r}".format(resp) 38 | raise RuntimeError(msg) 39 | 40 | 41 | class PublicationValidatorPipeline(object): 42 | 43 | REQUIRED_FIELDS = [ 44 | 'identifier', 45 | 'title', 46 | 'institution', 47 | 'description', 48 | 'type', 49 | 'date', 50 | ] 51 | 52 | def process_item(self, item, spider): 53 | for field in self.REQUIRED_FIELDS: 54 | if not item.get(field): 55 | message = "Missing field {}".format(field) 56 | logger.warn(message) 57 | raise DropItem(message) 58 | return item 59 | -------------------------------------------------------------------------------- /scrapy/czlscrape/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for czlscrape project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'czlscrape' 13 | 14 | SPIDER_MODULES = ['czlscrape.spiders'] 15 | NEWSPIDER_MODULE = 'czlscrape.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'czlscrape (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'czlscrape.middlewares.CzlScrapeSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'czlscrape.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'czlscrape.pipelines.PublicationValidatorPipeline': 300, 69 | 'czlscrape.pipelines.UploadPipeline': 1000, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | LOG_LEVEL = 'DEBUG' 94 | LOG_FORMAT = '%(asctime)s srv="czl-scrape" [%(thread)d] %(levelname)s %(name)s 
%(funcName)s: %(message)s' 95 | LOG_DATEFORMAT = '%Y-%m-%dT%H:%M:%SZ' 96 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/afaceri.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | from ..items import Publication 4 | 5 | INDEX_URL = 'http://www.aippimm.ro/categorie/transparenta-decizionala---modificare-hg-96-2011/' 6 | 7 | def text_from(sel): 8 | return (sel.xpath('string(.)').extract_first() or "").strip() 9 | 10 | def guess_publication_type(text): 11 | text = text.lower() 12 | text = re.sub(r'[șş]', 's', text) 13 | text = re.sub(r'[țţ]', 't', text) 14 | text = re.sub(r'[ăâ]', 'a', text) 15 | text = re.sub(r'[î]', 'i', text) 16 | rules = [ 17 | ("lege", "LEGE"), 18 | ("hotarare de guvern", "HG"), 19 | ("hotarare a guvernului", "HG"), 20 | ("hg", "HG"), 21 | ("ordonanta de guvern", "OG"), 22 | ("oug", "OUG"), 23 | ("ordonanta de urgenta", "OUG"), 24 | ("ordin de ministru", "OM"), 25 | ("ordinul", "OM"), 26 | ] 27 | for substr, publication_type in rules: 28 | if substr in text: 29 | return publication_type 30 | else: 31 | return "OTHER" 32 | 33 | class AfaceriSpider(scrapy.Spider): 34 | 35 | name = 'afaceri' 36 | start_urls = [INDEX_URL] 37 | 38 | def parse(self, response): 39 | for article in response.css('.article_container'): 40 | link = article.css('a.lead_subcat') 41 | title = text_from(link) 42 | if not title: 43 | continue 44 | 45 | date_match = re.search( 46 | r'(?P<day>\d{2})\.(?P<month>\d{2})\.(?P<year>\d{4})$', 47 | text_from(article.css('ul.lead')), 48 | ) 49 | date = "{year}-{month}-{day}".format(**date_match.groupdict()) 50 | 51 | identifier = link.css('::attr(href)').extract_first().split('/')[-1] 52 | publication_type = guess_publication_type(title) 53 | 54 | documents = [ 55 | { 56 | 'type': href.split('.')[-1], 57 | 'url': href, 58 | } 59 | for href in article.css('a.files::attr(href)').extract() 60 | ] 61 | 62 | yield Publication( 63 | identifier=identifier, 64 | title=title, 65 | institution='afaceri', 66 | description=title, 67 | type=publication_type, 68 | date=date, 69 | documents=documents, 70 | ) 71 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/dialog.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | 4 | from czlscrape.utils import guess_initiative_type 5 | from ..items import Publication 6 | 7 | INDEX_URL = 'http://dialogsocial.gov.ro/categorie/proiecte-de-acte-normative/' 8 | 9 | DOC_EXTENSIONS = [ 10 | ".docs", ".doc", ".txt", ".crt", ".xls", 11 | ".xml", ".pdf", ".docx", ".xlsx", 12 | ] 13 | 14 | TYPE_RULES = [ 15 | ("lege", "LEGE"), 16 | ("hotarare de guvern", "HG"), 17 | ("hotarare a guvernului", "HG"), 18 | ("ordonanta de guvern", "OG"), 19 | ("ordonanta de urgenta", "OUG"), 20 | ("ordin de ministru", "OM"), 21 | ("ordinul", "OM"), 22 | ] 23 | 24 | 25 | def text_from(sel): 26 | return sel.xpath('string(.)').extract_first().strip() 27 | 28 | 29 | class DialogSpider(scrapy.Spider): 30 | 31 | name = 'dialog' 32 | start_urls = [INDEX_URL] 33 | 34
| def parse(self, response): 35 | for article in response.css('#content article.post'): 36 | href = article.css('.entry-title a::attr(href)').extract_first() 37 | yield scrapy.Request(response.urljoin(href), self.parse_article) 38 | 39 | def parse_article(self, response): 40 | title = text_from(response.css('h1')) 41 | publication_type = guess_initiative_type(title, TYPE_RULES) 42 | 43 | article = response.css('#content article.post')[0] 44 | 45 | id_value = article.css('::attr(id)').extract_first() 46 | identifier = re.match(r'post-(\d+)', id_value).group(1) 47 | 48 | date = ( 49 | article.css('time.entry-date::attr(datetime)') 50 | .extract_first()[:10] 51 | ) 52 | 53 | # remove
<div class="fb-comments"> and everything below 54 | to_remove = article.css('.fb-comments')[0].root 55 | while to_remove is not None: 56 | next_to_remove = to_remove.getnext() 57 | to_remove.getparent().remove(to_remove) 58 | to_remove = next_to_remove 59 | 60 | documents = [ 61 | { 62 | 'type': href.split('.')[-1], 63 | 'url': href, 64 | } 65 | for href in article.css('a::attr(href)').extract() 66 | if any(href.endswith(ext) for ext in DOC_EXTENSIONS) 67 | ] 68 | 69 | return Publication( 70 | identifier=identifier, 71 | title=title, 72 | institution='dialog', 73 | description=text_from(article), 74 | type=publication_type, 75 | date=date, 76 | documents=documents, 77 | ) 78 | 79 | 80 | def main(): 81 | from scrapy.crawler import CrawlerProcess 82 | process = CrawlerProcess() 83 | process.crawl(DialogSpider) 84 | process.start() 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /scrapy/czlscrape/spiders/senat.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import datetime 4 | import re 5 | 6 | from scrapy import Spider, Request 7 | 8 | from ..items import Publication 9 | from ..utils import extract_documents 10 | 11 | INDEX_URL = 'https://www.senat.ro/LegiProiect.aspx' 12 | 13 | 14 | class SenatSpider(Spider): 15 | name = 'senat' 16 | start_urls = [INDEX_URL] 17 | 18 | def parse(self, response): 19 | for entry in response.css('#GridViewProiecte tr > td:nth-child(2) a'): 20 | href = entry.css('a::attr(href)').extract_first() 21 | yield Request(response.urljoin(href), self.parse_entry) 22 | 23 | def parse_entry(self, response): 24 | identifier = response.css( 25 | '#ctl00_B_Center_ctl06_viewFisa_lblNr::text').extract_first() 26 | description = response.css( 27 | '#ctl00_B_Center_ctl06_grdTitlu_ctl02_Label1::text').extract_first() 28 | title = description 29 | date_string = response.css( 30 | '#ctl00_B_Center_ctl06_grdDerulare_ctl02_Label1::text').extract_first() 31 | date_match = re.match( 32 | '^(?P<day>\d{1,2})\-(?P<month>\d{1,2})\-(?P<year>\d{4})$', 33 | date_string) 34 | if date_match: 35 | date = datetime.date( 36 | int(date_match.group('year')), 37 | int(date_match.group('month')), 38 | int(date_match.group('day')), 39 | ) 40 | else: 41 | date = datetime.date.today() 42 | 43 | documents = [ 44 | { 45 | 'type': re.sub('^[^a-zA-Z]+', '', doc['type'], 1), 46 | 'url': re.sub('\\\\', '/', response.urljoin(doc['url'])), 47 | } for doc in extract_documents(response.css( 48 | '#ctl00_B_Center_Accordion1 div.accrdContent a')) 49 | ] 50 | 51 | contact = { 52 | 'tel': '021 315 8942', 53 | 'email': 'infopub@senat.ro', 54 | } 55 | 56 | return Publication( 57 | identifier=identifier, 58 | title=title, 59 | institution='senat', 60 | description=description, 61 | type='LEGE', 62 | date=date.isoformat(), 63 | documents=documents, 64 | contact=contact 65 | ) 66 | -------------------------------------------------------------------------------- /scrapy/czlscrape/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrapy.selector import SelectorList 4 | 5 | DIACRITICS_RULES = [ 6 | (r'[șş]', 's'), 7 | (r'[ȘŞ]', 'S'), 8 | (r'[țţ]', 't'), 9 | (r'[ȚŢ]', 'T'), 10 | (r'[ăâ]', 'a'), 11 | (r'[ĂÂ]', 'A'), 12 | (r'[î]', 'i'), 13 | (r'[Î]', 'I'), 14 | ] 15 | 16 | ROMANIAN_MONTHS = { 17 | 'ianuarie': 1, 18 | 'februarie': 2, 19 | 'martie': 3, 20 | 'aprilie': 4, 21 | 'mai': 5, 22 | 'iunie': 6, 23 | 'iulie': 7, 24 | 'august': 8, 25 | 'septembrie': 9,
26 | 'octombrie': 10, 27 | 'noiembrie': 11, 28 | 'decembrie': 12, 29 | } 30 | 31 | DOC_EXTENSIONS = [".docs", ".doc", ".txt", ".crt", ".xls", ".xml", ".pdf", 32 | ".docx", ".xlsx", ] 33 | 34 | 35 | def guess_initiative_type(text: str, rules: list) -> str: 36 | """ 37 | Try to identify the type of a law initiative from its description. 38 | 39 | Use a best guess approach. The rules are provided by the caller as a list 40 | of tuples. Each tuple is composed of a search string and the initiative 41 | type it matches to. 42 | :param text: the description of the initiative 43 | :param rules: the rules of identification expressed as a list of tuples 44 | :return: the type of initiative if a rule matches; "OTHER" if no rule 45 | matches 46 | """ 47 | text = strip_diacritics(text) 48 | 49 | for search_string, initiative_type in rules: 50 | if search_string in text: 51 | return initiative_type 52 | else: 53 | return "OTHER" 54 | 55 | 56 | def strip_diacritics(text: str) -> str: 57 | """ 58 | Replace all diacritics in the given text with their regular counterparts. 59 | :param text: the text to look into 60 | :return: the text without diacritics 61 | """ 62 | result = text 63 | for search_pattern, replacement in DIACRITICS_RULES: 64 | result = re.sub(search_pattern, replacement, result) 65 | return result 66 | 67 | 68 | def romanian_month_number(text: str) -> int: 69 | """ 70 | Return the number of the given month identified by its Romanian name. 71 | :param text: the name of the month in Romanian 72 | :return: the number of the month if the month name is recognized, 73 | otherwise None 74 | """ 75 | return ROMANIAN_MONTHS.get(text.lower()) 76 | 77 | 78 | def extract_documents(selector_list: SelectorList): 79 | """ 80 | Extract white-listed documents from CSS selectors. 81 | 82 | Generator function. Search for links to white-listed document types and 83 | return all matching ones. Each entry has two properties. "type" contains 84 | the link text, "url" contains the link URL. 
85 | 86 | :param selector_list: a SelectorList 87 | :return: a generator 88 | """ 89 | for link_selector in selector_list: 90 | url = link_selector.css('::attr(href)').extract_first() 91 | if any(url.endswith(ext) for ext in DOC_EXTENSIONS): 92 | yield { 93 | 'type': link_selector.css('::text').extract_first(), 94 | 'url': url, 95 | } 96 | -------------------------------------------------------------------------------- /scrapy/requirements.in: -------------------------------------------------------------------------------- 1 | scrapy 2 | requests 3 | raven 4 | pytest 5 | -------------------------------------------------------------------------------- /scrapy/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | appdirs==1.4.3 # via setuptools 8 | asn1crypto==0.21.1 # via cryptography 9 | attrs==16.3.0 # via automat, service-identity 10 | automat==0.5.0 # via twisted 11 | cffi==1.9.1 # via cryptography 12 | constantly==15.1.0 # via twisted 13 | contextlib2==0.5.4 # via raven 14 | cryptography==1.8.1 # via pyopenssl 15 | cssselect==1.0.1 # via parsel, scrapy 16 | idna==2.5 # via cryptography 17 | incremental==16.10.1 # via twisted 18 | lxml==3.7.3 # via parsel, scrapy 19 | packaging==16.8 # via cryptography, setuptools 20 | parsel==1.1.0 # via scrapy 21 | py==1.4.33 # via pytest 22 | pyasn1-modules==0.0.8 # via service-identity 23 | pyasn1==0.2.3 # via pyasn1-modules, service-identity 24 | pycparser==2.17 # via cffi 25 | pydispatcher==2.0.5 # via scrapy 26 | pyopenssl==16.2.0 # via scrapy, service-identity 27 | pyparsing==2.2.0 # via packaging 28 | pytest==3.0.7 29 | queuelib==1.4.2 # via scrapy 30 | raven==6.0.0 31 | requests==2.13.0 32 | scrapy==1.3.3 33 | service-identity==16.0.0 # via scrapy 34 | six==1.10.0 # via automat, cryptography, packaging, parsel, pyopenssl, scrapy, setuptools, w3lib 35 | twisted==17.1.0 # via scrapy 36 | w3lib==1.17.0 # via parsel, scrapy 37 | zope.interface==4.3.3 # via twisted 38 | 39 | # The following packages are considered to be unsafe in a requirements file: 40 | # setuptools # via cryptography, pytest, zope.interface 41 | -------------------------------------------------------------------------------- /scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = czlscrape.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = czlscrape 12 | -------------------------------------------------------------------------------- /scrapy/testsuite/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 5 | -------------------------------------------------------------------------------- /scrapy/testsuite/test_validator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from scrapy.exceptions import DropItem 3 | from czlscrape.items import Publication 4 | from czlscrape.pipelines import PublicationValidatorPipeline 5 | 6 | def create_publication(): 7 | return Publication( 8 | identifier='aa', 9 | 
title="the good publication", 10 | institution='foo', 11 | description="this is a publication that has all required fields", 12 | type='HG', 13 | date='2017-04-03', 14 | documents=[ 15 | {'type': 'something', 'url': 'http://example.com/something.pdf'}, 16 | ], 17 | ) 18 | 19 | def test_ok(): 20 | pipeline = PublicationValidatorPipeline() 21 | pipeline.process_item(create_publication(), None) 22 | 23 | @pytest.mark.parametrize('field', [ 24 | 'identifier', 25 | 'title', 26 | 'institution', 27 | 'description', 28 | 'type', 29 | 'date', 30 | ]) 31 | def test_missing_field(field): 32 | publication = create_publication() 33 | del publication[field] 34 | pipeline = PublicationValidatorPipeline() 35 | with pytest.raises(DropItem) as err: 36 | pipeline.process_item(publication, None) 37 | -------------------------------------------------------------------------------- /sgg/README.md: -------------------------------------------------------------------------------- 1 | # Secretariatul General al Guvernului 2 | 3 | ## Tehnologie 4 | Python3, virtualenv, scrapy 5 | 6 | 7 | ## Instructiuni 8 | 9 | Install `Python3` and `virtualenv` 10 | 11 | virtualenv -p python3 venv 12 | source venv/bin/activate 13 | pip install -r requirements.txt 14 | cd sgg 15 | SGG_AUTH_TOKEN=sgg-very-secret-key python3 run.py 16 | 17 | ## Exceptii -------------------------------------------------------------------------------- /sgg/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | asn1crypto==0.21.1 3 | attrs==16.3.0 4 | Automat==0.5.0 5 | cffi==1.9.1 6 | constantly==15.1.0 7 | cryptography==1.8.1 8 | cssselect==1.0.1 9 | idna==2.5 10 | incremental==16.10.1 11 | lxml==3.7.3 12 | packaging==16.8 13 | parsel==1.1.0 14 | pyasn1==0.2.3 15 | pyasn1-modules==0.0.8 16 | pycparser==2.17 17 | PyDispatcher==2.0.5 18 | pyOpenSSL==16.2.0 19 | pyparsing==2.2.0 20 | queuelib==1.4.2 21 | requests==2.13.0 22 | Scrapy==1.3.3 23 | service-identity==16.0.0 24 | six==1.10.0 25 | Twisted==17.1.0 26 | w3lib==1.17.0 27 | zope.interface==4.3.3 28 | -------------------------------------------------------------------------------- /sgg/sgg/run.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import subprocess 5 | import json 6 | import requests 7 | 8 | POST_URL = "http://czl-api.code4.ro/api/publications/" 9 | # POST_URL_DEV = "http://10.231.234.10:8000/api/publications/" 10 | 11 | AUTH_TOKEN = os.getenv('SGG_AUTH_TOKEN', "sgg-very-secret-key") 12 | 13 | headers = { 14 | 'Authorization': " ".join(['Token',AUTH_TOKEN]) 15 | } 16 | 17 | if os.path.exists("sgg.json"): 18 | os.remove("sgg.json") 19 | 20 | subprocess.call(['scrapy','crawl', 'sgg_spider', '-o', 'sgg.json']) 21 | 22 | with open("sgg.json") as fp: 23 | items = json.load(fp) 24 | for item in items: 25 | r = requests.post(POST_URL, data=item, headers=headers) 26 | if r.status_code >= 400: 27 | print(json.dumps(r.json())) 28 | 29 | print("DONE!") -------------------------------------------------------------------------------- /sgg/sgg/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sgg.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sgg 12 | 
-------------------------------------------------------------------------------- /sgg/sgg/sgg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/sgg/sgg/sgg/__init__.py -------------------------------------------------------------------------------- /sgg/sgg/sgg/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SggItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SggSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SggPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sgg project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'sgg' 13 | 14 | SPIDER_MODULES = ['sgg.spiders'] 15 | NEWSPIDER_MODULE = 'sgg.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'sgg (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'sgg.middlewares.SggSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'sgg.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'sgg.pipelines.SggPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | 
#AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sgg/sgg/sgg/spiders/sgg_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from scrapy.crawler import CrawlerProcess 5 | import logging 6 | import json 7 | import hashlib 8 | 9 | base_url = "http://www.sgg.ro" 10 | 11 | 12 | 13 | class Item(scrapy.Item): 14 | identifier = scrapy.Field() 15 | title = scrapy.Field() 16 | type = scrapy.Field() 17 | institution = scrapy.Field() 18 | date = scrapy.Field() 19 | description = scrapy.Field() 20 | feedback_days = scrapy.Field() 21 | contact = scrapy.Field() 22 | documents = scrapy.Field() 23 | 24 | def xtract(obj, sel): 25 | ret = obj.xpath(sel).extract_first() 26 | 27 | if ret: 28 | ret = " ".join(map(lambda s : s.strip(), ret.splitlines())) 29 | return ret 30 | return "" 31 | 32 | def identify(institution, titlu): 33 | 34 | return " : ".join([hashlib.md5(titlu.encode('utf-8')).hexdigest(), institution]) 35 | 36 | class SggSpider(scrapy.Spider): 37 | name = "sgg_spider" 38 | allowed_domains = ["www.sgg.ro"] 39 | start_urls = ['http://www.sgg.ro/legislativ/index.php/'] 40 | 41 | def parse(self, response): 42 | links = response.css('a::attr(href)').extract() 43 | links = list(set([response.urljoin(link) for link in links if "domeniu.php" in link])) 44 | # yield scrapy.Request(response.urljoin('/legislativ/domeniu.php?id=84'), callback=self.parse_details) 45 | 46 | for link in links: 47 | yield scrapy.Request(response.urljoin(link), callback=self.parse_details) 48 | 49 | 50 | def parse_details(self, response): 51 | # response = get(response.url) 52 | 53 | institution = response.xpath('//h2/text()').extract()[0].strip() 54 | logging.warn("scrapping: %s - %s"%(response.url, institution)) 55 | 56 | for tr in response.xpath('//table[@class="fancy"]/tr'): 57 | 58 | if tr.xpath('td[1]'): 59 | item = Item() 60 | titlu = xtract(tr, 'td[1]//div/text()') 61 | type_ = xtract(tr, 'td[2]//div//strong/text()') 62 | consult = xtract(tr, 'td[3]//div/text()') 63 | avizare = xtract(tr, 'td[4]//div/text()') 64 | avizori = xtract(tr, 'td[5]//div/text()') 65 | termen_avize = xtract(tr, 'td[6]//div/text()') 66 | mfp_mj = xtract(tr, 'td[7]//div/text()') 67 | reavizare = xtract(tr, 'td[8]//div/text()') 68 | init_1 = xtract(tr, 'td[9]//a/@href') 69 | init_2 
= xtract(tr, 'td[10]//a/@href') 70 | final_1 = xtract(tr, 'td[11]//a/@href') 71 | final_2 = xtract(tr, 'td[12]//a/@href') 72 | 73 | docs = [{"type": "nota", "url": response.urljoin(f)} for f in [init_1, init_2, final_1, final_2] if f] 74 | 75 | item['identifier'] = identify(institution, titlu) 76 | item['title'] = titlu 77 | item['type'] = type_ 78 | item['institution'] = "sgg" 79 | item['date'] = consult 80 | item['description'] = "" 81 | item['feedback_days'] = None 82 | item['contact'] = None 83 | item['documents'] = docs 84 | 85 | yield item 86 | 87 | if __name__ == '__main__': 88 | process = CrawlerProcess({ 89 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)', 90 | 'LOG_LEVEL' : 'WARNING' 91 | }) 92 | 93 | process.crawl(SggSpider) 94 | process.start() -------------------------------------------------------------------------------- /tineret/.gitignore: -------------------------------------------------------------------------------- 1 | .scrapy 2 | **/__pycache__ 3 | -------------------------------------------------------------------------------- /tineret/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Justiţiei 2 | 3 | ## Tehnologie 4 | 5 | - *Python 3.6*, [virtualenv](https://virtualenv.pypa.io/) is a good friend 6 | - [Scrapy](https://scrapy.org/) 7 | 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | ## Instructiuni 13 | 14 | ``` 15 | scrapy crawl tineret 16 | ``` 17 | 18 | ## Altele 19 | 20 | -------------------------------------------------------------------------------- /tineret/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | asn1crypto==0.21.1 3 | attrs==16.3.0 4 | Automat==0.5.0 5 | cffi==1.9.1 6 | constantly==15.1.0 7 | convertdate==2.1.0 8 | cryptography==1.8.1 9 | cssselect==1.0.1 10 | ephem==3.7.6.0 11 | idna==2.5 12 | incremental==16.10.1 13 | jdatetime==1.8.2 14 | lxml==3.7.3 15 | packaging==16.8 16 | parsel==1.1.0 17 | pyasn1==0.2.3 18 | pyasn1-modules==0.0.8 19 | pycparser==2.17 20 | PyDispatcher==2.0.5 21 | pyOpenSSL==16.2.0 22 | pyparsing==2.2.0 23 | pytz==2016.10 24 | queuelib==1.4.2 25 | regex==2017.2.8 26 | ruamel.yaml==0.13.14 27 | Scrapy==1.3.3 28 | service-identity==16.0.0 29 | six==1.10.0 30 | Twisted==17.1.0 31 | tzlocal==1.3 32 | umalqurra==0.2 33 | Unidecode==0.4.20 34 | w3lib==1.17.0 35 | zope.interface==4.3.3 36 | -------------------------------------------------------------------------------- /tineret/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tineret.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tineret 12 | -------------------------------------------------------------------------------- /tineret/tineret/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/tineret/tineret/__init__.py -------------------------------------------------------------------------------- /tineret/tineret/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See 
documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class Publication(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | identifier = scrapy.Field() 14 | title = scrapy.Field() 15 | type = scrapy.Field() 16 | institution = scrapy.Field() 17 | date = scrapy.Field() 18 | description = scrapy.Field() 19 | feedback_days = scrapy.Field() 20 | contact = scrapy.Field() 21 | documents = scrapy.Field() 22 | 23 | pass 24 | 25 | -------------------------------------------------------------------------------- /tineret/tineret/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import requests 9 | import json 10 | import logging 11 | 12 | from tineret.items import Publication 13 | import logging 14 | 15 | API_KEY = 'tineret-very-secret-key' 16 | API_PUBLICATIONS = 'http://czl-api.code4.ro/api/publications/' 17 | 18 | class PublicationsToApiPipeline(object): 19 | def process_item(self, item, spider): 20 | 21 | if type(item) != Publication: 22 | return item 23 | 24 | r = requests.post(API_PUBLICATIONS, json=dict(item), headers={'Authorization': 'Token %s' % (API_KEY,) } ) 25 | 26 | 27 | if r.status_code == 200 or r.status_code == '200': 28 | logging.log(msg=r.status_code, level=logging.INFO) 29 | else: 30 | logging.log(msg=r.status_code, level=logging.ERROR) 31 | logging.log(msg=r.content, level=logging.INFO) 32 | 33 | return item 34 | -------------------------------------------------------------------------------- /tineret/tineret/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tineret project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tineret' 13 | 14 | SPIDER_MODULES = ['tineret.spiders'] 15 | NEWSPIDER_MODULE = 'tineret.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | USER_AGENT = 'code4romania (+http://www.code4.ro)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | 23 | LOG_ENABLED = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 5 32 | # The download delay setting will honor only one of: 33 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'tineret.middlewares.JustSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'tineret.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.AutoThrottle': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'tineret.pipelines.PublicationsToApiPipeline': 100, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | # AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | # AUTOTHROTTLE_MAX_DELAY = 30 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0 82 | # Enable showing throttling stats for every response received: 83 | # AUTOTHROTTLE_DEBUG = True 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | HTTPCACHE_ENABLED = True 88 | HTTPCACHE_EXPIRATION_SECS = 30 89 | HTTPCACHE_DIR = 'httpcache' 90 | HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /tineret/tineret/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /transport/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Transporturilor 2 | 3 | ## Tehnologie 4 | 5 | Node.js, [nightmare](http://www.nightmarejs.org/) 6 | 7 | ## Instrucțiuni 8 | 9 | ``` 10 | npm install 11 | ``` 12 | 13 | edit config.js, change API token (can also be specified on the command line) and other config vars 14 | 15 | ``` 16 | [API_TOKEN=foobar] npm start 17 | ``` 18 | 19 | ## Excepții 20 | -------------------------------------------------------------------------------- /transport/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | api: { 3 | url: 'http://czl-api.code4.ro/api/publications/', 4 | token: 'educatie-very-secret-key' 5 | }, 6 | scrape: { 7 | //url of the proposals listing page 8 | baseUrl: 'http://mt.gov.ro/web14/transparenta-decizionala/consultare-publica/acte-normative-in-avizare', 9 | //number of listing pages to scrape 10 | pages: 2 11 | } 12 | }; 13 | -------------------------------------------------------------------------------- /transport/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mt-scraper", 3 | "version": "1.0.0", 4 | "description": "Data scraper pentru Ministerul Transporturilor", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/lbogdan/czl-scrape" 13 | }, 14 | "author": { 15 | "name": "Bogdan Luca", 16 | "email": "luca.bogdan@gmail.com" 17 | }, 18 | "license": "MIT", 19 | "dependencies": { 20 | "diacritics": "^1.3.0", 21 | "jsonfile": "^2.4.0", 22 | "moment": "^2.17.1", 23 | "nightmare": "^2.10.0", 24 | "request-promise": "^4.1.1" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /turism/README.md: -------------------------------------------------------------------------------- 1 | # Ministerul Turismului 2 | 3 | ## Tehnologie 4 | Java 5 | 6 | ## Instructiuni 7 | Rulati scraper.jar, va crea output-ul astfel: 8 | 9 | Anexele le salveaza in out_files/Anexe 10 | Proiectele le salveaza in out_files/Proiecte 11 | 12 | ## Exceptii 13 | Din cauza faptului ca orice link care nu ducea la un document PDF de pe site redirectiona pe pagina principala, scraper-ul downloadeaza toate documentele PDF si le organizeaza in Anexe si Proiecte. 
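The turism README above explains that, because every non-PDF link on the ministry site redirects to the homepage, the Java scraper (scraper.jar) simply downloads all PDFs and files them under out_files/Anexe and out_files/Proiecte. A rough Python sketch of that filename-based split follows, inferred from the output listing below; the actual logic lives in the Java scraper and may differ:

```
# illustrative sketch only; the real implementation is the Java scraper.jar
import os
import shutil

def sort_pdf(path, out_dir="out_files"):
    # assumption: files named "Anexa*" are annexes, everything else
    # (e.g. "Proiect-...", "Ordin-...") is treated as a draft act
    name = os.path.basename(path)
    subdir = "Anexe" if name.startswith("Anexa") else "Proiecte"
    target_dir = os.path.join(out_dir, subdir)
    os.makedirs(target_dir, exist_ok=True)
    shutil.move(path, os.path.join(target_dir, name))

# e.g. sort_pdf("Anexa1.pdf"); sort_pdf("Proiect-ordin-modificare-Ordin-65.pdf")
```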
-------------------------------------------------------------------------------- /turism/out/production/scraper/com/company/Main.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out/production/scraper/com/company/Main.class -------------------------------------------------------------------------------- /turism/out/production/scraper/com/company/Scraper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out/production/scraper/com/company/Scraper.class -------------------------------------------------------------------------------- /turism/out/scraper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out/scraper.jar -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.1.1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.2.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.3.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.4.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.5.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.5.1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.5.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.6.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.6.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.7.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.8.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa1.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa10.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa11.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa12.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa13.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa13.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa14.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa14.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa15.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa15.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa2.pdf 
-------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa3.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa4.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa5.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa6.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa7.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa8.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa9.2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa9.2.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexa9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexa9.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/AnexaAP.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/AnexaAP.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexabrevet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexabrevet.pdf -------------------------------------------------------------------------------- /turism/out_files/Anexe/Anexacazare.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Anexe/Anexacazare.pdf -------------------------------------------------------------------------------- /turism/out_files/Proiecte/Ordin-criterii-participare-targuri-externe.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Proiecte/Ordin-criterii-participare-targuri-externe.pdf -------------------------------------------------------------------------------- /turism/out_files/Proiecte/Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Proiecte/Proiect-de-Ordin-al-Ministrului-delegat-pentru-intreprinderi-mici-şi-mijlocii-mediul-de-afaceri-şi-turism-pentru-modificarea-OMT-nr-235-2001.pdf -------------------------------------------------------------------------------- /turism/out_files/Proiecte/Proiect-ordin-modificare-Ordin-65.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/code4romania/czl-scrape/789341faa950f1511ff69bec840fbfb5998cf04e/turism/out_files/Proiecte/Proiect-ordin-modificare-Ordin-65.pdf --------------------------------------------------------------------------------
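Taken together, the scrapers above (the czlscrape pipelines, sgg/run.py, the tineret pipeline) all post items of the same shape to http://czl-api.code4.ro/api/publications/ with token authentication. The sketch below shows that common payload, assembled from the fields required by PublicationValidatorPipeline and declared in the item classes; the concrete values are invented for illustration:

```
# example payload; field names come from the item definitions above,
# the values are made up
publication = {
    'identifier': 'example-123',            # unique id per institution
    'title': 'Proiect de Hotarare a Guvernului ...',
    'institution': 'sgg',                   # scraper/ministry slug
    'description': 'short summary of the draft act',
    'type': 'HG',                           # LEGE / HG / OG / OUG / OM / OTHER
    'date': '2017-04-03',                   # ISO date, as in the test fixture
    'documents': [
        {'type': 'pdf', 'url': 'http://example.com/doc.pdf'},
    ],
    'contact': {'email': 'contact@example.com'},  # optional, see senat.py
    'feedback_days': None,                        # optional, see the sgg items
}
# uploaded as the pipelines above do:
# requests.post('http://czl-api.code4.ro/api/publications/', json=publication,
#               headers={'Authorization': 'Token <api token>'})
```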