55 | saveOrUpdate(alderman);
56 |
57 | } catch (InterruptedException | ExecutionException | IOException e) {
58 | LOGGER.error(ExceptionUtils.getStackTrace(e));
59 | }
60 |
61 | }
62 |
63 | protected Alderman extractAldermenInfo(final Elements elements) {
64 |
65 | final Alderman alderman = new Alderman();
66 |
67 | for (final Element element : elements) {
68 |
69 | final Elements elementsInfo = element.select("div.col-sm-7.texto");
70 |
71 | String politicianName = elementsInfo.select("h3").text().trim();
72 | alderman.setName(politicianName);
73 | alderman.setEmail(getElementValue(elementsInfo, "E-mail").nextElementSibling().text().trim());
74 | alderman.setInfo(getElementValue(elementsInfo, "Dados Pessoais").nextElementSibling().text().trim());
75 | alderman.setLegislature(getElementValue(elementsInfo, "Legislatura").nextSibling().toString().trim());
76 | alderman.setPhone(getElementValue(elementsInfo, "Telefone").nextSibling().toString().trim());
77 | alderman.setPoliticalParty(extractedPoliticalParty(elementsInfo).trim());
78 | alderman.setWorkplace(getElementValue(elementsInfo, "Local de Trabalho").nextSibling().toString().trim());
79 | alderman.setPhoto(createPhoto(element.getElementsByClass("img-responsive").attr("src").trim(), politicianName));
80 |
81 | }
82 |
83 | return alderman;
84 |
85 | }
86 |
87 | protected String createPhoto(String url, String politicianName) {
88 | politicianName = StringUtil.unaccent(politicianName.toLowerCase().replaceAll(" ", "_"));
89 | String fullImagePath = pathImages.concat(politicianName).concat(".jpg");
90 | fileUtil.savePhoto(url, pathWebapp.concat(fullImagePath));
91 | return urlContext.concat(fullImagePath);
92 | }
93 |
94 | protected String extractedPoliticalParty(final Elements elements) {
95 | return elements.select("h3").first().nextElementSibling().text().replaceAll("Partido: ", "");
96 | }
97 |
98 | protected Element getElementValue(final Elements elementsInfo, final String key) {
99 | return elementsInfo.select("h4:contains(" + key + ")").first();
100 | }
101 |
102 | protected void saveOrUpdate(final Alderman aldermanToSave) {
103 |
104 | final Optional
alderman = aldermanRepository.findByName(aldermanToSave.getName());
105 |
106 | if (alderman.isPresent()) {
107 | aldermanToSave.setId(alderman.get().getId());
108 | aldermanToSave.setLawsCount(alderman.get().getLawsCount());
109 | aldermanRepository.save(aldermanToSave);
110 |
111 | } else {
112 | aldermanRepository.save(aldermanToSave);
113 | }
114 |
115 | }
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/src/main/java/com/sjcdigital/temis/model/service/parsers/impl/LawsParser.java:
--------------------------------------------------------------------------------
1 | package com.sjcdigital.temis.model.service.parsers.impl;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.time.LocalDate;
6 | import java.time.format.DateTimeFormatter;
7 | import java.time.format.DateTimeParseException;
8 | import java.util.Locale;
9 | import java.util.Objects;
10 | import java.util.Optional;
11 | import java.util.concurrent.ExecutionException;
12 | import java.util.regex.Matcher;
13 |
14 | import org.apache.commons.lang3.exception.ExceptionUtils;
15 | import org.apache.log4j.LogManager;
16 | import org.apache.log4j.Logger;
17 | import org.jsoup.nodes.Document;
18 | import org.jsoup.nodes.Element;
19 | import org.springframework.beans.factory.annotation.Autowired;
20 | import org.springframework.stereotype.Component;
21 |
22 | import com.sjcdigital.temis.model.document.Law;
23 | import com.sjcdigital.temis.model.repositories.LawsRepository;
24 | import com.sjcdigital.temis.model.service.machine_learn.ClassifyLaw;
25 | import com.sjcdigital.temis.model.service.parsers.AbstractParser;
26 | import com.sjcdigital.temis.model.service.parsers.util.AldermanParserUtil;
27 | import com.sjcdigital.temis.util.RegexUtils;
28 |
29 | /**
30 | * @author pedro-hos
31 | */
32 |
33 | @Component
34 | public class LawsParser extends AbstractParser {
35 |
36 | private static final Logger LOGGER = LogManager.getLogger(LawsParser.class);
37 |
38 | @Autowired
39 | private LawsRepository lawsRepository;
40 |
41 | @Autowired
42 | private ClassifyLaw classifyLaw;
43 |
44 | @Autowired
45 | private AldermanParserUtil aldermanParserUtil;
46 |
47 | @Override
48 | public void parse(final File file) {
49 |
50 | try {
51 |
52 | final Document document = readFile(file).get();
53 |
54 | final Law law = new Law();
55 | final Optional title = buildTitle(document.title().trim());
56 |
57 | String summary = buildSummary(document.head().select("script").toString()).orElse(null);
58 |
59 | law.setSummary(summary);
60 | law.setType(Objects.nonNull(summary) ? classifyLaw.classify(summary) : null);
61 | law.setTitle(title.orElse(null));
62 | law.setDate(buildDate(title.orElse("")).orElse(LocalDate.now()));
63 |
64 | cleanDocument(document);
65 |
66 | final Element body = document.body();
67 | law.setAuthor(aldermanParserUtil.buildAuthor(Optional.ofNullable(body.getElementsByClass("RegPub").first().text()).orElse("")));
68 | law.setDesc(body.html().trim());
69 | law.setCode(extractedCode(file).orElse(null));
70 | law.setProjectLawNumber(buildProjectLawNumber(body).orElse(null));
71 |
72 | saveLaw(law);
73 |
74 | } catch (InterruptedException | ExecutionException | IOException e) {
75 | LOGGER.error(ExceptionUtils.getStackTrace(e));
76 | }
77 |
78 | }
79 |
80 | private void cleanDocument(final Document document) {
81 | document.select("script").remove();
82 | document.select("a[href]").remove();
83 | }
84 |
85 | private Optional buildTitle(String title) {
86 |
87 | final Matcher matcher = RegexUtils.getMatcher("lei\\s*municipal\\s*nº?\\s*\\d+\\.?\\d+,?\\s*(de)?\\s*\\d{1,2}/\\d{1,2}/\\d{2,4}", title);
88 |
89 | if (matcher.find()) {
90 | return Optional.of(matcher.group(0));
91 | }
92 |
93 | return Optional.empty();
94 | }
95 |
96 | private Optional buildSummary(String script) {
97 |
98 | final Matcher matcher = RegexUtils.getMatcher("Xtesta\\((.+)\\)", script);
99 |
100 | if (matcher.find()) {
101 | return Optional.of(matcher.group(1).split("\",\"")[1].replaceAll("