├── src
└── main
│ ├── resources
│ └── META-INF
│ │ └── MANIFEST.MF
│ └── java
│ ├── CsvCharacterExtractor.java
│ ├── CombinedOutputData.java
│ ├── ConfigData.java
│ ├── ConfigReader.java
│ └── CsvUniqueCharacterProcessor.java
├── .idea
├── vcs.xml
├── copyright
│ ├── profiles_settings.xml
│ └── Supyrb__Johannes.xml
├── modules.xml
├── misc.xml
└── compiler.xml
├── in
└── example.csv
├── config.xml
├── pom.xml
├── LICENSE
├── CsvCharacterExtractor.iml
├── .gitignore
└── README.md
/src/main/resources/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: CsvCharacterExtractor
3 |
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/copyright/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/in/example.csv:
--------------------------------------------------------------------------------
1 | ID,Description,en,de,jp,ko,zh-CN,ru
2 | General.Cancel,"Cancel phrase - No more than 15 characters, for buttons and such",Cancel,Abbrechen,キャンセル,취소,取消,Отмена
3 | General.Confirm,"Confirm phrase - No more than 15 characters, for buttons and such",Confirm,Bestätigen,決定,확인,确认,Принять
4 | General.Continue,"Continue Phrase - No more than 15 characters, for buttons and such",Continue,Fortsetzen,続ける,계속,继续,Продолжить игру
5 | Example.Placerholder, "This show how placeholders are used",Continue {0},Fortsetzen {0},続ける {0},계속 {0},继续 {0}。,Продолжить игру {0}
--------------------------------------------------------------------------------
/config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | in/example.csv
5 |
6 |
7 | out/
8 |
9 |
10 | true
11 | 0123456789+-,.!?-
12 | {}
13 |
14 |
15 | default
16 | en
17 | de
18 | ru
19 |
20 |
--------------------------------------------------------------------------------
/.idea/copyright/Supyrb__Johannes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | CharacterProcessor
8 | CsvCharacterExtractor
9 | 0.1-SNAPSHOT
10 |
11 |
12 |
13 |
14 | com.univocity
15 | univocity-parsers
16 | 2.5.9
17 | jar
18 |
19 |
20 |
21 | com.vdurmont
22 | emoji-java
23 | 4.0.0
24 |
25 |
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Johannes Deml
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/main/java/CsvCharacterExtractor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------------------------------------------------
3 | *
4 | * Copyright (c) 2018 Supyrb. All rights reserved.
5 | *
6 | *
7 | * Johannes Deml
8 | * send@johannesdeml.com
9 | *
10 | * --------------------------------------------------------------------------------------------------------------------
11 | */
12 |
13 | import java.util.ArrayList;
14 |
15 | public class CsvCharacterExtractor {
16 | public static void main(String[] args) {
17 | System.out.println("Running CsvCharacterExtractor");
18 |
19 | ConfigData config = new ConfigData("./in/table.csv", "./out/", false,"", "", new ArrayList());
20 | ConfigReader configReader = new ConfigReader();
21 | configReader.parseXmlFile("./config.xml", config);
22 | CsvUniqueCharacterProcessor processor = new CsvUniqueCharacterProcessor();
23 | processor.initialize(config);
24 | processor.run();
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/CsvCharacterExtractor.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
3 |
4 | # User-specific stuff:
5 | .idea/**/workspace.xml
6 | .idea/**/tasks.xml
7 | .idea/dictionaries
8 |
9 | # Sensitive or high-churn files:
10 | .idea/**/dataSources/
11 | .idea/**/dataSources.ids
12 | .idea/**/dataSources.xml
13 | .idea/**/dataSources.local.xml
14 | .idea/**/sqlDataSources.xml
15 | .idea/**/dynamic.xml
16 | .idea/**/uiDesigner.xml
17 |
18 | # Gradle:
19 | .idea/**/gradle.xml
20 | .idea/**/libraries
21 |
22 | # CMake
23 | cmake-build-debug/
24 | cmake-build-release/
25 |
26 | # Mongo Explorer plugin:
27 | .idea/**/mongoSettings.xml
28 |
29 | ## File-based project format:
30 | *.iws
31 |
32 | ## Plugin-specific files:
33 |
34 | # IntelliJ
35 | artifacts/
36 | target/
37 | out/
38 |
39 | # mpeltonen/sbt-idea plugin
40 | .idea_modules/
41 |
42 | # JIRA plugin
43 | atlassian-ide-plugin.xml
44 |
45 | # Cursive Clojure plugin
46 | .idea/replstate.xml
47 |
48 | # Crashlytics plugin (for Android Studio and IntelliJ)
49 | com_crashlytics_export_strings.xml
50 | crashlytics.properties
51 | crashlytics-build.properties
52 | fabric.properties
53 |
--------------------------------------------------------------------------------
/src/main/java/CombinedOutputData.java:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------------------------------------------------
3 | *
4 | * Copyright (c) 2018 Supyrb. All rights reserved.
5 | *
6 | *
7 | * Johannes Deml
8 | * send@johannesdeml.com
9 | *
10 | * --------------------------------------------------------------------------------------------------------------------
11 | */
12 |
13 | import java.util.List;
14 |
/**
 * Describes one combined output: a name and the csv column names that belong to it.
 */
public class CombinedOutputData {
    private String name;
    private List<String> columns;

    /**
     * @param name    name of the combined output
     * @param columns column names to combine; {@code null} is treated as an empty list
     */
    public CombinedOutputData(String name, List<String> columns) {
        this.name = name;
        this.columns = orEmpty(columns);
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the column names, never {@code null}
     */
    public List<String> getColumns() {
        return columns;
    }

    /**
     * @param columns column names to combine; {@code null} is treated as an empty list
     */
    public void setColumns(List<String> columns) {
        this.columns = orEmpty(columns);
    }

    /**
     * Appends a column name to the list of columns.
     *
     * @param columnName name of the column to add
     */
    public void addColumn(String columnName) {
        columns.add(columnName);
    }

    // Keeps the given list instance (callers may share it) and only replaces null, so addColumn never fails
    private static List<String> orEmpty(List<String> columns) {
        return columns != null ? columns : new java.util.ArrayList<String>();
    }
}
44 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CSV Character Extractor
2 |
3 | 
4 |
5 | ## Functionality
6 | Extract all unique characters of each column of a csv file, combine and manipulate results and store the results in text files for further usage.
7 | This tool was developed to create font assets for TextMesh Pro in Unity. Creating textures with just the characters you need is essential for languages like Chinese, Japanese or Korean.
8 |
9 | ### Input
10 | * Input file can be defined in config.xml, default value is "in/example.csv"
11 | * Languages are defined in columns; the first row (header) defines the language name of each column (see [example.csv](in/example.csv))
12 | * Column `ID` and `Description` will be ignored
13 | * Newline character (\n\r) and all emojis will be ignored
14 |
15 | ### Output
16 | * Text files are created for each language and named "ColumnName.txt". Output path can be defined in config.xml
17 | * One file per column, expecting to have one language per column
18 |
19 | ## Requirements
20 | * [Java Runtime environment](http://www.oracle.com/technetwork/java/javase/downloads/jre8-downloads-2133155.html)
21 |
22 | ## Download
23 | * [latest release](https://github.com/JohannesDeml/CsvCharacterExtractor/releases)
24 |
25 | ## Run
26 | * Windows: Double-click Run.bat
27 | * Windows, Mac, Linux: Run `java -jar CsvCharacterExtractor.jar` in the terminal
28 | ### Config usage
29 | * With the config you can set the in and out path as well as characters that should be always or never included. Take a look at the [example config](config.xml)
30 | * Paths can be relative, e.g. `in/example.csv`
31 | * Paths can be absolute, e.g. `C:/Users/UserName/Documents/LanguageCharacterFiles/`
32 | * Use forward slashes only `/`
33 | * Automatically add lower and upper case characters to the unique characters file
34 | * Create union files of multiple separate columns
35 | ## Example
36 | [Example Spreadsheet](https://docs.google.com/spreadsheets/d/1WmGauAzcCyQu7OcOnFP2Ypx2x9xuJCclpf7p25cFpz0/edit#gid=1088591893)
37 |
38 | ## Roadmap
39 | * Document code
40 | * Add information on how to build the project
41 |
42 | ## Third Party Libraries
43 | * https://github.com/uniVocity/univocity-parsers (Apache 2.0 License)
44 | * https://github.com/vdurmont/emoji-java (MIT License)
45 |
--------------------------------------------------------------------------------
/src/main/java/ConfigData.java:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------------------------------------------------
3 | *
4 | * Copyright (c) 2018 Supyrb. All rights reserved.
5 | *
6 | *
7 | * Johannes Deml
8 | * send@johannesdeml.com
9 | *
10 | * --------------------------------------------------------------------------------------------------------------------
11 | */
12 |
13 | import java.util.List;
14 |
15 | public class ConfigData {
16 | private String inPath;
17 | private String outPath;
18 | private boolean includeUpperAndLowerCase;
19 | private String includeCharacters;
20 | private String excludeCharacters;
21 | private List combinedOutputTargets;
22 |
23 | public ConfigData(String inPath, String outPath, boolean includeUpperAndLowerCase, String includeCharacters,
24 | String excludeCharacters, List combinedOutputTargets) {
25 | this.inPath = inPath;
26 | this.outPath = outPath;
27 | this.includeUpperAndLowerCase = includeUpperAndLowerCase;
28 | this.includeCharacters = includeCharacters;
29 | this.excludeCharacters = excludeCharacters;
30 | this.combinedOutputTargets = combinedOutputTargets;
31 | }
32 |
33 | public String getInPath() {
34 | return inPath;
35 | }
36 |
37 | public void setInPath(String inPath) {
38 | this.inPath = inPath;
39 | }
40 |
41 | public String getOutPath() {
42 | return outPath;
43 | }
44 |
45 | public void setOutPath(String outPath) {
46 | this.outPath = outPath;
47 | }
48 |
49 | public boolean isIncludeUpperAndLowerCase() {
50 | return includeUpperAndLowerCase;
51 | }
52 |
53 | public void setIncludeUpperAndLowerCase(boolean includeUpperAndLowerCase) {
54 | this.includeUpperAndLowerCase = includeUpperAndLowerCase;
55 | }
56 |
57 | public String getIncludeCharacters() {
58 | return includeCharacters;
59 | }
60 |
61 | public void setIncludeCharacters(String includeCharacters) {
62 | this.includeCharacters = includeCharacters;
63 | }
64 |
65 | public String getExcludeCharacters() {
66 | return excludeCharacters;
67 | }
68 |
69 | public void setExcludeCharacters(String excludeCharacters) {
70 | this.excludeCharacters = excludeCharacters;
71 | }
72 |
73 | public List getCombinedOutputTargets() {
74 | return combinedOutputTargets;
75 | }
76 |
77 | public void setCombinedOutputTargets(List combinedOutputTargets) {
78 | this.combinedOutputTargets = combinedOutputTargets;
79 | }
80 |
81 | public void addCombinedOutputTarget(CombinedOutputData target) {
82 | this.combinedOutputTargets.add(target);
83 | }
84 | }
85 |
86 |
--------------------------------------------------------------------------------
/src/main/java/ConfigReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------------------------------------------------
3 | *
4 | * Copyright (c) 2018 Supyrb. All rights reserved.
5 | *
6 | *
7 | * Johannes Deml
8 | * send@johannesdeml.com
9 | *
10 | * --------------------------------------------------------------------------------------------------------------------
11 | */
12 |
13 | import org.w3c.dom.Document;
14 | import org.w3c.dom.Element;
15 | import org.w3c.dom.NodeList;
16 | import org.xml.sax.SAXException;
17 |
18 | import javax.xml.parsers.DocumentBuilder;
19 | import javax.xml.parsers.DocumentBuilderFactory;
20 | import javax.xml.parsers.ParserConfigurationException;
21 | import java.io.IOException;
22 | import java.util.ArrayList;
23 | import java.util.List;
24 |
25 | public class ConfigReader {
26 |
27 | public ConfigReader() {
28 | }
29 |
30 | public void parseXmlFile(String relativeConfigPath, ConfigData configData) {
31 | //get the factory
32 | DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
33 | Document dom = null;
34 |
35 | try {
36 |
37 | //Using factory get an instance of document builder
38 | DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
39 |
40 | //parse using builder to get DOM representation of the XML file
41 | dom = documentBuilder.parse(relativeConfigPath);
42 |
43 |
44 | }catch(ParserConfigurationException pce) {
45 | pce.printStackTrace();
46 | }catch(SAXException se) {
47 | se.printStackTrace();
48 | }catch(IOException ioe) {
49 | ioe.printStackTrace();
50 | }
51 |
52 | if(dom == null) {
53 | System.out.println("Couldn't read config file, proceeding with default values");
54 | return;
55 | }
56 |
57 | Element root = dom.getDocumentElement();
58 |
59 | NodeList inNodes = root.getElementsByTagName("in");
60 | if(inNodes == null || inNodes.getLength() <= 0) {
61 | System.out.println("Couldn't find in node in config file, proceeding with default values");
62 | }
63 | else {
64 | Element node = (Element)inNodes.item(0);
65 | configData.setInPath(getTextValue(node, "path", configData.getInPath()));
66 | }
67 |
68 |
69 | NodeList outNodes = root.getElementsByTagName("out");
70 | if(outNodes == null || outNodes.getLength() <= 0) {
71 | System.out.println("Couldn't find out node in config file, proceeding with default values");
72 | }
73 | else {
74 | Element node = (Element)outNodes.item(0);
75 | configData.setOutPath(getTextValue(node, "path", configData.getOutPath()));
76 | }
77 |
78 | NodeList modificationNodes = root.getElementsByTagName("modification");
79 | if(modificationNodes == null || modificationNodes.getLength() <= 0) {
80 | System.out.println("Couldn't find modification node in config file, proceeding with default values");
81 | }
82 | else {
83 | Element node = (Element)modificationNodes.item(0);
84 | String upperLowerCaseString = getTextValue(node, "include-upper-and-lower-case", configData.isIncludeUpperAndLowerCase()?"true":"false");
85 | configData.setIncludeUpperAndLowerCase(upperLowerCaseString.equals("true"));
86 | configData.setIncludeCharacters(getTextValue(node, "include-characters", configData.getIncludeCharacters()));
87 | configData.setExcludeCharacters(getTextValue(node, "exclude-characters", configData.getExcludeCharacters()));
88 | }
89 |
90 | NodeList combineNodes = root.getElementsByTagName("combine");
91 | if(combineNodes == null || combineNodes.getLength() <= 0) {
92 | System.out.println("Couldn't find combine node in config file, proceeding with default values");
93 | }
94 | else {
95 | for (int i = 0; i < combineNodes.getLength(); i++) {
96 | Element node = (Element)combineNodes.item(i);
97 | String name = getTextValue(node, "name", String.format("combinedValues%d", i));
98 | List columns = getTextValues(node, "column");
99 | configData.addCombinedOutputTarget(new CombinedOutputData(name, columns));
100 | }
101 | }
102 | }
103 |
104 | private String getTextValue(Element element, String tagName, String defaultValue) {
105 | if(element == null) {
106 | return defaultValue;
107 | }
108 | NodeList nodes = element.getElementsByTagName(tagName);
109 | if(nodes == null || nodes.getLength() == 0) {
110 | return defaultValue;
111 | }
112 | return nodes.item(0).getTextContent();
113 | }
114 |
115 | private List getTextValues(Element element, String tagName) {
116 | List list = new ArrayList();
117 | NodeList nodes = element.getElementsByTagName(tagName);
118 | if(nodes == null || nodes.getLength() == 0) {
119 | return list;
120 | }
121 |
122 | for (int i = 0; i < nodes.getLength(); i++) {
123 | list.add(nodes.item(i).getTextContent());
124 | }
125 |
126 | return list;
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/CsvUniqueCharacterProcessor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * --------------------------------------------------------------------------------------------------------------------
3 | *
4 | * Copyright (c) 2018 Supyrb. All rights reserved.
5 | *
6 | *
7 | * Johannes Deml
8 | * send@johannesdeml.com
9 | *
10 | * --------------------------------------------------------------------------------------------------------------------
11 | */
12 |
13 | import com.univocity.parsers.common.processor.ColumnProcessor;
14 | import com.univocity.parsers.csv.CsvParser;
15 | import com.univocity.parsers.csv.CsvParserSettings;
16 | import com.vdurmont.emoji.EmojiParser;
17 |
18 | import java.io.*;
19 | import java.util.HashMap;
20 | import java.util.HashSet;
21 | import java.util.List;
22 | import java.util.Map;
23 |
24 | public class CsvUniqueCharacterProcessor {
25 |
26 | private CsvParserSettings parserSettings;
27 | private ColumnProcessor rowProcessor;
28 | private CsvParser parser;
29 | private Map> uniqueLanguageCharacters;
30 | private HashSet allUniqueCharacters;
31 | private ConfigData config;
32 |
/**
 * Creates the csv parser together with its column processor and the initially empty character sets.
 */
public CsvUniqueCharacterProcessor() {
    parserSettings = defineParserSettings();
    // Has to run after parserSettings is assigned: the column processor is registered on those settings
    rowProcessor = defineProcessorSettings();
    parser = new CsvParser(parserSettings);
    uniqueLanguageCharacters = new HashMap<String, HashSet<Character>>();
    allUniqueCharacters = new HashSet<Character>();
}
40 |
41 | public void initialize(ConfigData config) {
42 | this.config = config;
43 | uniqueLanguageCharacters.clear();
44 | allUniqueCharacters.clear();
45 | }
46 |
47 | public void run() {
48 | String pathToInputCsv = config.getInPath();
49 | // Check for input file
50 | if(!fileExists(pathToInputCsv)) {
51 | System.out.println("Missing input file! The input path set in the config is " + pathToInputCsv);
52 | return;
53 | }
54 |
55 | parseInputFile(pathToInputCsv);
56 | extractUniqueCharacters();
57 |
58 | storeColumnCharacters();
59 | storeAllCharacters();
60 | storeCombinationCharacters();
61 | }
62 |
63 | private void extractUniqueCharacters() {
64 | Map> columnValues = rowProcessor.getColumnValuesAsMapOfNames();
65 |
66 | for (Map.Entry> columnEntry : columnValues.entrySet()) {
67 | String columnName = columnEntry.getKey();
68 | String fileName = generateValidFileName(columnName);
69 | if(ignoreColumn(fileName)) {
70 | continue;
71 | }
72 | List entries = columnEntry.getValue();
73 | System.out.println("Processing column " + columnName);
74 | HashSet uniqueCharacters = getUniqueCharacters(entries);
75 | allUniqueCharacters.addAll(uniqueCharacters);
76 | addIncludeCharacters(uniqueCharacters);
77 | removeExcludeCharacters(uniqueCharacters);
78 | uniqueLanguageCharacters.put(fileName, uniqueCharacters);
79 | }
80 | addIncludeCharacters(allUniqueCharacters);
81 | removeExcludeCharacters(allUniqueCharacters);
82 | }
83 |
84 | private void storeColumnCharacters() {
85 | for (Map.Entry>languageCharacters: uniqueLanguageCharacters.entrySet()) {
86 | String fileName = languageCharacters.getKey();
87 | HashSet uniqueCharacters = languageCharacters.getValue();
88 | storeAsText(fileName, uniqueCharacters);
89 | }
90 | }
91 |
92 | private void storeAllCharacters() {
93 | storeAsText("AllUniqueCharacters", allUniqueCharacters);
94 | }
95 |
96 | private void storeCombinationCharacters() {
97 | List combinedOutputTargets = config.getCombinedOutputTargets();
98 | for (CombinedOutputData combinedOutputData : combinedOutputTargets) {
99 | HashSet uniqueCharacters = new HashSet();
100 | String combinedDataName = combinedOutputData.getName();
101 | System.out.println("Processing combined data " + combinedDataName);
102 |
103 | List targetColumns = combinedOutputData.getColumns();
104 | for (String column : targetColumns) {
105 | if(!uniqueLanguageCharacters.containsKey(column)) {
106 | System.out.println("Missing column " + column + " for combinedColumn " + combinedDataName + " - ignoring column.");
107 | continue;
108 | }
109 | HashSet languageCharacters = uniqueLanguageCharacters.get(column);
110 | uniqueCharacters.addAll(languageCharacters);
111 | }
112 | storeAsText(combinedDataName, uniqueCharacters);
113 | }
114 | }
115 |
116 | private void storeAsText(String fileName, HashSet uniqueCharacters) {
117 | String uniqueCharacterString = getStringRepresentation(uniqueCharacters);
118 | System.out.println(fileName + " has " + uniqueCharacterString.length() + " unique characters");
119 | writeToFile(config.getOutPath() + fileName + ".txt", uniqueCharacterString);
120 | }
121 |
122 | private String generateValidFileName(String name) {
123 | return name.replaceAll("[\\\\/:*?\"<>|]", "_");
124 | }
125 |
126 | private boolean ignoreColumn(String columnName) {
127 | return columnName.toLowerCase().equals("id") || columnName.toLowerCase().equals("description");
128 | }
129 |
130 | private void parseInputFile(String pathToInputCsv) {
131 | parser.parse(getReader(pathToInputCsv));
132 | }
133 |
134 | private CsvParserSettings defineParserSettings() {
135 | CsvParserSettings parserSettings = new CsvParserSettings();
136 | parserSettings.getFormat().setLineSeparator("\n");
137 | parserSettings.setHeaderExtractionEnabled(true);
138 | return parserSettings;
139 | }
140 |
141 | private ColumnProcessor defineProcessorSettings() {
142 | ColumnProcessor rowProcessor = new ColumnProcessor();
143 | parserSettings.setProcessor(rowProcessor);
144 | return rowProcessor;
145 | }
146 |
147 | private boolean fileExists(String path) {
148 | File file = new File(path);
149 | if(!file.exists()) {
150 | return false;
151 | }
152 | return true;
153 | }
154 |
155 | private void writeToFile(String relativePath, String uniqueCharacterString) {
156 | PrintWriter out = null;
157 | try {
158 |
159 | File file = new File(relativePath);
160 | file.getParentFile().mkdirs();
161 | if(!file.exists()) {
162 | file.createNewFile();
163 | }
164 |
165 | out = new PrintWriter(file, "UTF-8");
166 | out.write(uniqueCharacterString);
167 | }
168 | catch (IOException e)
169 | {
170 | e.printStackTrace();
171 | }
172 | finally
173 | {
174 | if(out != null) {
175 | out.close();
176 | }
177 | }
178 | }
179 |
180 | private String getStringRepresentation(HashSet uniqueCharacters) {
181 | StringBuilder builder = new StringBuilder(uniqueCharacters.size());
182 | for (Character character: uniqueCharacters) {
183 | builder.append(character);
184 | }
185 | return builder.toString();
186 | }
187 |
188 | private HashSet getUniqueCharacters(List entries) {
189 | boolean includeUpperAndLowercaseVersion = config.isIncludeUpperAndLowerCase();
190 | HashSet uniqueCharacters = new HashSet();
191 | for (String entry : entries) {
192 | if(entry == null) {
193 | continue;
194 | }
195 | // Remove emojis
196 | entry = EmojiParser.removeAllEmojis(entry);
197 | // Remove newlines
198 | entry = entry.replaceAll("\n\r", "");
199 | entry = entry.replaceAll("\n", "");
200 | char[] characters = entry.toCharArray();
201 | for (int i = 0; i uniqueCharacters) {
219 | String excludeCharacters= config.getExcludeCharacters();
220 |
221 | for (int i = 0; i< excludeCharacters.length(); i++) {
222 | char c = excludeCharacters.charAt(i);
223 | uniqueCharacters.remove(c);
224 | }
225 | }
226 |
227 | private void addIncludeCharacters(HashSet uniqueCharacters) {
228 | String includeCharacters = config.getIncludeCharacters();
229 |
230 | for (int i = 0; i< includeCharacters.length(); i++) {
231 | char c = includeCharacters.charAt(i);
232 | uniqueCharacters.add(c);
233 | }
234 | }
235 |
236 | public Reader getReader(String relativePath) {
237 | try {
238 | InputStream inputStream = new FileInputStream(relativePath);
239 | return new InputStreamReader(inputStream, "UTF-8");
240 | } catch (Exception e) {
241 | e.printStackTrace();
242 | return null;
243 | }
244 | }
245 | }
246 |
--------------------------------------------------------------------------------