lineSegments = new ArrayList<>();
435 | if (line.length() < MAX_LENGTH) lineSegments.add(line);
436 | else { // split the line into short line segments
437 | Matcher matcher = SPLIT_PATTERN.matcher(line);
438 | while (matcher.find()) lineSegments.add(matcher.group());
439 | }
440 | return lineSegments;
441 | }
442 |
443 | /**
444 | * Returns a {@link Charset} wich name {@code charset}. This methods differs from
445 | * the {@link Charset#forName(String)} when {@code charset} is {@code null}, with
446 | * this method returning {@code null} while {@link Charset#forName(String)} throws
447 | * an NPE.
448 | *
449 | * @param charset
450 | * The name of the {@link Charset}.
451 | *
452 | * @return The {@link Charset} with name {@code charset}.
453 | *
454 | * @throws UnsupportedCharsetException
455 | * If the charset referred to by the given name is not supported.
456 | */
457 | private static Charset forName(String charset) throws UnsupportedCharsetException {
458 | if (charset == null) return null;
459 | return Charset.forName(charset);
460 | }
461 |
462 | /**
463 | * Returns the given {@link Charset} when non-null, or
464 | * {@link StandardCharsets#UTF_8} otherwise, since many applications using
465 | * {@link Charset} throws NPE if charset is {@code null}.
466 | *
467 | * @param charset
468 | * The given {@link Charset}.
469 | *
470 | * @return {@code charset} when non-null, {@link StandardCharsets#UTF_8} otherwise.
471 | */
472 | private static Charset getOrDefault(Charset charset) {
473 | return charset == null ? StandardCharsets.UTF_8 : charset;
474 | }
475 | }
476 |
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/util/StringUtils.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac.util;
2 |
/**
 * A utility class which deals with strings, converting arrays of code points to and
 * from strings.
 */
public class StringUtils {
    /**
     * Convert an array of code points to {@link String}.
     *
     * @param codePoints
     * 		The code points to convert.
     *
     * @return The converted {@link String}.
     */
    public static String toString(int... codePoints) {
        return toString(codePoints, 0, codePoints.length);
    }

    /**
     * Convert a range of an array of code points to {@link String}.
     *
     * @param codePoints
     * 		The code points to convert.
     * @param offset
     * 		The starting offset of {@code codePoints}.
     * @param len
     * 		The number of code points to convert.
     *
     * @return The converted {@link String}, indices which exceed {@code
     * codePoints.length} are discarded.
     */
    public static String toString(int[] codePoints, int offset, int len) {
        // The exclusive end index is computed in long arithmetic: offset + len can
        // overflow int (e.g. len == Integer.MAX_VALUE), which would otherwise yield
        // a negative bound and silently produce an empty string.
        long end = Math.min((long) offset + len, codePoints.length);
        StringBuilder sb = new StringBuilder();
        for (int i = offset; i < end; ++i) {
            sb.appendCodePoint(codePoints[i]);
        }
        return sb.toString();
    }

    /**
     * Convert a {@link String} to an array of code points.
     * <p>
     * Internally, data in {@link String} is stored in {@code char[]}, however for
     * Unicode code points greater than U+FFFF, one {@code char} (that is, two bytes)
     * is not enough. Therefore, Java uses surrogates to divide code points
     * that cannot be represented by one {@code char} into two. The problem is that
     * {@link String#length()} returns the number of {@code char}s, which is not
     * necessarily (though in most cases) equal to the number of code points stored
     * in the {@link String}.
     * <p>
     * The {@link String} class provides methods to retrieve the actual number of
     * code points and to access a code point by its code point index. However,
     * iterating through a {@link String} by code points that way is fairly
     * complicated, and it is much easier for applications if the string data is
     * stored as {@code int[]}, each element representing one code point. This is
     * exactly what this method does: take a {@link String} as input and convert it
     * into an {@code int[]} which contains exactly the same data.
     * <p>
     * It is recommended that all applications which iterate through the characters
     * stored in a {@link String} use
     * <pre>{@code
     * int[] codePoints = StringUtils.toCodePoints(str);
     * for (int codePoint : codePoints) // do something ...
     * }</pre>
     * instead of the traditional
     * <pre>{@code
     * for (int i = 0, length = str.length(); i < length; ++i) {
     *     char c = str.charAt(i);
     *     // do something ...
     * }
     * }</pre>
     *
     * @param str
     * 		The {@link String} to convert, may be {@code null}.
     *
     * @return The converted array of code points, or {@code null} if {@code str} is
     * {@code null}.
     */
    public static int[] toCodePoints(String str) {
        if (str == null) {
            return null;
        }
        // A single linear pass; looking up each code point through
        // offsetByCodePoints(0, i) would rescan the string from the start every time.
        return str.codePoints().toArray();
    }

    /**
     * Return the number of code points in the given {@link String}.
     *
     * @param str
     * 		The given {@link String}.
     *
     * @return The number of code points in {@code str}.
     */
    public static int codePointCount(String str) {
        return str.codePointCount(0, str.length());
    }

    /**
     * Return the {@code index}-th code point in the given {@link String}.
     *
     * @param str
     * 		The given {@link String}.
     * @param index
     * 		The index of the code point to return, counted in code points.
     *
     * @return The code point at {@code index}.
     *
     * @throws IndexOutOfBoundsException
     * 		If index is negative or greater than or equal to the number of code points
     * 		of {@code str}.
     */
    public static int codePointAt(String str, int index) {
        int charIndex = str.offsetByCodePoints(0, index);
        return str.codePointAt(charIndex);
    }
}
118 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/IAccessible.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import org.thunlp.thulac.io.IInputProvider;
4 | import org.thunlp.thulac.io.IOutputHandler;
5 | import org.thunlp.thulac.util.IOUtils;
6 |
7 | import java.io.FileInputStream;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.net.URI;
11 | import java.net.URISyntaxException;
12 | import java.net.URL;
13 | import java.nio.file.Files;
14 | import java.nio.file.Paths;
15 | import java.util.List;
16 | import java.util.stream.Collectors;
17 | import java.util.stream.Stream;
18 |
19 | /**
20 | * An interface which provides a set of common actions for resources and files used in
21 | * {@link TestHelper}. In practice, an {@code abstract class} is used instead of an
22 | * {@code interface} because interfaces does not allow private nested classes. Despite
23 | * of this, this class can be used just like an interface.
24 | */
25 | public abstract class IAccessible {
26 | /**
27 | * Create an instance of {@link IAccessible} with the given resource name.
28 | *
29 | * @param name
30 | * The resource name.
31 | *
32 | * @return The {@link IAccessible} created.
33 | *
34 | * @see Resources
36 | */
37 | public static IAccessible resourceAt(String name) {
38 | return new AccessibleResource(name);
39 | }
40 |
41 | /**
42 | * Create an instance of {@link IAccessible} with the given file name.
43 | *
44 | * @param name
45 | * The file name.
46 | *
47 | * @return The {@link IAccessible} created.
48 | */
49 | public static IAccessible fileAt(String name) {
50 | return new AccessibleFiles(name);
51 | }
52 |
53 | /**
54 | * Trim lines and remove empty ones.
55 | *
56 | * @param lines
57 | * The raw lines as {@link Stream}.
58 | *
59 | * @return The trimmed and non-empty lines as {@link List}.
60 | */
61 | private static List getLines(Stream lines) {
62 | return lines.map(String::trim)
63 | .filter(line -> !line.isEmpty())
64 | .collect(Collectors.toList());
65 | }
66 |
67 | /**
68 | * Implementation of {@link IAccessible} reading from resources.
69 | */
70 | private static class AccessibleResource extends IAccessible {
71 | private URI uri;
72 | private URL url;
73 |
74 | public AccessibleResource(String resourceName) {
75 | this.url = AccessibleResource.class.getResource(resourceName);
76 | try {
77 | this.uri = this.url.toURI();
78 | } catch (URISyntaxException ignored) { // should not happen
79 | }
80 | }
81 |
82 | @Override
83 | public List getLines() throws IOException {
84 | return IAccessible.getLines(Files.lines(Paths.get(this.uri)));
85 | }
86 |
87 | @Override
88 | public IOutputHandler toOutputHandler() throws IOException {
89 | throw new UnsupportedOperationException("Output not supported on resources!");
90 | }
91 |
92 | @Override
93 | public InputStream toInputStream() throws IOException {
94 | return this.url.openStream();
95 | }
96 | }
97 |
98 | /**
99 | * Implementation of {@link IAccessible} reading from and writing to files.
100 | */
101 | private static class AccessibleFiles extends IAccessible {
102 | private String filename;
103 |
104 | public AccessibleFiles(String filename) {
105 | this.filename = filename;
106 | }
107 |
108 | @Override
109 | public List getLines() throws IOException {
110 | return Files.readAllLines(Paths.get(this.filename));
111 | }
112 |
113 | @Override
114 | public IInputProvider toInputProvider() throws IOException {
115 | return IOUtils.inputFromFile(this.filename);
116 | }
117 |
118 | @Override
119 | public IOutputHandler toOutputHandler() throws IOException {
120 | return IOUtils.outputToFile(this.filename);
121 | }
122 |
123 | @Override
124 | public InputStream toInputStream() throws IOException {
125 | return new FileInputStream(this.filename);
126 | }
127 | }
128 |
129 | /**
130 | * Return the content of this resource / file separated into individual lines.
131 | *
132 | * @return Content of this resource / file as a list of strings.
133 | */
134 | public abstract List getLines() throws IOException;
135 |
136 | /**
137 | * Create a {@link IInputProvider} with this resource / file.
138 | *
139 | * @return The {@link IInputProvider} created.
140 | */
141 | public IInputProvider toInputProvider() throws IOException {
142 | return IOUtils.inputFromInputStream(this.toInputStream());
143 | }
144 |
145 | /**
146 | * Create a {@link IOutputHandler} with this resource / file.
147 | *
148 | * @return The {@link IOutputHandler} created.
149 | */
150 | public abstract IOutputHandler toOutputHandler() throws IOException;
151 |
152 | /**
153 | * Create a {@link InputStream} with this resource / file.
154 | *
155 | * @return The {@link InputStream} created.
156 | */
157 | public abstract InputStream toInputStream() throws IOException;
158 | }
159 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/MainAlt.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import joptsimple.OptionException;
4 | import joptsimple.OptionParser;
5 | import joptsimple.OptionSet;
6 | import joptsimple.OptionSpec;
7 | import org.thunlp.thulac.io.IInputProvider;
8 | import org.thunlp.thulac.io.IOutputHandler;
9 | import org.thunlp.thulac.util.IOUtils;
10 |
11 | import java.io.IOException;
12 |
13 | import static java.util.Arrays.asList;
14 |
15 | /**
16 | * A test class of the CLI (Command Line Interface), using
17 | * Jopt Simple to parse command
18 | * line input.
19 | */
20 | public class MainAlt {
21 | private static final String SEG_ONLY_DESC = "Output segments only";
22 | private static final String T2S_DESC = "Convert traditional to simplified Chinese";
23 | private static final String FILTER_DESC = "Use filter for output";
24 | private static final String INPUT_DESC = "Path to the input file";
25 | private static final String OUTPUT_DESC = "Path to the output file";
26 | private static final String USER_DICT_DESC = "The user-specified dictionary";
27 | private static final String DELIMITER_DESC = "The separator between words and tags";
28 | private static final String MODEL_DIR_DESC = "Path for models directory";
29 | private static final String HELP_DESC = "Show help";
30 |
31 | public static void main(String[] args) throws IOException {
32 | OptionParser parser = new OptionParser();
33 |
34 | parser.accepts("seg_only", SEG_ONLY_DESC);
35 | parser.accepts("t2s", T2S_DESC);
36 | parser.accepts("filter", FILTER_DESC);
37 | OptionSpec iOpt = parser.acceptsAll(
38 | asList("input", "i"), INPUT_DESC).withRequiredArg();
39 | OptionSpec oOpt = parser.acceptsAll(
40 | asList("output", "o"), OUTPUT_DESC).withRequiredArg();
41 | OptionSpec userDictOpt = parser.acceptsAll(
42 | asList("user_dict", "dict", "user"), USER_DICT_DESC).withRequiredArg();
43 | OptionSpec dOpt = parser.acceptsAll(
44 | asList("delimiter", "delim", "deli"), DELIMITER_DESC).withRequiredArg();
45 | OptionSpec modelDirOpt = parser.acceptsAll(
46 | asList("model_dir", "model"), MODEL_DIR_DESC).withRequiredArg();
47 | parser.acceptsAll(asList("help", "?", "h"), HELP_DESC).forHelp();
48 |
49 | OptionSet opts = parser.parse(args);
50 |
51 | if (opts.has("help")) parser.printHelpOn(System.out);
52 | else try {
53 | char separator = opts.valueOf(dOpt).charAt(0);
54 | boolean segOnly = opts.has("seg_only");
55 | boolean useT2S = opts.has("t2s");
56 | boolean useFilter = opts.has("filter");
57 |
58 | IInputProvider input;
59 | if (opts.has(iOpt)) input = IOUtils.inputFromFile(opts.valueOf(iOpt));
60 | else input = IOUtils.inputFromConsole();
61 | IOutputHandler output;
62 | if (opts.has(oOpt)) output = IOUtils.outputToFile(opts.valueOf(oOpt));
63 | else output = IOUtils.outputToConsole();
64 |
65 | String userDict = opts.valueOf(userDictOpt);
66 | String modelDir = opts.valueOf(modelDirOpt);
67 |
68 | Thulac.split(modelDir, separator, userDict,
69 | useT2S, segOnly, useFilter, input, output);
70 | } catch (OptionException e) {
71 | parser.printHelpOn(System.out);
72 | }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/ProfilerInputProvider.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import org.thunlp.thulac.io.IInputProvider;
4 |
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.Iterator;
8 | import java.util.List;
9 |
10 | /**
11 | * An implementation of {@link IInputProvider}, used in profiler to reduce time
12 | * consumed by IO operations, wrapping outside another {@link IInputProvider}, reading
13 | * the lines provided in advance and store them in memory. Note that they might lead to
14 | * high memory usage for large files.
15 | */
16 | public class ProfilerInputProvider implements IInputProvider {
17 | private Iterator> linesIterator;
18 |
19 | public ProfilerInputProvider(IInputProvider inputProvider) throws IOException {
20 | List> lines = new ArrayList<>();
21 | for (List lineSegments = inputProvider.provideInput();
22 | lineSegments != null; lineSegments = inputProvider.provideInput())
23 | lines.add(lineSegments);
24 | this.linesIterator = lines.iterator();
25 | }
26 |
27 | @Override
28 | public void onProgramStart() {
29 | }
30 |
31 | @Override
32 | public void onProgramEnd() {
33 | }
34 |
35 | @Override
36 | public List provideInput() throws IOException {
37 | if (this.linesIterator.hasNext()) return this.linesIterator.next();
38 | else return null;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/ProfilerOutputHandler.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import org.thunlp.thulac.data.TaggedWord;
4 | import org.thunlp.thulac.io.IOutputHandler;
5 |
6 | import java.io.IOException;
7 | import java.util.List;
8 |
9 | /**
10 | * An empty {@link IOutputHandler}, used in profiler to reduce time consumed by IO
11 | * operations.
12 | */
13 | public class ProfilerOutputHandler implements IOutputHandler {
14 | @Override
15 | public void onProgramStart() {
16 | }
17 |
18 | @Override
19 | public void onProgramEnd() {
20 | }
21 |
22 | @Override
23 | public void handleLineSegment(List words,
24 | boolean segOnly, char separator) {
25 | }
26 |
27 | @Override
28 | public void handleLineStart() throws IOException {
29 | }
30 |
31 | @Override
32 | public void handleLineEnd() throws IOException {
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/TestHelper.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import org.thunlp.thulac.io.IInputProvider;
4 | import org.thunlp.thulac.io.IOutputHandler;
5 | import org.thunlp.thulac.util.StringUtils;
6 |
7 | import java.io.IOException;
8 | import java.nio.file.Files;
9 | import java.nio.file.Paths;
10 | import java.util.ArrayList;
11 | import java.util.List;
12 |
13 | import static org.junit.Assert.assertEquals;
14 | import static org.junit.Assert.assertTrue;
15 |
16 | /**
17 | * Helper class for THULAC tests.
18 | */
19 | public class TestHelper {
20 | /**
21 | * Run the segmentation program, write the output to the given position and
22 | * calculate the accuracy of the program.
23 | *
24 | * @param inputFile
25 | * The {@link IInputProvider} used as input.
26 | * @param compareFile
27 | * The {@link IInputProvider} used as answer.
28 | * @param outputFile
29 | * The {@link IOutputHandler} used as output.
30 | *
31 | * @throws IOException
32 | * If an error occurs while I/O.
33 | */
34 | public static void testSuite(
35 | IAccessible inputFile, IAccessible compareFile, IAccessible outputFile)
36 | throws IOException {
37 | run(inputFile, outputFile, true);
38 | compare(inputFile, compareFile, outputFile);
39 | }
40 |
41 | /**
42 | * Runs the segmentation program with given input and output and the {@code
43 | * segOnly} flag and output execution time.
44 | *
45 | * @param input
46 | * The {@link IAccessible} used as input.
47 | * @param output
48 | * The {@link IAccessible} used as output.
49 | * @param segOnly
50 | * Whether to output segments only.
51 | *
52 | * @throws IOException
53 | * If one of the model files failed to load.
54 | */
55 | public static void run(IAccessible input, IAccessible output, boolean segOnly)
56 | throws IOException {
57 | IInputProvider inputProvider = input.toInputProvider();
58 | IOutputHandler outputHandler = output.toOutputHandler();
59 | run(inputProvider, outputHandler, segOnly);
60 | }
61 |
62 | /**
63 | * Runs the segmentation program with given input and output and the {@code
64 | * segOnly} flag and output execution time.
65 | *
66 | * @param input
67 | * The {@link IInputProvider} used as input.
68 | * @param output
69 | * The {@link IOutputHandler} used as output.
70 | * @param segOnly
71 | * Whether to output segments only.
72 | *
73 | * @throws IOException
74 | * If one of the model files failed to load.
75 | */
76 | public static void run(IInputProvider input, IOutputHandler output, boolean segOnly)
77 | throws IOException {
78 | long time = -System.currentTimeMillis();
79 | Thulac.split(input, output, segOnly);
80 | time += System.currentTimeMillis();
81 | System.out.printf("Time elapsed: %dms\n", time);
82 | }
83 |
84 | /**
85 | * Runs the segmentation program in profiler mode, that is, provide fastest input
86 | * and output to measure the actual time consumed by the program. Note that this
87 | * method does not output the result, use {@link #run(IInputProvider, IOutputHandler,
88 | * boolean)} or {@link #run(IAccessible, IAccessible, boolean)} if the result must be
89 | * used afterwards.
90 | *
91 | * @param input
92 | * The {@link IAccessible} used as input.
93 | * @param segOnly
94 | * Whether to output segments only.
95 | *
96 | * @throws IOException
97 | * If one of the model files failed to load.
98 | */
99 | public static void runProfiler(IAccessible input, boolean segOnly)
100 | throws IOException {
101 | run(new ProfilerInputProvider(input.toInputProvider()),
102 | new ProfilerOutputHandler(), segOnly);
103 | }
104 |
105 | /**
106 | * Compare the output file and the answer file ({@code compareFile}) and calculate
107 | * accuracy.
108 | * The comparison is done in such a way that, extracting split results from the
109 | * files, the number of split positions in the output file which also exist in
110 | * the compare file are counted.
111 | * This method requires outputFile to be generated with flag -seg_only
112 | *
113 | * @param inputFile
114 | * The {@link IAccessible} used as input.
115 | * @param compareFile
116 | * The {@link IAccessible} used as answer.
117 | * @param outputFile
118 | * The {@link IAccessible} used as output.
119 | *
120 | * @throws IOException
121 | * If an exception was thrown while reading the lines from {@code inputFile},
122 | * {@code compareFile} or {@code outputFile}.
123 | */
124 | public static void compare(
125 | IAccessible inputFile, IAccessible compareFile, IAccessible outputFile)
126 | throws IOException {
127 | // ADDITIONAL TO JAVADOC: ( *XXX* means XXX is a variable )
128 | // In other words, set *matches* to 0 initially. If THULAC splits input at
129 | // point A and so will a human, increase *matches* by one.
130 | // *total* is the number of total split segments in the answer, while
131 | // *segments* is that of the output from THULAC.
132 | // Accuracy is computed dividing *matches* by *total*, that is,
133 | // accuracy = matches / total * 100%
134 | // *segments* is strictly greater than *matches*, therefore
135 | // segments - matches
136 | // represent the number of wrongly split segments.
137 |
138 | List input = inputFile.getLines();
139 | List output = outputFile.getLines();
140 | List compare = compareFile.getLines();
141 |
142 | int lines = input.size();
143 | List> outputSeg = extractSegments(input, output);
144 | List> compareSeg = extractSegments(input, compare);
145 | int matches = 0, segments = outputSeg.stream().mapToInt(List::size).sum(),
146 | total = compareSeg.stream().mapToInt(List::size).sum();
147 | for (int i = 0; i < lines; ++i) {
148 | List outputLine = outputSeg.get(i);
149 | List compareLine = compareSeg.get(i);
150 | matches += outputLine.stream().filter(compareLine::contains).count();
151 | }
152 |
153 | System.out.printf("Result: %d total, %d segments, %d matches, %.2f%% accuracy\n",
154 | total, segments, matches, 100f * matches / total);
155 | }
156 |
157 | private static List> extractSegments(
158 | List input, List result) {
159 | List> segments = new ArrayList<>();
160 | assertEquals("Line count of input and result doesn't match",
161 | input.size(), result.size());
162 | for (int i = 0, size = input.size(); i < size; ++i)
163 | segments.add(extractSegments(input.get(i), result.get(i)));
164 | return segments;
165 | }
166 |
167 | private static List extractSegments(
168 | String input, String result) {
169 | // It is required that the result contains all the characters (code points)
170 | // that exist in the input. This also means that the input should not contain
171 | // whitespaces (ASCII space U+0020 and Chinese fullwidth space U+3000),
172 | // otherwise the behavior of the program is undefined.
173 | // If a character in the input if not found in the output, than an
174 | // AssertionError is thrown with a message which provides more details.
175 |
176 | // In addition, the result of splitting the input is represent by a list of
177 | // integers, each one, say N, means that the program finds it appropriate to
178 | // split the input after the Nth code Point.
179 | // To make it easier to understand, if N and M are two adjacent integers in the
180 | // returned list, then the Nth (inclusive) to the Mth (exclusive) code points
181 | // of the input together make a Chinese word.
182 |
183 | List segments = new ArrayList<>();
184 | int[] cp1 = StringUtils.toCodePoints(input),
185 | cp2 = StringUtils.toCodePoints(result);
186 | int pointer = 0, len1 = cp1.length, len2 = cp2.length;
187 | assertTrue("Result shorter than input!", len1 <= len2);
188 |
189 | int i = 0;
190 | for (; i < len1 && pointer < len2; ++i, ++pointer) {
191 | int c = cp1[i];
192 | if (cp2[pointer] == c) continue;
193 | segments.add(i);
194 | for (; pointer < len2 && cp2[pointer] != c; ++pointer) ;
195 | if (pointer == len2) throw new AssertionError(
196 | new StringBuilder("Character '").appendCodePoint(c)
197 | .append("' not found in result string!\n")
198 | .append("Input: ").append(input)
199 | .append("Result: ").append(result).toString());
200 | }
201 | if (i != len1) throw new AssertionError(
202 | new StringBuilder("Character '").appendCodePoint(cp1[i])
203 | .append("' not found in result string!\n")
204 | .append("Input: ").append(input)
205 | .append("Result: ").append(result).toString());
206 |
207 | return segments;
208 | }
209 |
210 | private static final String RESOURCES_DIRECTORY = "/";
211 | // the temp directory used to store output files
212 | private static final String TEMP_DIRECTORY = "build/tmp/tests/";
213 |
214 | static {
215 | try { // create tmp directory, otherwise IOException would be thrown
216 | Files.createDirectories(Paths.get(TEMP_DIRECTORY));
217 | } catch (IOException e) {
218 | throw new RuntimeException("Unable to create temp directory!", e);
219 | }
220 | }
221 |
222 | public static IAccessible fileAt(String name) {
223 | return IAccessible.fileAt(name);
224 | }
225 |
226 | public static IAccessible tempAt(String name) {
227 | return fileAt(TEMP_DIRECTORY + name);
228 | }
229 |
230 | public static IAccessible resourceAt(String name) {
231 | return IAccessible.resourceAt(RESOURCES_DIRECTORY + name);
232 | }
233 | }
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/Tests.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import org.junit.Test;
4 |
5 | import java.io.IOException;
6 |
7 | /**
8 | * Container for the THULAC JUnit test cases. All cases below are currently commented out; most of them need test files that are not checked in (see the note inside the class).
9 | */
10 | public class Tests {
11 | // test files excluded from git for copyright reasons, users may download them here:
12 | // http://rsarxiv.github.io/2016/11/29/%E4%B8%AD%E6%96%87%E5%88%86%E8%AF%8D%E5%B7%A5%E5%85%B7%E6%B5%8B%E8%AF%84/
13 |
14 | // @Test
15 | // public void test1() throws IOException {
16 | // TestHelper.run(TestHelper.resourceAt("input_1.txt"),
17 | // TestHelper.tempAt("output_1.txt"), false);
18 | // }
19 |
20 | // @Test
21 | // public void test2() throws IOException {
22 | // TestHelper.testSuite(TestHelper.resourceAt("input_2.txt"),
23 | // TestHelper.resourceAt("compare_2.txt"),
24 | // TestHelper.tempAt("output_2.txt"));
25 | // }
26 |
27 | // @Test
28 | // public void test3() throws IOException {
29 | // // non-Chinese users may see the following line rendered strangely,
30 | // // nevertheless it is only a simple Chinese sentence.
31 | // System.out.println(Thulac.split("今天,中国人民站起来了!", true));
32 | // }
33 |
34 | // @Test
35 | // public void test4() throws IOException {
36 | // TestHelper.runProfiler(TestHelper.resourceAt("input_2.txt"), true);
37 | // }
38 | }
39 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/data/Dat2WordsConverter.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac.data;
2 |
3 | import org.junit.Test;
4 | import org.thunlp.thulac.util.StringUtils;
5 |
6 | import java.io.IOException;
7 | import java.io.PrintWriter;
8 | import java.nio.file.Files;
9 | import java.nio.file.Path;
10 | import java.nio.file.Paths;
11 | import java.util.AbstractMap;
12 | import java.util.Comparator;
13 | import java.util.List;
14 | import java.util.Stack;
15 | import java.util.regex.Matcher;
16 | import java.util.regex.Pattern;
17 | import java.util.stream.Stream;
18 |
19 | /**
20 | * A class which converts {@link Dat} files generated by {@link DatMaker} inversely to a
21 | * list
22 | * of words.
23 | */
24 | public class Dat2WordsConverter {
25 | /**
26 | * Converts the given {@link Dat} file generated by {@link DatMaker} to words plus
27 | * line numbers and output them through {@code writer}.
28 | *
29 | * @param dat
30 | * The {@link Dat} file to convert.
31 | * @param writer
32 | * The {@link PrintWriter} to output words plus line numbers to.
33 | * @param ln
34 | * Whether to output line numbers.
35 | */
36 | private static void convert(Dat dat, PrintWriter writer, boolean ln) {
37 | traverseTrieTree(dat, writer, 0, new Stack<>(), ln);
38 | }
39 |
40 | /**
41 | * Traverse within the Trie Tree specified by the {@link Dat} file. The file is
42 | * assumed to be generated correctly using {@link DatMaker}, otherwise the behavior
43 | * is undefined. Along the traversing, the words plus line numbers stored within this
44 | * Trie Tree are output using {@linkplain PrintWriter#println(String)
45 | * writer.println()}.
46 | * This method calls itself recursively.
47 | *
48 | * @param dat
49 | * The {@link Dat} file.
50 | * @param writer
51 | * The {@link PrintWriter} to output words to.
52 | * @param index
53 | * The index of the node to traverse.
54 | * @param prefix
55 | * The current prefix of this node, as a list of code points.
56 | * @param ln
57 | * Whether to output line numbers
58 | */
59 | private static void traverseTrieTree(
60 | Dat dat, PrintWriter writer, int index, Stack prefix, boolean ln) {
61 | int[] d = dat.dat;
62 | int base = d[index << 1], length = dat.datSize;
63 | if (d[(base << 1) + 1] == index && !prefix.isEmpty()) {
64 | writer.print(toString(prefix));
65 | if (ln) {
66 | writer.print(' ');
67 | writer.println(d[base << 1]); // line number
68 | } else writer.println();
69 | }
70 | for (int i = base + 1; i < length; ++i)
71 | if (d[(i << 1) + 1] == index) {
72 | prefix.push(i - base);
73 | traverseTrieTree(dat, writer, i, prefix, ln);
74 | prefix.pop();
75 | }
76 | }
77 |
78 | /**
79 | * Converts an list of code points to a {@link String}.
80 | *
81 | * @param codePoints
82 | * The list of code pointe.
83 | *
84 | * @return The converted {@link String}.
85 | *
86 | * @see StringUtils#toString(int...)
87 | */
88 | private static String toString(List codePoints) {
89 | StringBuilder sb = new StringBuilder();
90 | for (int codePoint : codePoints) sb.appendCodePoint(codePoint);
91 | return sb.toString();
92 | }
93 |
94 | /**
95 | * Convert dat file at models/<name>.dat to words and save converted result
96 | * to build/tmp/tests/<name>_text.txt.
97 | *
98 | * @param name
99 | * The name of the DAT file.
100 | * @param ln
101 | * Whether to output line numbers.
102 | *
103 | * @throws IOException
104 | * If an I/O error occurs.
105 | */
106 | private static void convertAndSave(String name, boolean ln) throws IOException {
107 | Dat dat = new Dat("models/" + name + ".dat");
108 | PrintWriter writer = new PrintWriter(Files.newBufferedWriter(
109 | Paths.get("build/tmp/tests/" + name + "_text.txt")));
110 | convert(dat, writer, ln);
111 | writer.close();
112 | }
113 |
114 | private static Pattern LINE_PATTERN = Pattern.compile("^(.*)\\s(\\d+)$");
115 |
116 | /**
117 | * Read file generated by {@link #convertAndSave(String, boolean)} at
118 | * build/tmp/tests/<name>_text.txt and sort the words comparing the
119 | * corresponding line numbers. Every line of the input file should match {@link
120 | * #LINE_PATTERN}, while the first group being the word and the second
121 | * group being the line number. The sorted result is output to
122 | * build/tmp/tests/<name>_sorted.txt with the line numbers removed,
123 | * containing the words only.
124 | * Since the {@link Dat} file as input to {@link #convertAndSave(String, boolean)} is
125 | * assumed to be generated using {@link DatMaker#readFromTxtFile(String)}, which
126 | * reads from a text file containing a word on each line, the file generated by
127 | * this method should be identical to the input file provided to {@link
128 | * DatMaker#readFromTxtFile(String)}.
129 | *
130 | * @param name
131 | * The name of the converted file.
132 | *
133 | * @throws IOException
134 | * If an I/O error occurs.
135 | */
136 | private static void sortAndSave(String name) throws IOException {
137 | // This method makes excessive use of the Java 8 Stream API, advanced knowledge
138 | // of streams is required to read the following code.
139 |
140 | Files.write(Paths.get("build/tmp/tests/" + name + "_sorted.txt"),
141 | (Iterable) Files.lines(
142 | Paths.get("build/tmp/tests/" + name + "_text.txt"))
143 | .map(line -> {
144 | Matcher matcher = LINE_PATTERN.matcher(line);
145 | if (!matcher.find()) return null;
146 | return new AbstractMap.SimpleEntry<>(
147 | Integer.parseInt(matcher.group(2)),
148 | matcher.group(1));
149 | })
150 | .sorted(Comparator.comparingInt(AbstractMap.SimpleEntry::getKey))
151 | .map(AbstractMap.SimpleEntry::getValue)::iterator);
152 | }
153 |
154 | /**
155 | * Convert a stream of {@link Dat} files specified by {@code datFiles} to words plus
156 | * line numbers using {@link #convertAndSave(String, boolean)} and then sort the
157 | * lines using {@link #sortAndSave(String)}. This method output messages to {@link
158 | * System#out} while executing.
159 | *
160 | * @param datFiles
161 | * The stream of {@link Dat} files, for each {@link String} in {@code datFiles},
162 | * for example, {@code "example"}, the input {@link Dat} file is at {@code
163 | * models/example.dat}, the converted file is at {@code
164 | * build/tmp/tests/example_text.txt}, and the sorted file is at {@code
165 | * build/tmp/tests/example_sorted.txt}.
166 | */
167 | private void convertAndSort(Stream datFiles) {
168 | datFiles.forEach(datFile -> {
169 | try {
170 | System.out.printf("Converting dat file %s.dat\n", datFile);
171 | convertAndSave(datFile, true);
172 | System.out.printf("Sorting dat file build/tmp/tests/%s_text.dat\n",
173 | datFile);
174 | sortAndSave(datFile);
175 | } catch (IOException e) {
176 | e.printStackTrace();
177 | }
178 | });
179 | }
180 |
181 | // @Test
182 | // public void test() throws IOException {
183 | // convertAndSort(Files.list(Paths.get("models/"))
184 | // .parallel()
185 | // .map(Path::getFileName)
186 | // .map(Path::toString)
187 | // .map(String::toLowerCase)
188 | // .filter(filename -> filename.endsWith(".dat"))
189 | // .map(filename -> filename.substring(0, filename.length() - 4))
190 | // .filter(filename -> !"t2s".equals(filename)) // not Dat file
191 | // .filter(filename -> !"idiom".equals(filename))); // not DatMaker
192 | // // idiom.dat is correct Dat file however not generated by DatMaker
193 | // System.out.println("Converting dat file idiom.dat");
194 | // convertAndSave("idiom", false);
195 | // }
196 | }
197 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/data/DatMakerTest.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac.data;
2 |
3 | import org.junit.Test;
4 | import org.thunlp.thulac.IAccessible;
5 | import org.thunlp.thulac.TestHelper;
6 |
7 | import java.io.IOException;
8 | import java.util.List;
9 |
10 | import static org.junit.Assert.assertTrue;
11 |
12 | /**
13 | *
14 | */
15 | public class DatMakerTest {
16 | @Test
17 | public void test() throws IOException {
18 | IAccessible file = TestHelper.resourceAt("dat_maker_test_1.txt");
19 | Dat dat = DatMaker.readFromInputStream(file.toInputStream());
20 | List lines = file.getLines();
21 | for (String line : lines) assertTrue(line, dat.contains(line));
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/util/CodePointUtilsTest.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac.util;
2 |
3 | import org.junit.Test;
4 |
5 | import java.util.Arrays;
6 |
7 | import static org.junit.Assert.assertNotEquals;
8 |
9 | /**
10 | *
11 | */
12 | public class CodePointUtilsTest {
13 | // the original one
14 | private static final String OTHER_CODE_POINTS =
15 | StringUtils.toString(65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217,
16 | 8220, 8221, 12304, 12305, 12289, 12298, 12299, 126, 183, 64, 124, 35,
17 | 65509, 37, 8230, 38, 42, 65288, 65289, 8212, 45, 43, 61, 44, 46, 60,
18 | 62, 63, 47, 33, 59, 58, 39, 34, 123, 125, 91, 93, 92, 124, 35, 36, 37,
19 | 94, 38, 42, 40, 41, 95, 45, 43, 61, 9700, 9734, 9733, 65, 66, 67,
20 | 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
21 | 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105,
22 | 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
23 | 120, 121, 122, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57);
24 |
25 | @Test
26 | public void test() {
27 | // equality test
28 | Arrays.stream(StringUtils.toCodePoints(OTHER_CODE_POINTS))
29 | .forEach(ch -> assertNotEquals(String.valueOf(ch),
30 | -1, CodePointUtils.SPECIAL_CHARS.indexOf(ch)));
31 | Arrays.stream(StringUtils.toCodePoints(CodePointUtils.SPECIAL_CHARS))
32 | .forEach(ch -> assertNotEquals(String.valueOf(ch),
33 | -1, OTHER_CODE_POINTS.indexOf(ch)));
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/resources/dat_maker_test_1.txt:
--------------------------------------------------------------------------------
1 | A
2 | AB
3 | ABC
4 | AC
5 | AD
6 | AE
7 | B
8 | BC
9 | BCDEFG
10 | BCDEGF
11 | BBCCDD
12 | BE
13 | BF
14 | BFF
15 | C
16 | D
17 | E
18 | F
19 | G
--------------------------------------------------------------------------------