├── src
├── main
│ ├── resources
│ │ └── models
│ │ │ └── sentencepiece.bpe.model
│ └── java
│ │ └── com
│ │ └── sentencepiece
│ │ ├── Scoring.java
│ │ ├── TokenType.java
│ │ ├── ResultBuilderImpl.java
│ │ ├── Piece.java
│ │ ├── TrieNode.java
│ │ ├── SegmentEnd.java
│ │ ├── SentencePieceProcessor.java
│ │ ├── SentencePieceAlgorithm.java
│ │ └── Model.java
└── test
│ └── java
│ ├── Tests2.java
│ ├── Tests.java
│ └── Main.java
├── .idea
├── vcs.xml
├── .gitignore
├── encodings.xml
├── misc.xml
└── uiDesigner.xml
├── .gitignore
├── .github
└── workflows
│ └── maven.yml
├── README.md
├── pom.xml
└── LICENSE
/src/main/resources/models/sentencepiece.bpe.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eix128/SentencePiece4J/HEAD/src/main/resources/models/sentencepiece.bpe.model
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/Scoring.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
/**
 * Strategy used to rank competing segmentations of the same input.
 */
public enum Scoring {
    /** Favor paths with fewer segments; the summed piece score is a secondary term. */
    FEWEST_SEGMENTS,

    /** Favor the path whose summed piece scores are highest. */
    HIGHEST_SCORE
}
7 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/TokenType.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 |
/**
 * Category of a vocabulary piece as used by the trie and the segmenter.
 */
public enum TokenType {
    /** The unknown piece, also used for characters the vocabulary cannot cover. */
    UNKNOWN,

    /** Ordinary text piece (the model's NORMAL type). */
    TEXT,

    /** Piece supplied by the user rather than learned. */
    USER_DEFINED,

    /** Control piece. */
    CONTROL,

    /** Byte piece. */
    BYTE,

    /** Piece that must be skipped during segmentation. */
    UNUSED
}
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | !.mvn/wrapper/maven-wrapper.jar
3 | !**/src/main/**/target/
4 | !**/src/test/**/target/
5 |
6 | ### IntelliJ IDEA ###
7 | .idea/modules.xml
8 | .idea/jarRepositories.xml
9 | .idea/compiler.xml
10 | .idea/libraries/
11 | *.iws
12 | *.iml
13 | *.ipr
14 |
15 | ### Eclipse ###
16 | .apt_generated
17 | .classpath
18 | .factorypath
19 | .project
20 | .settings
21 | .springBeans
22 | .sts4-cache
23 |
24 | ### NetBeans ###
25 | /nbproject/private/
26 | /nbbuild/
27 | /dist/
28 | /nbdist/
29 | /.nb-gradle/
30 | build/
31 | !**/src/main/**/build/
32 | !**/src/test/**/build/
33 |
34 | ### VS Code ###
35 | .vscode/
36 |
37 | ### Mac OS ###
38 | .DS_Store
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/ResultBuilderImpl.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | public class ResultBuilderImpl implements SentencePieceAlgorithm.ResultBuilder {
7 | private final List tokenIds = new ArrayList<>();
8 |
9 | @Override
10 | public void build(String input, SegmentEnd[] segmentEnds, boolean collapseUnknowns) {
11 | int i = segmentEnds.length - 1;
12 | while (i > 0) {
13 | SegmentEnd end = segmentEnds[i];
14 | tokenIds.add(0, end.getId());
15 | i = end.getSegmentStart();
16 | }
17 | }
18 |
19 | public List getTokenIds() {
20 | return tokenIds;
21 | }
22 | }
--------------------------------------------------------------------------------
/src/test/java/Tests2.java:
--------------------------------------------------------------------------------
1 | import com.sentencepiece.SentencePieceProcessor;
2 |
3 | import java.io.IOException;
4 | import java.nio.file.Path;
5 | import java.nio.file.Paths;
6 | import java.util.List;
7 |
8 | public class Tests2 {
9 | public static void main(String[] args) throws IOException {
10 | Path modelPath = Paths.get("models/sentencepiece.bpe.model");
11 | SentencePieceProcessor processor = new SentencePieceProcessor(modelPath);
12 |
13 | String raw = "Akşam eve gidince yağlı ballı ekmek yemek istiyorum.";
14 | List ids = processor.encode(raw);
15 |
16 | System.out.println("IDs: " + ids);
17 | System.out.println("Decoded: " + processor.decode(ids));
18 | System.out.println("Escaped: " + processor.decodeSmart(ids));
19 |
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
1 | name: Java CI with Maven (Java 8)
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 |
9 | jobs:
10 | build:
11 |
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v4
16 |
17 | - name: Set up JDK 8
18 | uses: actions/setup-java@v4
19 | with:
20 | java-version: '8'
21 | distribution: 'temurin'
22 | cache: maven
23 |
24 | - name: Build with Maven
25 | run: mvn -B package --file pom.xml
26 |
27 | # Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive
28 | - name: Update dependency graph
29 | uses: advanced-security/maven-dependency-submission-action@571e99aab1055c2e71a1e2309b9691de18d6b7d6
30 |
--------------------------------------------------------------------------------
/src/test/java/Tests.java:
--------------------------------------------------------------------------------
1 | import com.sentencepiece.Model;
2 |
3 | import java.io.IOException;
4 | import java.nio.file.Path;
5 | import java.nio.file.Paths;
6 | import java.util.List;
7 |
8 |
9 | public class Tests {
10 | public static void main(String[] args) throws IOException {
11 | Path modelPath = Paths.get("models/sentencepiece.bpe.model");
12 |
13 | // Load the model
14 | System.out.println("Loading model...");
15 | Model model = Model.parseFrom(modelPath);
16 | System.out.println("Model loaded with maxScore = " + model.getMaxScore());
17 |
18 | // Test encode
19 | String text = "▁this ▁is ▁a ▁test";
20 | List ids = model.encode(text);
21 | System.out.println("Encoded: " + ids);
22 |
23 | // Test decode
24 | String decoded = model.decode(ids);
25 | System.out.println("Decoded: " + decoded);
26 |
27 | // Extra: reverse check
28 | for (int id : ids) {
29 | System.out.printf("id: %d → token: '%s'%n", id, model.getTokenById(id));
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/Piece.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 |
4 | import sentencepiece.SentencepieceModel;
5 |
6 | public class Piece {
7 | private final String token;
8 | private final int id;
9 | private final float score;
10 | private final TokenType type; // NEW
11 |
12 | public Piece(String token, int id, float score, TokenType type) {
13 | this.token = token;
14 | this.id = id;
15 | this.score = score;
16 | this.type = type;
17 | }
18 |
19 | public String getToken() { return token; }
20 | public int getId() { return id; }
21 | public float getScore() { return score; }
22 | public TokenType getType() { return type; } // NEW
23 |
24 | public static TokenType mapType(SentencepieceModel.ModelProto.SentencePiece.Type t) {
25 | switch (t) {
26 | case NORMAL: return TokenType.TEXT;
27 | case UNKNOWN: return TokenType.UNKNOWN;
28 | case USER_DEFINED: return TokenType.USER_DEFINED;
29 | case CONTROL: return TokenType.CONTROL;
30 | case BYTE: return TokenType.BYTE;
31 | default: return TokenType.TEXT;
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/TrieNode.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 | import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
4 |
5 | public class TrieNode {
6 | private final it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap children = new Int2ObjectOpenHashMap<>();
7 | private boolean isToken;
8 | private TokenType type = TokenType.TEXT;
9 | private int id = -1;
10 | private float score;
11 |
12 |
13 | public TrieNode getOrCreate(int codePoint) {
14 | TrieNode n = children.get(codePoint);
15 | if (n == null) { n = new TrieNode(); children.put(codePoint, n); }
16 | return n;
17 | }
18 |
19 | public TrieNode child(int codePoint) {
20 | return children.get(codePoint);
21 | }
22 |
23 | public void mark(int id, float score, TokenType type) {
24 | this.isToken = true; this.id = id; this.score = score; this.type = type;
25 | }
26 |
27 | public boolean isToken() {
28 | return isToken;
29 | }
30 |
31 | public TokenType getType() {
32 | return type;
33 | }
34 |
35 | public int getId() {
36 | return id;
37 | }
38 |
39 | public float getScore() {
40 | return score;
41 | }
42 |
43 | }
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/SegmentEnd.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 | public class SegmentEnd {
4 | private final TokenType type;
5 | private final int id;
6 | private final float pathScoreSum;
7 | private final int pathSegmentCount;
8 | private final int segmentStart;
9 |
10 | public SegmentEnd(TokenType type, int id, float pathScoreSum, int pathSegmentCount, int segmentStart) {
11 | this.type = type;
12 | this.id = id;
13 | this.pathScoreSum = pathScoreSum;
14 | this.pathSegmentCount = pathSegmentCount;
15 | this.segmentStart = segmentStart;
16 | }
17 |
18 | public float score(Scoring scoring) {
19 | switch (scoring) {
20 | case FEWEST_SEGMENTS:
21 | return 1f / pathSegmentCount * 10_000_000 + pathScoreSum;
22 | case HIGHEST_SCORE:
23 | return pathScoreSum;
24 | default:
25 | throw new IllegalArgumentException("Unknown scoring " + scoring);
26 | }
27 | }
28 |
29 | public float scoreWith(Scoring scoring, float additionalSegmentScore) {
30 | switch (scoring) {
31 | case FEWEST_SEGMENTS:
32 | return 1f / (pathSegmentCount + 1) * 10_000_000 + (pathScoreSum + additionalSegmentScore);
33 | case HIGHEST_SCORE:
34 | return pathScoreSum + additionalSegmentScore;
35 | default:
36 | throw new IllegalArgumentException("Unknown scoring " + scoring);
37 | }
38 | }
39 |
40 | public int getId() {
41 | return id;
42 | }
43 |
44 | public float getPathScoreSum() {
45 | return pathScoreSum;
46 | }
47 |
48 | public int getPathSegmentCount() {
49 | return pathSegmentCount;
50 | }
51 |
52 | public int getSegmentStart() {
53 | return segmentStart;
54 | }
55 | }
--------------------------------------------------------------------------------
/src/test/java/Main.java:
--------------------------------------------------------------------------------
1 | import com.sentencepiece.Model;
2 | import com.sentencepiece.Scoring;
3 | import com.sentencepiece.SentencePieceAlgorithm;
4 |
5 | import java.io.IOException;
6 | import java.nio.file.Paths;
7 | import java.util.List;
8 |
9 | public class Main {
10 | public static void main(String[] args) throws IOException {
11 | Model model = Model.getInstance();
12 | SentencePieceAlgorithm algorithm = new SentencePieceAlgorithm(
13 | true, Scoring.HIGHEST_SCORE
14 | );
15 |
16 | {
17 | String raw = "o captain! my captain! our fearful trip is done," +
18 | "the ship has weather’d every rack, the prize we sought is won," +
19 | "the port is near, the bells i hear, the people all exulting,";
20 | List ids = model.encodeNormalized(raw, algorithm);
21 |
22 | System.out.println("Token IDs: " + ids);
23 | System.out.println("Decoded text: " + model.decodeSmart(ids));
24 | assert (raw.hashCode() == model.decodeSmart(ids).hashCode());
25 | System.out.println("✔ Success");
26 |
27 | }
28 |
29 | //Test Hebrew
30 | {
31 | String raw = "השתיקה יפה לחכמים";
32 | List ids = model.encodeNormalized(raw, algorithm);
33 |
34 | System.out.println("Token IDs: " + ids);
35 | System.out.println("Decoded text: " + model.normalizeHebrew(model.decodeSmart(ids) , true ));
36 | assert (raw.hashCode() == model.decodeSmart(ids).hashCode());
37 | System.out.println("✔ Success");
38 | }
39 |
40 |
41 | //Test Turkish
42 | {
43 | String raw = "kusur bulmak için bakma birine! bakmak için bakarsan bulursun.kusursuz olmayı marifet edin kendine , işte asıl o zaman kusursuz olursun!...";
44 | List ids = model.encodeNormalized(raw, algorithm);
45 |
46 | System.out.println("Token IDs: " + ids);
47 | System.out.println("Decoded text: " + model.decodeSmart(ids));
48 | assert (raw.hashCode() == model.decodeSmart(ids).hashCode());
49 | System.out.println("✔ Success");
50 | }
51 |
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/SentencePieceProcessor.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 |
4 |
5 | import java.io.IOException;
6 | import java.nio.file.Path;
7 | import java.text.Normalizer;
8 | import java.util.*;
9 | import java.util.regex.Pattern;
10 | import java.util.stream.Collectors;
11 |
12 | public class SentencePieceProcessor {
13 |
14 | private final Model model;
15 | private final SentencePieceAlgorithm algorithm;
16 | private final boolean lowercase;
17 |
18 | // Model fields
19 |
20 | public SentencePieceProcessor(Path modelPath) throws IOException {
21 | this.model = Model.parseFrom(modelPath);
22 | this.algorithm = new SentencePieceAlgorithm(true, Scoring.HIGHEST_SCORE);
23 | this.lowercase = false;
24 | }
25 |
26 |
27 | public List encode(String rawText) {
28 | String normalized = normalize(rawText);
29 |
30 | // Add SentencePiece boundary markers (▁)
31 | StringBuilder sb = new StringBuilder();
32 | for (String word : normalized.split(" ")) {
33 | sb.append('▁').append(word);
34 | }
35 | String prepared = sb.toString();
36 |
37 | ResultBuilderImpl builder = new ResultBuilderImpl();
38 | algorithm.segment(prepared, builder, model);
39 | return builder.getTokenIds();
40 | }
41 |
42 | public String decode(List ids) {
43 | return ids.stream()
44 | .map(model::tokenById)
45 | .map(t -> t.startsWith("▁") ? t.substring(1) : t)
46 | .collect(Collectors.joining(" "))
47 | .replaceAll(" +", " ")
48 | .trim();
49 | }
50 |
51 | public String decodeSmart(List ids) {
52 | StringBuilder sb = new StringBuilder();
53 |
54 | for (int id : ids) {
55 | String token = model.tokenById(id);
56 | if (token.startsWith("▁")) {
57 | if (sb.length() > 0) sb.append(' ');
58 | sb.append(token.substring(1));
59 | } else {
60 | sb.append(token);
61 | }
62 | }
63 |
64 | return sb.toString().trim();
65 | }
66 |
67 |
68 | private static final Pattern WS = Pattern.compile("\\p{Z}+");
69 | private String normalize(String input) {
70 | String s = lowercase ? input.toLowerCase() : input;
71 | s = Normalizer.normalize(s, Normalizer.Form.NFKC);
72 | s = s.replaceAll("\\p{Cc}+", ""); // drop control chars
73 | s = WS.matcher(s.trim()).replaceAll(" ");
74 | return s;
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/SentencePieceAlgorithm.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 | public class SentencePieceAlgorithm {
4 |
5 |
6 | private final boolean collapseUnknowns;
7 | private final Scoring scoring;
8 |
9 | public SentencePieceAlgorithm(boolean collapseUnknowns, Scoring scoring) {
10 | this.collapseUnknowns = collapseUnknowns;
11 | this.scoring = scoring;
12 | }
13 |
14 | public void segment(String input, ResultBuilder resultBuilder, Model model) {
15 | SegmentEnd[] segmentEnds = new SegmentEnd[input.length() + 1];
16 | segmentEnds[0] = new SegmentEnd(TokenType.UNKNOWN, 0, 0, 0, 0);
17 |
18 | int start = 0;
19 | while (start < input.length()) {
20 | TrieNode node = model.getRoot();
21 | int pos = start;
22 | while (node != null && pos < input.length()) {
23 | int cp = input.codePointAt(pos);
24 | pos += Character.charCount(cp);
25 | node = node.child(cp);
26 | int length = pos - start;
27 | if (node != null && node.isToken() && node.getType() != TokenType.UNUSED) {
28 | float score = (node.getType() == TokenType.USER_DEFINED)
29 | ? (length * model.getMaxScore() - 0.1f)
30 | : node.getScore();
31 | addSegment(node.getType(), node.getId(), start, pos, score, segmentEnds);
32 | } else if (length == Character.charCount(cp)) {
33 | addSegment(TokenType.UNKNOWN, model.getUnkId() >= 0 ? model.getUnkId() : 0,
34 | start, start + length, model.getMinScore() - 10.0f, segmentEnds);
35 | }
36 | }
37 | start += Character.charCount(input.codePointAt(start));
38 | }
39 | resultBuilder.build(input, segmentEnds, collapseUnknowns);
40 | }
41 |
42 | private void addSegment(TokenType type, int id, int start, int end, float score, SegmentEnd[] segmentEnds) {
43 | if (segmentEnds[end] == null ||
44 | segmentEnds[start].scoreWith(scoring, score) > segmentEnds[end].score(scoring)) {
45 |
46 | segmentEnds[end] = new SegmentEnd(
47 | type, id,
48 | segmentEnds[start].getPathScoreSum() + score,
49 | segmentEnds[start].getPathSegmentCount() + 1,
50 | start
51 | );
52 | }
53 | }
54 |
55 | public interface ResultBuilder {
56 | void build(String input, SegmentEnd[] segmentEnds, boolean collapseUnknowns);
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SentencePiece4J
2 | SentencePiece in pure Java; no JNI required.
3 | Works cross-platform on all operating systems.
4 |
5 | use with :
6 |
7 | # download sentencepiece
8 | wget https://github.com/eix128/SentencePiece4J/raw/refs/heads/main/src/main/resources/models/sentencepiece.bpe.model
9 |
10 | # mvn package
11 | <dependency>
12 |     <groupId>io.github.eix128</groupId>
13 |     <artifactId>sentencepiece4j</artifactId>
14 |     <version>1.0.2</version>
15 | </dependency>
16 |
17 | # Usage
18 | import com.sentencepiece.Model;
19 | import com.sentencepiece.Scoring;
20 | import com.sentencepiece.SentencePieceAlgorithm;
21 |
22 | import java.io.IOException;
23 | import java.nio.file.Paths;
24 | import java.util.List;
25 |
26 | public class Main {
27 | public static void main(String[] args) throws IOException {
28 | Model model = Model.parseFrom(Paths.get("sentencepiece.bpe.model"));
29 | SentencePieceAlgorithm algorithm = new SentencePieceAlgorithm( true, Scoring.HIGHEST_SCORE );
30 |
31 | {
32 | String raw = "o captain! my captain! our fearful trip is done," +
33 | "the ship has weather’d every rack, the prize we sought is won," +
34 | "the port is near, the bells i hear, the people all exulting,";
35 | List ids = model.encodeNormalized(raw, algorithm);
36 |
37 | System.out.println("Token IDs: " + ids);
38 | System.out.println("Decoded text: " + model.decodeSmart(ids));
39 | assert (raw.hashCode() == model.decodeSmart(ids).hashCode());
40 | System.out.println("✔ Success");
41 |
42 | }
43 |
44 | //Test Hebrew
45 | {
46 | String raw = "השתיקה יפה לחכמים";
47 | List ids = model.encodeNormalized(raw, algorithm);
48 |
49 | System.out.println("Token IDs: " + ids);
50 | System.out.println("Decoded text: " + model.normalizeHebrew(model.decodeSmart(ids) , true ));
51 | assert (raw.hashCode() == model.decodeSmart(ids).hashCode());
52 | System.out.println("✔ Success");
53 | }
54 |
55 |
56 | //Test Turkish
57 | {
58 | String raw = "kusur bulmak için bakma birine! bakmak için bakarsan bulursun.kusursuz olmayı marifet edin kendine , işte asıl o zaman kusursuz olursun!...";
59 | List ids = model.encodeNormalized(raw, algorithm);
60 |
61 | System.out.println("Token IDs: " + ids);
62 | System.out.println("Decoded text: " + model.decodeSmart(ids));
63 | assert (raw.hashCode() == model.decodeSmart(ids).hashCode());
64 | System.out.println("✔ Success");
65 | }
66 | }
67 | }
68 |
69 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | io.github.eix128
8 | sentencepiece4j
9 | 1.0.2
10 |
11 |
12 | ${project.groupId}:${project.artifactId}
13 | Java utilities/bindings for SentencePiece (tokenize/encode/decode, BPE/SPM helpers).
14 | https://github.com/eix128/SentencePiece4J
15 |
16 |
17 |
18 | Apache License 2.0
19 | https://www.apache.org/licenses/LICENSE-2.0.txt
20 |
21 |
22 |
23 |
24 |
25 | Kadir BASOL
26 | kadir.bayner@gmail.com
27 | https://github.com/eix128
28 |
29 |
30 |
31 |
32 | https://github.com/eix128/SentencePiece4J
33 | scm:git:https://github.com/eix128/SentencePiece4J.git
34 | scm:git:ssh://github.com/eix128/SentencePiece4J.git
35 |
36 |
37 |
38 |
39 | 8
40 | 8
41 | UTF-8
42 |
43 |
44 | 5.9.3
45 | 1.9.3
46 |
47 | 85D81515741EDCFE6BF86ED41A342E30F64EAA40
48 |
49 |
50 | 3.1.2
51 |
52 |
53 |
54 |
55 | com.google.protobuf
56 | protobuf-java
57 | 4.32.1
58 |
59 |
60 | it.unimi.dsi
61 | fastutil
62 | 8.5.12
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 | maven-source-plugin
73 | 3.3.1
74 |
75 |
76 | attach-sources
77 | jar-no-fork
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 | org.apache.maven.plugins
86 | maven-gpg-plugin
87 | 3.2.4
88 |
89 | signverifysign
90 |
91 |
92 |
93 | --pinentry-modeloopback
94 |
95 |
96 |
97 |
98 |
99 |
100 | maven-javadoc-plugin
101 | 3.6.3
102 |
103 |
104 | attach-javadocs
105 | jar
106 |
107 | 8
108 | false
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 | org.sonatype.central
118 | central-publishing-maven-plugin
119 | 0.9.0
120 | true
121 |
122 | central
123 |
127 |
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/src/main/java/com/sentencepiece/Model.java:
--------------------------------------------------------------------------------
1 | package com.sentencepiece;
2 |
3 |
4 | import com.google.protobuf.InvalidProtocolBufferException;
5 | import sentencepiece.SentencepieceModel;
6 |
7 | import java.io.ByteArrayOutputStream;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.nio.file.Files;
11 | import java.nio.file.Path;
12 | import java.text.Normalizer;
13 | import java.util.ArrayList;
14 | import java.util.HashMap;
15 | import java.util.List;
16 | import java.util.Map;
17 | import java.util.regex.Pattern;
18 | import java.util.stream.Collectors;
19 |
20 | import static com.sentencepiece.Piece.mapType;
21 |
22 |
23 | /**
24 | * Wraps the SentencePiece protobuf model into a Java-friendly structure.
25 | */
26 | public class Model {
27 |
28 | private final Map vocabulary = new HashMap<>();
29 | private final TrieNode root = new TrieNode();
30 | private float maxScore = Float.NEGATIVE_INFINITY;
31 | private float minScore = Float.POSITIVE_INFINITY;
32 |
33 | private final List piecesById = new ArrayList<>();
34 | // NEW: store id (or -1 if absent)
35 | private int unkId = -1;
36 |
37 |
38 | public Model() {
39 |
40 | }
41 |
42 |
43 | private static byte[] toByteArray(InputStream in) throws IOException {
44 | ByteArrayOutputStream buffer = new ByteArrayOutputStream();
45 | byte[] data = new byte[8192];
46 | int nRead;
47 | while ((nRead = in.read(data, 0, data.length)) != -1) {
48 | buffer.write(data, 0, nRead);
49 | }
50 | return buffer.toByteArray();
51 | }
52 |
53 |
54 |
55 |
56 | private static Model init( byte[] bytes ) throws InvalidProtocolBufferException {
57 | SentencepieceModel.ModelProto modelProto = SentencepieceModel.ModelProto.parseFrom(bytes);
58 | Model model = new Model();
59 |
60 | int id = 0;
61 | for (SentencepieceModel.ModelProto.SentencePiece sp : modelProto.getPiecesList()) {
62 | String token = sp.getPiece();
63 | float score = sp.getScore();
64 | model.addPiece(token, id++, score, mapType(sp.getType()));
65 | }
66 |
67 | // determine unkId
68 | if (modelProto.hasTrainerSpec() && modelProto.getTrainerSpec().hasUnkId()) {
69 | model.unkId = modelProto.getTrainerSpec().getUnkId();
70 | } else {
71 | for (Piece p : model.piecesById) {
72 | if (p != null && "".equals(p.getToken())) {
73 | model.unkId = p.getId();
74 | break;
75 | }
76 | }
77 | }
78 | return model;
79 | }
80 |
81 | /**
82 | * @return returns sentencepiece.bpe.model model which is stored inside library
83 | * @throws IOException : when model is not found
84 | */
85 | public static Model getInstance( ) throws IOException {
86 | return parseFromResource("models/sentencepiece.bpe.model");
87 | }
88 |
89 |
90 | public static Model parseFromResource(String resourceName) throws IOException {
91 | try (InputStream in = Model.class.getClassLoader().getResourceAsStream(resourceName)) {
92 | if (in == null) {
93 | throw new IOException("Resource not found: " + resourceName);
94 | }
95 | byte[] bytes = toByteArray(in);
96 | return init(bytes);
97 | }
98 | }
99 |
100 |
101 |
102 | public static Model parseFrom(Path modelPath) throws IOException {
103 | byte[] bytes = Files.readAllBytes(modelPath);
104 | return init(bytes);
105 | }
106 |
107 | public void addPiece(String token, int id, float score, TokenType type) {
108 | Piece piece = new Piece(token, id, score, type);
109 | vocabulary.put(token, piece);
110 |
111 | // ensure O(1) id lookup
112 | while (piecesById.size() <= id) piecesById.add(null);
113 | piecesById.set(id, piece);
114 |
115 | maxScore = Math.max(maxScore, score);
116 | minScore = Math.min(minScore, score);
117 | insertIntoTrie(token, piece);
118 | }
119 |
120 | private void insertIntoTrie(String token, Piece piece) {
121 | TrieNode node = root;
122 | for (int i = 0; i < token.length(); ) {
123 | int cp = token.codePointAt(i);
124 | i += Character.charCount(cp);
125 | node = node.getOrCreate(cp);
126 | }
127 | // If Piece doesn't have getType(), use a default TokenType.TEXT here
128 | node.mark(piece.getId(), piece.getScore(), piece.getType());
129 | }
130 |
131 | public TrieNode getRoot() {
132 | return root;
133 | }
134 |
135 | public float getMaxScore() {
136 | return maxScore;
137 | }
138 |
139 | public float getMinScore() {
140 | return minScore;
141 | }
142 |
143 | public String getTokenById(int id) {
144 | for (Piece p : vocabulary.values()) {
145 | if (p.getId() == id) return p.getToken();
146 | }
147 | return "";
148 | }
149 |
150 | public String tokenById(int id) {
151 | if (id < 0 || id >= piecesById.size()) return "";
152 | Piece piece = piecesById.get(id);
153 | return piece != null ? piece.getToken() : "";
154 | }
155 |
156 | public int getUnkId() { return unkId; }
157 |
158 | public int getIdForToken(String token) {
159 | Piece piece = vocabulary.get(token);
160 | return (piece != null) ? piece.getId() : -1;
161 | }
162 |
163 |
164 | public List encode(String input) {
165 | // Naive space-based token matching for test (improve later with BPE segmenter)
166 | String trimmedInput = input.trim();
167 | String[] tokens = trimmedInput.split("\\s+");
168 | List ids = new ArrayList<>();
169 | for (String t : tokens) {
170 | int id = getIdForToken(t);
171 | if (id != -1) {
172 | ids.add(id);
173 | } else {
174 | System.err.println("Unknown token: " + t);
175 | }
176 | }
177 | return ids;
178 | }
179 |
180 | private static final Pattern pattern = Pattern.compile("\\s+");
181 |
182 | public List encodeNormalized(String rawInput, SentencePieceAlgorithm algorithm) {
183 | // Step 1: Unicode normalization
184 | String normalized = Normalizer.normalize(rawInput, Normalizer.Form.NFKC).toLowerCase();
185 |
186 | // Step 2: Collapse whitespace
187 | normalized = pattern.matcher(normalized.trim()).replaceAll(" ");
188 |
189 | // Step 3: Add '▁' marker before each word
190 | StringBuilder sb = new StringBuilder();
191 | for (String word : normalized.split(" ")) {
192 | sb.append('▁').append(word);
193 | }
194 |
195 | String prepared = sb.toString();
196 |
197 | // Step 4: Segment using algorithm
198 | ResultBuilderImpl builder = new ResultBuilderImpl();
199 | algorithm.segment(prepared, builder, this);
200 | return builder.getTokenIds();
201 | }
202 |
203 |
204 | public String decodeSmart(List ids) {
205 | StringBuilder sb = new StringBuilder();
206 |
207 | for (int id : ids) {
208 | String token = tokenById(id);
209 |
210 | if (token.equals("")) {
211 | sb.append("�"); // or some fallback
212 | continue;
213 | }
214 |
215 | if (token.startsWith("▁")) {
216 | // Word boundary → insert a space if not at the very start
217 | if (sb.length() > 0) sb.append(' ');
218 | sb.append(token.substring(1));
219 | } else {
220 | // Continuation of the current word → append directly
221 | sb.append(token);
222 | }
223 | }
224 |
225 | return sb.toString().trim();
226 | }
227 |
228 | // Choose ONE consistently with training data
229 | private static final Pattern ZS = Pattern.compile("\\p{Z}+");
230 |
231 | // Strip Hebrew diacritics if model expects undiacritized text
232 | private static final Pattern HEBREW_DIACRITICS =
233 | Pattern.compile("[\\u0591-\\u05BD\\u05BF\\u05C1-\\u05C2\\u05C4-\\u05C5\\u05C7]");
234 |
235 | public String normalizeHebrew(String input, boolean stripDiacritics) {
236 | String s = Normalizer.normalize(input, Normalizer.Form.NFC);
237 | if (stripDiacritics) s = HEBREW_DIACRITICS.matcher(s).replaceAll("");
238 | s = ZS.matcher(s.trim()).replaceAll(" ");
239 | return s;
240 | }
241 |
242 | public String decode(List ids) {
243 | return ids.stream()
244 | .map(this::tokenById)
245 | .collect(Collectors.joining(" "));
246 | }
247 | }
248 |
--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | -
6 |
7 |
8 | -
9 |
10 |
11 | -
12 |
13 |
14 | -
15 |
16 |
17 | -
18 |
19 |
20 |
21 |
22 |
23 | -
24 |
25 |
26 |
27 |
28 |
29 | -
30 |
31 |
32 |
33 |
34 |
35 | -
36 |
37 |
38 |
39 |
40 |
41 | -
42 |
43 |
44 |
45 |
46 | -
47 |
48 |
49 |
50 |
51 | -
52 |
53 |
54 |
55 |
56 | -
57 |
58 |
59 |
60 |
61 | -
62 |
63 |
64 |
65 |
66 | -
67 |
68 |
69 |
70 |
71 | -
72 |
73 |
74 | -
75 |
76 |
77 |
78 |
79 | -
80 |
81 |
82 |
83 |
84 | -
85 |
86 |
87 |
88 |
89 | -
90 |
91 |
92 |
93 |
94 | -
95 |
96 |
97 |
98 |
99 | -
100 |
101 |
102 | -
103 |
104 |
105 | -
106 |
107 |
108 | -
109 |
110 |
111 | -
112 |
113 |
114 |
115 |
116 | -
117 |
118 |
119 | -
120 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on whose behalf a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2025 Kadir BASOL
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 | Copyright 2016 Luca Martino.
204 |
205 | Licensed under the Apache License, Version 2.0 (the "License");
206 | you may not use this file except in compliance with the License.
207 | You may obtain a copy of the License at
208 |
209 | http://www.apache.org/licenses/LICENSE-2.0
210 |
211 | Unless required by applicable law or agreed to in writing, software
212 | distributed under the License is distributed on an "AS IS" BASIS,
213 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
214 | See the License for the specific language governing permissions and
215 | limitations under the License.
--------------------------------------------------------------------------------