14 |
// Element type of both the pattern and the text. It is unsigned so that a
// letter can be used directly as a non-negative index into Delta[].
typedef unsigned char CTYPE; // type of alphabet letters

// For large alphabets, such as Unicode, see the Web page above
// for techniques to improve performance

#define ALPHA (256) // alphabet size
#define MAX_PATLEN (100) // maximum pattern length

// Table written by makebetap(); it has one entry for each index 0..m of the
// pattern, hence the extra slot beyond MAX_PATLEN.
int betap[ MAX_PATLEN+1 ];
// Table written by makeDelta(); it has one shift distance per alphabet letter.
int Delta[ ALPHA ];
25 |
/*
 * Reports a single match on standard output.
 *
 * pos is the text position of the match. A running count of the matches
 * reported so far is kept between calls and printed with each one.
 */
void output( int pos ) {
    static int count = 0; /* number of matches reported so far */

    count += 1;
    printf( "match %d found at position %d\n", count, pos );
}
30 |
31 | void makebetap( const CTYPE* p, int m ) {
32 | int i = 0, j = betap[0] = -1;
33 |
34 | while( i < m ) {
35 | while( (j > -1) && (p[i] != p[j]) ) {
36 | j = betap[j];
37 | }
38 | if( p[++i] == p[++j] ) {
39 | betap[i] = betap[j];
40 | } else {
41 | betap[i] = j;
42 | }
43 | }
44 | }
45 |
46 | void makeDelta( const CTYPE* p, int m ) {
47 | int i;
48 |
49 | for( i = 0; i < ALPHA; ++i ) {
50 | Delta[i] = m + 1;
51 | }
52 | for( i = 0; i < m; ++i ) {
53 | Delta[ p[i] ] = m - i;
54 | }
55 | }
56 |
57 | void FJS( const CTYPE* p, int m, const CTYPE* x, int n ) {
58 | if( m < 1 ) return;
59 | makebetap( p, m );
60 | makeDelta( p, m );
61 |
62 | int i = 0, j = 0, mp = m-1, ip = mp;
63 | while( ip < n ) {
64 | if( j <= 0 ) {
65 | while( p[ mp ] != x[ ip ] ) {
66 | ip += Delta[ x[ ip+1 ] ];
67 | if( ip >= n ) return;
68 | }
69 | j = 0;
70 | i = ip - mp;
71 | while( (j < mp) && (x[i] == p[j]) ) {
72 | ++i; ++j;
73 | }
74 | if( j == mp ) {
75 | output( i-mp );
76 | ++i; ++j;
77 | }
78 | if( j <= 0 ) {
79 | ++i;
80 | } else {
81 | j = betap[j];
82 | }
83 | } else {
84 | while( (j < m) && (x[i] == p[j]) ) {
85 | ++i; ++j;
86 | }
87 | if( j == m ) {
88 | output( i-m );
89 | }
90 | j = betap[j];
91 | }
92 | ip = i + mp - j;
93 | }
94 | }
95 |
96 | int main( int argc, char** argv ) {
97 | int m;
98 |
99 | if( argc == 3 ) {
100 | if( (m = strlen( argv[2] )) <= MAX_PATLEN ) {
101 | FJS( (CTYPE*) argv[2], m,
102 | (CTYPE*) argv[1], strlen( argv[1] ) );
103 | } else {
104 | printf( "Recompile with MAX_PATLEN >= %d\n", m );
105 | }
106 | } else {
107 | printf( "Usage: %s text pattern\n", argv[0] );
108 | }
109 | return 0;
110 | }
--------------------------------------------------------------------------------
/java/ca/cgjennings/algo/BruteForceStringSearcher.java:
--------------------------------------------------------------------------------
1 | /* See LICENSE.md for license details (MIT license). */
2 | package ca.cgjennings.algo;
3 |
4 | import java.util.stream.IntStream;
5 |
6 | /**
7 | * A brute force or naïve implementation of {@link StringSearcher}. Brute force
8 | * string search finds all occurrences of a pattern in a text in O(m*n) time.
9 | *
10 | * @author Christopher G. Jennings
11 | */
12 | public final class BruteForceStringSearcher implements StringSearcher {
13 |
14 | /**
15 | * Creates a new {@code StringSearcher} that uses brute force to find matches.
16 | */
17 | public BruteForceStringSearcher() {
18 | }
19 |
20 | @Override
21 | @SuppressWarnings("empty-statement")
22 | public IntStream findAll(CharSequence p, CharSequence x) {
23 | final int m = p.length(), n = x.length();
24 |
25 | final IntStream.Builder stream = IntStream.builder();
26 | int i, j;
27 |
28 | for (j = 0; j <= n - m; ++j) {
29 | for (i = 0; i < m && p.charAt(i) == x.charAt(i + j); ++i);
30 | if (i >= p.length()) {
31 | stream.accept(j);
32 | }
33 | }
34 |
35 | return stream.build();
36 | }
37 | }
--------------------------------------------------------------------------------
/java/ca/cgjennings/algo/FJSStringSearcher.java:
--------------------------------------------------------------------------------
1 | /* See LICENSE.md for license details (MIT license). */
2 | package ca.cgjennings.algo;
3 |
4 | import java.util.Arrays;
5 | import java.util.stream.IntStream;
6 |
7 | /**
8 | * Finds all occurrences of a pattern string in a text string. It is guaranteed
9 | * to run in O(n) time, but usually runs in much less. This implementation uses
10 | * a simple hashing strategy to avoid paying a penalty for Unicode's large
11 | * alphabet size, with the consequence that this implementation will be slower
12 | * for certain large texts with many hash collisions than an unhashed FJS
13 | * implementation (very rare in practice).
14 | *
15 | * @author Christopher G. Jennings
16 | * @see The FJS string matching
17 | * algorithm
18 | */
19 | public final class FJSStringSearcher implements StringSearcher {
20 |
21 | /**
22 | * Creates a new {@code StringSearcher} that uses the FJS algorithm.
23 | */
24 | public FJSStringSearcher() {
25 | // reused since it does not depend on pattern size
26 | delta = new int[ALPHABET_HASH_SIZE];
27 | }
28 |
29 | // The hash size must be a power of 2; typical texts may not see a speedup
30 | // from using FJS if they are around this length or smaller.
31 | private static final int ALPHABET_HASH_SIZE = 128;
32 | private static final int HASH_MASK = ALPHABET_HASH_SIZE - 1;
33 | private final int[] delta;
34 |
35 | @Override
36 | public IntStream findAll(CharSequence p, CharSequence x) {
37 | final int n = x.length();
38 | final int m = p.length();
39 |
40 | if (m == 0) {
41 | return IntStream.rangeClosed(0, n);
42 | }
43 | if (m > n) {
44 | return IntStream.empty();
45 | }
46 |
47 | final int beta[] = makeBeta(p);
48 | @SuppressWarnings("LocalVariableHidesMemberVariable")
49 | final int delta[] = makeDelta(p);
50 | final IntStream.Builder stream = IntStream.builder();
51 |
52 | int mp = m - 1, np = n - 1, i = 0, ip = i + mp, j = 0;
53 |
54 | outer: while (ip < np) {
55 | if (j <= 0) {
56 | while (p.charAt(mp) != x.charAt(ip)) {
57 | ip += delta[x.charAt(ip + 1) & HASH_MASK];
58 | if (ip >= np) {
59 | break outer;
60 | }
61 | }
62 | j = 0;
63 | i = ip - mp;
64 | while ((j < mp) && (x.charAt(i) == p.charAt(j))) {
65 | ++i;
66 | ++j;
67 | }
68 | if (j == mp) {
69 | stream.accept(i - mp);
70 | ++i;
71 | ++j;
72 | }
73 | if (j <= 0) {
74 | ++i;
75 | } else {
76 | j = beta[j];
77 | }
78 | } else {
79 | while ((j < m) && (x.charAt(i) == p.charAt(j))) {
80 | ++i;
81 | ++j;
82 | }
83 | if (j == m) {
84 | stream.accept(i - m);
85 | }
86 | j = beta[j];
87 | }
88 | ip = i + mp - j;
89 | }
90 |
91 | // check final alignment p[0..m-1] == x[n-m..n-1]
92 | if (ip == np) {
93 | if (j < 0) {
94 | j = 0;
95 | }
96 | i = n - m + j;
97 | while (j < m && x.charAt(i) == p.charAt(j)) {
98 | ++i;
99 | ++j;
100 | }
101 | if (j == m) {
102 | stream.accept(n - m);
103 | }
104 | }
105 |
106 | return stream.build();
107 | }
108 |
109 | /**
110 | * Construct the FJS Δ array for the pattern.
111 | *
112 | * @param pattern the search pattern
113 | */
114 | private int[] makeDelta(CharSequence pattern) {
115 | final int m = pattern.length();
116 | @SuppressWarnings("LocalVariableHidesMemberVariable")
117 | final int[] delta = this.delta;
118 |
119 | Arrays.fill(delta, m + 1);
120 | for (int i = 0; i < m; ++i) {
121 | final char ch = pattern.charAt(i);
122 | final int slot = ch & HASH_MASK;
123 | final int jump = m - i;
124 | if (jump < delta[slot]) {
125 | delta[slot] = jump;
126 | }
127 | }
128 | return delta;
129 | }
130 |
131 | /**
132 | * Returns a new FJS β′ array for the pattern.
133 | *
134 | * @param pattern the search pattern
135 | * @return a new β′ array based on the borders of the pattern
136 | */
137 | private int[] makeBeta(CharSequence pattern) {
138 | final int m = pattern.length();
139 | final int[] beta = new int[m + 1];
140 | int i = 0, j = beta[0] = -1;
141 |
142 | while (i < m) {
143 | while ((j > -1) && (pattern.charAt(i) != pattern.charAt(j))) {
144 | j = beta[j];
145 | }
146 |
147 | ++i;
148 | ++j;
149 | if ((i < m) && (pattern.charAt(i) == pattern.charAt(j))) {
150 | beta[i] = beta[j];
151 | } else {
152 | beta[i] = j;
153 | }
154 | }
155 | return beta;
156 | }
157 | }
--------------------------------------------------------------------------------
/java/ca/cgjennings/algo/StringSearcher.java:
--------------------------------------------------------------------------------
1 | /* See LICENSE.md for license details (MIT license). */
2 | package ca.cgjennings.algo;
3 |
4 | import java.util.stream.IntStream;
5 |
/**
 * A common interface for algorithms that search for occurrences of a pattern
 * within a text string. Any {@link CharSequence} can be used for the pattern
 * and text, including {@link String}s.
 *
 * <p>
 * Empty strings are treated as matching at every possible index, including the
 * index after the last character in the text.
 *
 * <p>
 * This interface has been designed so that implementations can easily be made
 * asynchronous, although such support is not provided out of the box.
 *
 * @author Christopher G. Jennings
 */
@FunctionalInterface
public interface StringSearcher {
    /**
     * Finds all matches of the pattern within the text. Each entry in the
     * returned {@code IntStream} is the index of one match. If the pattern does
     * not occur in the text, an empty stream is returned.
     *
     * @param pattern the pattern to search for
     * @param text the text to search within
     * @return a stream of the indices at which matches were found
     */
    IntStream findAll(CharSequence pattern, CharSequence text);
}
--------------------------------------------------------------------------------
/java/ca/cgjennings/algo/StringSearcherTests.java:
--------------------------------------------------------------------------------
1 | /* See LICENSE.md for license details (MIT license). */
2 | package ca.cgjennings.algo;
3 |
4 | import java.util.Arrays;
5 | import java.util.Random;
6 |
7 | /**
8 | * Some simple tests for this FJS algorithm implementation. I have not used a
9 | * testing framework (such as JUnit) to keep things simple: this is meant as
10 | * sample code rather than a library.
11 | *
12 | * @author Christopher G. Jennings
13 | */
14 | public final class StringSearcherTests {
15 |
16 | public static void main(String[] args) {
17 | final String[] tests = new String[] {
18 | "aabxaabxaabxaabx", "aaa", // best case
19 | "aaaaaaaaaaaaaaa", "aba", // worst case
20 | "aabaabaabaxbabxaaabaa", "aabaa",
21 | "aaaaaaaaaaaaaaa", "aaaa",
22 | "baabaabab", "aabab",
23 | "bxaccxa", "bbcc",
24 | "The cat sat on the mat", "at",
25 | "abacadabrabracabracadabrabrabracad", "abracadabra",
26 | "bbbbbbbbbbbbbbb", "a",
27 | "", "",
28 | "", "a",
29 | "a", "",
30 | fibostring(5), fibostring(3),
31 | fibostring(7), fibostring(4),
32 | fibostring( 5 ), fibostring( 1 ),
33 | fibostring(1), fibostring(5),
34 | fibostring(6), fibostring(6),
35 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
36 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
37 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
38 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
39 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
40 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
41 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
42 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
43 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
44 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
45 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
46 | fibostring(rand.nextInt(10)), fibostring(rand.nextInt(5)),
47 | randstring(12), randstring(14),
48 | randstring(12), randstring(13),
49 | randstring(12), randstring(12),
50 | randstring(12), randstring(11),
51 | randstring(12), randstring(10),
52 | randstring(12), randstring(9),
53 | randstring(12), randstring(8),
54 | randstring(12), randstring(7),
55 | randstring(12), randstring(6),
56 | randstring(12), randstring(5),
57 | randstring(12), randstring(4),
58 | randstring(12), randstring(3),
59 | randstring(12), randstring(2),
60 | randstring(12), randstring(1),
61 | randstring(12), "",
62 | randstring(12), randstring(3),
63 | randstring(12), randstring(2),
64 | randstring(12), randstring(1),
65 | randstring(12), randstring(3),
66 | randstring(12), randstring(2),
67 | randstring(12), randstring(1),
68 | randstring(12), randstring(3),
69 | randstring(12), randstring(2),
70 | randstring(12), randstring(1),
71 | randstring(12), randstring(3),
72 | randstring(12), randstring(2),
73 | randstring(12), randstring(1),
74 | randstring(12), randstring(3),
75 | randstring(12), randstring(2),
76 | randstring(12), randstring(1),
77 | randstring(12), randstring(3),
78 | randstring(12), randstring(2),
79 | randstring(12), randstring(1),
80 | randstring(12), randstring(3),
81 | randstring(12), randstring(2),
82 | randstring(12), randstring(1),
83 | randstring(2000), randstring(1),
84 | };
85 | runTests(tests);
86 | }
87 |
88 | private static void runTests(String[] tests) {
89 | int passed = 0;
90 | StringSearcher fjs = new FJSStringSearcher();
91 |
92 | for (int t = 0; t < tests.length;) {
93 | final String text = tests[t++];
94 | final String pattern = tests[t++];
95 |
96 | passed += check(fjs, pattern, text);
97 | }
98 |
99 | System.out.printf("Done, %d of %d tests passed\n", passed, tests.length / 2);
100 | }
101 |
102 | private static int check(StringSearcher toTest, String pattern, String text) {
103 | final int[] a = toTest.findAll(pattern, text).toArray();
104 | final int[] b = new BruteForceStringSearcher().findAll(pattern, text).toArray();
105 |
106 | if (Arrays.equals(a, b)) {
107 | return 1;
108 | }
109 |
110 | System.out.println("Failed:");
111 | System.out.format(" p=\"%s\", t=\"%s\"\n", pattern, text);
112 | System.out.format(" got %s but expected %s\n", Arrays.toString(a), Arrays.toString(b));
113 |
114 | return 0;
115 | }
116 |
117 | /**
118 | * Returns the Fibonacci string of order {@code order}.
119 | */
120 | private static String fibostring(int order) {
121 | if (order == 0) {
122 | return "b";
123 | }
124 |
125 | String previous = fibostring(order - 1);
126 | StringBuilder current = new StringBuilder(previous.length() * 2);
127 |
128 | for (int i = 0; i < previous.length(); ++i) {
129 | switch (previous.charAt(i)) {
130 | case 'a':
131 | current.append("ab");
132 | break;
133 | case 'b':
134 | current.append("a");
135 | break;
136 | }
137 | }
138 | return current.toString();
139 | }
140 |
141 | /**
142 | * Returns a pseudorandom string of length {@code len} consisting of only the
143 | * letters 'a', 'b', and 'c'.
144 | */
145 | private static String randstring(int len) {
146 | final StringBuilder b = new StringBuilder(len);
147 | final char[] abc = new char[] { 'a', 'b', 'c' };
148 | for (int i = 0; i < len; ++i) {
149 | b.append(abc[rand.nextInt(3)]);
150 | }
151 | return b.toString();
152 | }
153 |
154 | /**
155 | * Used to generate deterministic "random" strings for testing.
156 | */
157 | private static final Random rand = new Random(0xf);
158 | }
--------------------------------------------------------------------------------
/readme/images/fjs-animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGJennings/fjs-string-matching/ed4029c582d176d7ec707fe96bc5e254594dd276/readme/images/fjs-animation.gif
--------------------------------------------------------------------------------