├── .editorconfig ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .npmrc ├── .prettierignore ├── funding.yml ├── index.js ├── lib ├── expressions.js ├── index.js └── plugin │ ├── break-implicit-sentences.js │ ├── make-final-white-space-siblings.js │ ├── make-initial-white-space-siblings.js │ ├── merge-affix-exceptions.js │ ├── merge-affix-symbol.js │ ├── merge-final-word-symbol.js │ ├── merge-initial-digit-sentences.js │ ├── merge-initial-lower-case-letter-sentences.js │ ├── merge-initial-word-symbol.js │ ├── merge-initialisms.js │ ├── merge-inner-word-slash.js │ ├── merge-inner-word-symbol.js │ ├── merge-non-word-sentences.js │ ├── merge-prefix-exceptions.js │ ├── merge-remaining-full-stops.js │ ├── patch-position.js │ └── remove-empty-nodes.js ├── license ├── package.json ├── readme.md ├── script ├── build-expressions.js ├── generate-fixture.js └── regenerate-fixtures.js ├── test ├── fixture │ ├── alphabetic-exception-a.json │ ├── alphabetic-exception-b.json │ ├── alphabetic-exception-c.json │ ├── alphabetic-exception-d.json │ ├── alphabetic-exception-e.json │ ├── alphabetic-exception-f.json │ ├── alphabetic-exception-g.json │ ├── alphabetic-exception-h.json │ ├── alphabetic-exception-i.json │ ├── alphabetic-exception-j.json │ ├── alphabetic-exception-k.json │ ├── alphabetic-exception-l.json │ ├── alphabetic-exception-m.json │ ├── alphabetic-exception-n.json │ ├── alphabetic-exception-o.json │ ├── alphabetic-exception-p.json │ ├── alphabetic-exception-q.json │ ├── alphabetic-exception-r.json │ ├── alphabetic-exception-s.json │ ├── alphabetic-exception-t.json │ ├── alphabetic-exception-u.json │ ├── alphabetic-exception-v.json │ ├── alphabetic-exception-w.json │ ├── alphabetic-exception-x.json │ ├── alphabetic-exception-y.json │ ├── alphabetic-exception-z.json │ ├── astral-plane-surrogate-pair.json │ ├── combining-double-breve-below.json │ ├── combining-double-breve.json │ ├── combining-double-inverted-breve.json │ ├── combining-marks-double.json │ ├── 
combining-marks.json │ ├── combining-tie-character.json │ ├── combining-tie-under-inverted.json │ ├── combining-tie-under.json │ ├── digit-letter-combination.json │ ├── digit-only-sentence.json │ ├── domain-name-exception.json │ ├── ellipsis-sentence-end-spaces-padded.json │ ├── ellipsis-sentence-end-spaces.json │ ├── ellipsis-sentence-end-unicode.json │ ├── ellipsis-sentence-end.json │ ├── ellipsis-sentence-start-spaces-padded.json │ ├── ellipsis-sentence-start-spaces.json │ ├── ellipsis-sentence-start-unicode.json │ ├── ellipsis-sentence-start.json │ ├── full-stop-followed-by-closing-parenthesis.json │ ├── full-stop-followed-by-closing-quote.json │ ├── full-stop-followed-by-comma.json │ ├── full-stop-followed-by-digit.json │ ├── grapheme-clusters-double.json │ ├── grapheme-clusters.json │ ├── implicit-sentence-end.json │ ├── initialism-exception.json │ ├── initialism-in-words.json │ ├── initialism-letter-plural.json │ ├── initialism-like-digits.json │ ├── initialism-like-multi-character.json │ ├── initialism-like.json │ ├── initialism-plural.json │ ├── initialism.json │ ├── intelectual-copyright-symbol.json │ ├── intelectual-registered-trademark-symbol.json │ ├── intelectual-service-mark.json │ ├── intelectual-sound-recording-copyright-symbol.json │ ├── intelectual-trademark.json │ ├── latin-exception-al.json │ ├── latin-exception-ca.json │ ├── latin-exception-cap.json │ ├── latin-exception-cca.json │ ├── latin-exception-cent.json │ ├── latin-exception-cf.json │ ├── latin-exception-cit.json │ ├── latin-exception-con.json │ ├── latin-exception-cp.json │ ├── latin-exception-cwt.json │ ├── latin-exception-ead.json │ ├── latin-exception-etc.json │ ├── latin-exception-ff.json │ ├── latin-exception-fl.json │ ├── latin-exception-ibid.json │ ├── latin-exception-id.json │ ├── latin-exception-nem.json │ ├── latin-exception-op.json │ ├── latin-exception-pro.json │ ├── latin-exception-seq.json │ ├── latin-exception-sic.json │ ├── latin-exception-stat.json │ ├── 
latin-exception-tem.json │ ├── latin-exception-viz.json │ ├── letter-digit-combination.json │ ├── lower-case-exception.json │ ├── non-alphabetic-sentence.json │ ├── numerical-exception-0.json │ ├── numerical-exception-1.json │ ├── numerical-exception-11.json │ ├── numerical-exception-111.json │ ├── numerical-exception-2.json │ ├── numerical-exception-3.json │ ├── numerical-exception-4.json │ ├── numerical-exception-5.json │ ├── numerical-exception-6.json │ ├── numerical-exception-7.json │ ├── numerical-exception-8.json │ ├── numerical-exception-9.json │ ├── terminal-marker-comma.json │ ├── terminal-marker-ellipsis.json │ ├── terminal-marker-exclamation-mark.json │ ├── terminal-marker-full-stop.json │ ├── terminal-marker-interrobang.json │ ├── terminal-marker-new-line-multiple.json │ ├── terminal-marker-new-line.json │ ├── terminal-marker-question-mark.json │ ├── terminal-marker-semicolon.json │ ├── trailing-white-space-final-paragraph.json │ ├── trailing-white-space-final-sentence.json │ ├── trailing-white-space-final.json │ ├── trailing-white-space-initial-paragraph.json │ ├── trailing-white-space-initial-sentence.json │ ├── trailing-white-space-initial.json │ ├── two-paragraphs.json │ ├── white-space-only.json │ ├── word-final-dash.json │ ├── word-final-full-stop.json │ ├── word-initial-ampersand.json │ ├── word-inner-ampersand.json │ ├── word-inner-at.json │ ├── word-inner-colon.json │ ├── word-inner-full-stop.json │ ├── word-inner-slash-long.json │ ├── word-inner-slash-no-next.json │ ├── word-inner-slash-short.json │ ├── word-inner-slash.json │ ├── word-inner-underscore.json │ └── word-inner-url.json └── index.js └── tsconfig.json /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 
-------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | on: 3 | - pull_request 4 | - push 5 | jobs: 6 | main: 7 | name: ${{matrix.node}} 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - uses: actions/setup-node@v3 12 | with: 13 | node-version: ${{matrix.node}} 14 | - run: npm install 15 | - run: npm test 16 | - uses: codecov/codecov-action@v3 17 | strategy: 18 | matrix: 19 | node: 20 | - lts/gallium 21 | - node 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.log 3 | coverage/ 4 | node_modules/ 5 | yarn.lock 6 | *.d.ts 7 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | ignore-scripts=true 2 | package-lock=false 3 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | coverage/ 2 | *.md 3 | -------------------------------------------------------------------------------- /funding.yml: -------------------------------------------------------------------------------- 1 | github: wooorm 2 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | export {ParseLatin} from './lib/index.js' 2 | -------------------------------------------------------------------------------- /lib/plugin/break-implicit-sentences.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | * @typedef 
{import('nlcst').Sentence} Sentence 4 | */ 5 | 6 | import {toString} from 'nlcst-to-string' 7 | import {modifyChildren} from 'unist-util-modify-children' 8 | 9 | // Break a sentence if a white space with more than one new-line is found. 10 | export const breakImplicitSentences = modifyChildren( 11 | /** 12 | * @type {import('unist-util-modify-children').Modifier} 13 | */ 14 | 15 | function (child, index, parent) { 16 | if (child.type !== 'SentenceNode') { 17 | return 18 | } 19 | 20 | const children = child.children 21 | 22 | // Ignore first and last child. 23 | let position = 0 24 | 25 | while (++position < children.length - 1) { 26 | const node = children[position] 27 | 28 | if ( 29 | node.type !== 'WhiteSpaceNode' || 30 | toString(node).split(/\r\n|\r|\n/).length < 3 31 | ) { 32 | continue 33 | } 34 | 35 | child.children = children.slice(0, position) 36 | 37 | /** @type {Sentence} */ 38 | const insertion = { 39 | type: 'SentenceNode', 40 | children: children.slice(position + 1) 41 | } 42 | 43 | const tail = children[position - 1] 44 | const head = children[position + 1] 45 | 46 | parent.children.splice(index + 1, 0, node, insertion) 47 | 48 | if (child.position && tail.position && head.position) { 49 | const end = child.position.end 50 | 51 | child.position.end = tail.position.end 52 | 53 | insertion.position = {start: head.position.start, end} 54 | } 55 | 56 | return index + 1 57 | } 58 | } 59 | ) 60 | -------------------------------------------------------------------------------- /lib/plugin/make-final-white-space-siblings.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | * @typedef {import('nlcst').Root} Root 4 | */ 5 | 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | 8 | // Move white space ending a paragraph up, so they are the siblings of 9 | // paragraphs. 
10 | export const makeFinalWhiteSpaceSiblings = modifyChildren( 11 | /** 12 | * @type {import('unist-util-modify-children').Modifier} 13 | */ 14 | 15 | function (child, index, parent) { 16 | if ('children' in child) { 17 | const tail = child.children[child.children.length - 1] 18 | 19 | if (tail && tail.type === 'WhiteSpaceNode') { 20 | child.children.pop() // Remove `tail`. 21 | parent.children.splice(index + 1, 0, tail) 22 | const previous = child.children[child.children.length - 1] 23 | 24 | if (previous && previous.position && child.position) { 25 | child.position.end = previous.position.end 26 | } 27 | 28 | // Next, iterate over the current node again. 29 | return index 30 | } 31 | } 32 | } 33 | ) 34 | -------------------------------------------------------------------------------- /lib/plugin/make-initial-white-space-siblings.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | * @typedef {import('nlcst').Root} Root 4 | */ 5 | 6 | import {visitChildren} from 'unist-util-visit-children' 7 | 8 | // Move white space starting a sentence up, so they are the siblings of 9 | // sentences. 
10 | export const makeInitialWhiteSpaceSiblings = visitChildren( 11 | /** 12 | * @type {import('unist-util-visit-children').Visitor} 13 | */ 14 | function (child, index, parent) { 15 | if ('children' in child && child.children) { 16 | const head = child.children[0] 17 | if (head && head.type === 'WhiteSpaceNode') { 18 | child.children.shift() 19 | parent.children.splice(index, 0, head) 20 | const next = child.children[0] 21 | 22 | if (next && next.position && child.position) { 23 | child.position.start = next.position.start 24 | } 25 | } 26 | } 27 | } 28 | ) 29 | -------------------------------------------------------------------------------- /lib/plugin/merge-affix-exceptions.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | 8 | // Merge a sentence into its previous sentence, when the sentence starts with a 9 | // comma or a semicolon. 10 | export const mergeAffixExceptions = modifyChildren( 11 | /** 12 | * @type {import('unist-util-modify-children').Modifier} 13 | */ 14 | function (child, index, parent) { 15 | const previous = parent.children[index - 1] 16 | 17 | if ( 18 | previous && 19 | 'children' in previous && 20 | 'children' in child && 21 | child.children.length > 0 22 | ) { 23 | let position = -1 24 | 25 | while (child.children[++position]) { 26 | const node = child.children[position] 27 | 28 | if (node.type === 'WordNode') { 29 | return 30 | } 31 | 32 | if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') { 33 | const value = toString(node) 34 | 35 | if (value !== ',' && value !== ';') { 36 | return 37 | } 38 | 39 | previous.children.push(...child.children) 40 | 41 | // Update position. 
42 | if (previous.position && child.position) { 43 | previous.position.end = child.position.end 44 | } 45 | 46 | parent.children.splice(index, 1) 47 | 48 | // Next, iterate over the node *now* at the current position. 49 | return index 50 | } 51 | } 52 | } 53 | } 54 | ) 55 | -------------------------------------------------------------------------------- /lib/plugin/merge-affix-symbol.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | // Closing or final punctuation, or terminal markers that should still be 8 | // included in the previous sentence, even though they follow the sentence’s 9 | // terminal marker. 10 | import {affixSymbol} from '../expressions.js' 11 | 12 | // Move certain punctuation following a terminal marker (thus in the next 13 | // sentence) to the previous sentence. 14 | export const mergeAffixSymbol = modifyChildren( 15 | /** 16 | * @type {import('unist-util-modify-children').Modifier} 17 | */ 18 | function (child, index, parent) { 19 | if ('children' in child && child.children.length > 0 && index > 0) { 20 | const previous = parent.children[index - 1] 21 | const first = child.children[0] 22 | const second = child.children[1] 23 | 24 | if ( 25 | previous && 26 | previous.type === 'SentenceNode' && 27 | (first.type === 'SymbolNode' || first.type === 'PunctuationNode') && 28 | affixSymbol.test(toString(first)) 29 | ) { 30 | child.children.shift() // Remove `first`. 31 | previous.children.push(first) 32 | 33 | // Update position. 34 | if (first.position && previous.position) { 35 | previous.position.end = first.position.end 36 | } 37 | 38 | if (second && second.position && child.position) { 39 | child.position.start = second.position.start 40 | } 41 | 42 | // Next, iterate over the previous node again. 
43 | return index - 1 44 | } 45 | } 46 | } 47 | ) 48 | -------------------------------------------------------------------------------- /lib/plugin/merge-final-word-symbol.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Sentence} Sentence 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | 8 | // Merge certain punctuation marks into their preceding words. 9 | export const mergeFinalWordSymbol = modifyChildren( 10 | /** 11 | * @type {import('unist-util-modify-children').Modifier} 12 | */ 13 | function (child, index, parent) { 14 | if ( 15 | index > 0 && 16 | (child.type === 'SymbolNode' || child.type === 'PunctuationNode') && 17 | toString(child) === '-' 18 | ) { 19 | const children = parent.children 20 | const previous = children[index - 1] 21 | const next = children[index + 1] 22 | 23 | if ( 24 | (!next || next.type !== 'WordNode') && 25 | previous && 26 | previous.type === 'WordNode' 27 | ) { 28 | // Remove `child` from parent. 29 | children.splice(index, 1) 30 | 31 | // Add the punctuation mark at the end of the previous node. 32 | previous.children.push(child) 33 | 34 | // Update position. 35 | if (previous.position && child.position) { 36 | previous.position.end = child.position.end 37 | } 38 | 39 | // Next, iterate over the node *now* at the current position (which was 40 | // the next node). 
41 | return index 42 | } 43 | } 44 | } 45 | ) 46 | -------------------------------------------------------------------------------- /lib/plugin/merge-initial-digit-sentences.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | import {digitStart} from '../expressions.js' 8 | 9 | // Merge a sentence into its previous sentence, when the sentence starts with a 10 | // digit. 11 | export const mergeInitialDigitSentences = modifyChildren( 12 | /** 13 | * @type {import('unist-util-modify-children').Modifier} 14 | */ 15 | function (child, index, parent) { 16 | const previous = parent.children[index - 1] 17 | 18 | if ( 19 | previous && 20 | previous.type === 'SentenceNode' && 21 | child.type === 'SentenceNode' 22 | ) { 23 | const head = child.children[0] 24 | 25 | if (head && head.type === 'WordNode' && digitStart.test(toString(head))) { 26 | previous.children.push(...child.children) 27 | parent.children.splice(index, 1) 28 | 29 | // Update position. 30 | if (previous.position && child.position) { 31 | previous.position.end = child.position.end 32 | } 33 | 34 | // Next, iterate over the node *now* at the current position. 35 | return index 36 | } 37 | } 38 | } 39 | ) 40 | -------------------------------------------------------------------------------- /lib/plugin/merge-initial-lower-case-letter-sentences.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | // Initial lowercase letter. 
8 | import {lowerInitial} from '../expressions.js' 9 | 10 | // Merge a sentence into its previous sentence, when the sentence starts with a 11 | // lower case letter. 12 | export const mergeInitialLowerCaseLetterSentences = modifyChildren( 13 | /** 14 | * @type {import('unist-util-modify-children').Modifier} 15 | */ 16 | function (child, index, parent) { 17 | if (child.type === 'SentenceNode' && index > 0) { 18 | const previous = parent.children[index - 1] 19 | const children = child.children 20 | 21 | if (children.length > 0 && previous.type === 'SentenceNode') { 22 | let position = -1 23 | 24 | while (children[++position]) { 25 | const node = children[position] 26 | 27 | if (node.type === 'WordNode') { 28 | if (!lowerInitial.test(toString(node))) { 29 | return 30 | } 31 | 32 | previous.children.push(...children) 33 | 34 | parent.children.splice(index, 1) 35 | 36 | // Update position. 37 | if (previous.position && child.position) { 38 | previous.position.end = child.position.end 39 | } 40 | 41 | // Next, iterate over the node *now* at the current position. 42 | return index 43 | } 44 | 45 | if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') { 46 | return 47 | } 48 | } 49 | } 50 | } 51 | } 52 | ) 53 | -------------------------------------------------------------------------------- /lib/plugin/merge-initial-word-symbol.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Sentence} Sentence 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | 8 | // Merge certain punctuation marks into their following words. 
9 | export const mergeInitialWordSymbol = modifyChildren( 10 | /** 11 | * @type {import('unist-util-modify-children').Modifier} 12 | */ 13 | function (child, index, parent) { 14 | if ( 15 | (child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') || 16 | toString(child) !== '&' 17 | ) { 18 | return 19 | } 20 | 21 | const children = parent.children 22 | const next = children[index + 1] 23 | 24 | // If either a previous word, or no following word, exists, exit early. 25 | if ( 26 | (index > 0 && children[index - 1].type === 'WordNode') || 27 | !(next && next.type === 'WordNode') 28 | ) { 29 | return 30 | } 31 | 32 | // Remove `child` from parent. 33 | children.splice(index, 1) 34 | 35 | // Add the punctuation mark at the start of the next node. 36 | next.children.unshift(child) 37 | 38 | // Update position. 39 | if (next.position && child.position) { 40 | next.position.start = child.position.start 41 | } 42 | 43 | // Next, iterate over the node at the previous position, as it's now adjacent 44 | // to a following word. 45 | return index - 1 46 | } 47 | ) 48 | -------------------------------------------------------------------------------- /lib/plugin/merge-initialisms.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Sentence} Sentence 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | import {numerical} from '../expressions.js' 8 | 9 | // Merge initialisms. 10 | export const mergeInitialisms = modifyChildren( 11 | /** 12 | * @type {import('unist-util-modify-children').Modifier} 13 | */ 14 | function (child, index, parent) { 15 | if ( 16 | index > 0 && 17 | child.type === 'PunctuationNode' && 18 | toString(child) === '.' 
19 | ) { 20 | const previous = parent.children[index - 1] 21 | 22 | if ( 23 | previous.type === 'WordNode' && 24 | previous.children && 25 | previous.children.length !== 1 && 26 | previous.children.length % 2 !== 0 27 | ) { 28 | let position = previous.children.length 29 | let isAllDigits = true 30 | 31 | while (previous.children[--position]) { 32 | const otherChild = previous.children[position] 33 | 34 | const value = toString(otherChild) 35 | 36 | if (position % 2 === 0) { 37 | // Initialisms consist of one character values. 38 | if (value.length > 1) { 39 | return 40 | } 41 | 42 | if (!numerical.test(value)) { 43 | isAllDigits = false 44 | } 45 | } else if (value !== '.') { 46 | if (position < previous.children.length - 2) { 47 | break 48 | } else { 49 | return 50 | } 51 | } 52 | } 53 | 54 | if (!isAllDigits) { 55 | // Remove `child` from parent. 56 | parent.children.splice(index, 1) 57 | 58 | // Add child to the previous children. 59 | previous.children.push(child) 60 | 61 | // Update position. 62 | if (previous.position && child.position) { 63 | previous.position.end = child.position.end 64 | } 65 | 66 | // Next, iterate over the node *now* at the current position. 67 | return index 68 | } 69 | } 70 | } 71 | } 72 | ) 73 | -------------------------------------------------------------------------------- /lib/plugin/merge-inner-word-slash.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Sentence} Sentence 3 | * @typedef {import('nlcst').SentenceContent} SentenceContent 4 | * @typedef {import('nlcst').WordContent} WordContent 5 | */ 6 | 7 | import {toString} from 'nlcst-to-string' 8 | import {modifyChildren} from 'unist-util-modify-children' 9 | 10 | // Merge words joined by certain punctuation marks. 
11 | export const mergeInnerWordSlash = modifyChildren( 12 | /** 13 | * @type {import('unist-util-modify-children').Modifier} 14 | */ 15 | function (child, index, parent) { 16 | const siblings = parent.children 17 | const previous = siblings[index - 1] 18 | 19 | if ( 20 | previous && 21 | previous.type === 'WordNode' && 22 | (child.type === 'SymbolNode' || child.type === 'PunctuationNode') && 23 | toString(child) === '/' 24 | ) { 25 | const previousValue = toString(previous) 26 | /** @type {SentenceContent} */ 27 | let tail = child 28 | /** @type {Array} */ 29 | const queue = [child] 30 | let count = 1 31 | let nextValue = '' 32 | const next = siblings[index + 1] 33 | 34 | if (next && next.type === 'WordNode') { 35 | nextValue = toString(next) 36 | tail = next 37 | queue.push(...next.children) 38 | count++ 39 | } 40 | 41 | if (previousValue.length < 3 && (!nextValue || nextValue.length < 3)) { 42 | // Add all found tokens to `prev`s children. 43 | previous.children.push(...queue) 44 | 45 | siblings.splice(index, count) 46 | 47 | // Update position. 48 | if (previous.position && tail.position) { 49 | previous.position.end = tail.position.end 50 | } 51 | 52 | // Next, iterate over the node *now* at the current position. 53 | return index 54 | } 55 | } 56 | } 57 | ) 58 | -------------------------------------------------------------------------------- /lib/plugin/merge-inner-word-symbol.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Sentence} Sentence 3 | * @typedef {import('nlcst').WordContent} WordContent 4 | */ 5 | 6 | import {toString} from 'nlcst-to-string' 7 | import {modifyChildren} from 'unist-util-modify-children' 8 | // Symbols part of surrounding words. 9 | import {wordSymbolInner} from '../expressions.js' 10 | 11 | // Merge words joined by certain punctuation marks. 
12 | export const mergeInnerWordSymbol = modifyChildren( 13 | /** 14 | * @type {import('unist-util-modify-children').Modifier} 15 | */ 16 | function (child, index, parent) { 17 | if ( 18 | index > 0 && 19 | (child.type === 'SymbolNode' || child.type === 'PunctuationNode') 20 | ) { 21 | const siblings = parent.children 22 | const previous = siblings[index - 1] 23 | 24 | if (previous && previous.type === 'WordNode') { 25 | let position = index - 1 26 | /** @type {Array} */ 27 | const tokens = [] 28 | /** @type {Array} */ 29 | let queue = [] 30 | 31 | // - If a token which is neither word nor inner word symbol is found, 32 | // the loop is broken 33 | // - If an inner word symbol is found, it’s queued 34 | // - If a word is found, it’s queued (and the queue stored and emptied) 35 | while (siblings[++position]) { 36 | const sibling = siblings[position] 37 | 38 | if (sibling.type === 'WordNode') { 39 | tokens.push(...queue, ...sibling.children) 40 | 41 | queue = [] 42 | } else if ( 43 | (sibling.type === 'SymbolNode' || 44 | sibling.type === 'PunctuationNode') && 45 | wordSymbolInner.test(toString(sibling)) 46 | ) { 47 | queue.push(sibling) 48 | } else { 49 | break 50 | } 51 | } 52 | 53 | if (tokens.length > 0) { 54 | // If there is a queue, remove its length from `position`. 55 | if (queue.length > 0) { 56 | position -= queue.length 57 | } 58 | 59 | // Remove every (one or more) inner-word punctuation marks and children 60 | // of words. 61 | siblings.splice(index, position - index) 62 | 63 | // Add all found tokens to `prev`s children. 64 | previous.children.push(...tokens) 65 | 66 | const last = tokens[tokens.length - 1] 67 | 68 | // Update position. 69 | if (previous.position && last.position) { 70 | previous.position.end = last.position.end 71 | } 72 | 73 | // Next, iterate over the node *now* at the current position. 
74 | return index 75 | } 76 | } 77 | } 78 | } 79 | ) 80 | -------------------------------------------------------------------------------- /lib/plugin/merge-non-word-sentences.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {modifyChildren} from 'unist-util-modify-children' 6 | 7 | // Merge a sentence into the previous sentence (or, when there is none, into the 8 | // following sentence), when the sentence does not contain word tokens. 9 | export const mergeNonWordSentences = modifyChildren( 10 | /** 11 | * @type {import('unist-util-modify-children').Modifier} 12 | */ 13 | function (child, index, parent) { 14 | if ('children' in child) { 15 | let position = -1 16 | 17 | while (child.children[++position]) { 18 | if (child.children[position].type === 'WordNode') { 19 | return 20 | } 21 | } 22 | 23 | const previous = parent.children[index - 1] 24 | 25 | if (previous && 'children' in previous) { 26 | previous.children.push(...child.children) 27 | 28 | // Remove the child. 29 | parent.children.splice(index, 1) 30 | 31 | // Patch position. 32 | if (previous.position && child.position) { 33 | previous.position.end = child.position.end 34 | } 35 | 36 | // Next, iterate over the node *now* at the current position (which was the 37 | // next node). 38 | return index 39 | } 40 | 41 | const next = parent.children[index + 1] 42 | 43 | if (next && 'children' in next) { 44 | next.children.unshift(...child.children) 45 | 46 | // Patch position. 47 | if (next.position && child.position) { 48 | next.position.start = child.position.start 49 | } 50 | 51 | // Remove the child. 
52 | parent.children.splice(index, 1) 53 | } 54 | } 55 | } 56 | ) 57 | -------------------------------------------------------------------------------- /lib/plugin/merge-prefix-exceptions.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | 8 | // Full stop characters that should not be treated as terminal sentence markers: 9 | // A case-insensitive abbreviation. 10 | const abbreviationPrefix = new RegExp( 11 | '^(' + 12 | '[0-9]{1,3}|' + 13 | '[a-z]|' + 14 | // Common Latin Abbreviations: 15 | // Based on: <https://en.wikipedia.org/wiki/List_of_Latin_abbreviations>. 16 | // Where only the abbreviations written without joining full stops, 17 | // but with a final full stop, were extracted. 18 | // 19 | // circa, capitulus, confer, compare, centum weight, eadem, (et) alii, 20 | // et cetera, floruit, foliis, ibidem, idem, nemine && contradicente, 21 | // opere && citato, (per) cent, (per) procurationem, (pro) tempore, 22 | // sic erat scriptum, (et) sequentia, statim, videlicet. */ 23 | 'al|ca|cap|cca|cent|cf|cit|con|cp|cwt|ead|etc|ff|' + 24 | 'fl|ibid|id|nem|op|pro|seq|sic|stat|tem|viz' + 25 | ')$' 26 | ) 27 | 28 | // Merge a sentence into its next sentence, when the sentence ends with a 29 | // certain word. 30 | export const mergePrefixExceptions = modifyChildren( 31 | /** 32 | * @type {import('unist-util-modify-children').Modifier} 33 | */ 34 | function (child, index, parent) { 35 | if ('children' in child && child.children.length > 1) { 36 | const period = child.children[child.children.length - 1] 37 | 38 | if ( 39 | period && 40 | (period.type === 'PunctuationNode' || period.type === 'SymbolNode') && 41 | toString(period) === '.' 
42 | ) { 43 | const node = child.children[child.children.length - 2] 44 | 45 | if ( 46 | node && 47 | node.type === 'WordNode' && 48 | abbreviationPrefix.test(toString(node).toLowerCase()) 49 | ) { 50 | // Merge period into abbreviation. 51 | node.children.push(period) 52 | child.children.pop() 53 | 54 | // Update position. 55 | if (period.position && node.position) { 56 | node.position.end = period.position.end 57 | } 58 | 59 | // Merge sentences. 60 | const next = parent.children[index + 1] 61 | 62 | if (next && next.type === 'SentenceNode') { 63 | child.children.push(...next.children) 64 | parent.children.splice(index + 1, 1) 65 | 66 | // Update position. 67 | if (next.position && child.position) { 68 | child.position.end = next.position.end 69 | } 70 | 71 | // Next, iterate over the current node again. 72 | return index - 1 73 | } 74 | } 75 | } 76 | } 77 | } 78 | ) 79 | -------------------------------------------------------------------------------- /lib/plugin/merge-remaining-full-stops.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | */ 4 | 5 | import {toString} from 'nlcst-to-string' 6 | import {visitChildren} from 'unist-util-visit-children' 7 | // Full stop characters that should not be treated as terminal sentence markers: 8 | // A case-insensitive abbreviation. 9 | import {terminalMarker} from '../expressions.js' 10 | 11 | // Merge non-terminal-marker full stops into the previous word (if available), 12 | // or the next word (if available). 
13 | export const mergeRemainingFullStops = visitChildren( 14 | /** Scan the children of `child` (when it has any) from last to first. 15 | * @type {import('unist-util-visit-children').Visitor} 16 | */ 17 | // eslint-disable-next-line complexity 18 | function (child, _, _parent) { 19 | if ('children' in child) { 20 | let position = child.children.length 21 | let hasFoundDelimiter = false 22 | 23 | while (child.children[--position]) { 24 | const grandchild = child.children[position] 25 | 26 | if ( 27 | grandchild.type !== 'SymbolNode' && 28 | grandchild.type !== 'PunctuationNode' 29 | ) { 30 | // This is a sentence without terminal marker, so we 'fool' the code to 31 | // make it think we have found one. 32 | if (grandchild.type === 'WordNode') { 33 | hasFoundDelimiter = true 34 | } 35 | 36 | continue 37 | } 38 | 39 | // Skip symbols and punctuation that are not terminal markers. 40 | if (!terminalMarker.test(toString(grandchild))) { 41 | continue 42 | } 43 | 44 | // Ignore the first terminal marker found (starting at the end), as it 45 | // should not be merged. 46 | if (!hasFoundDelimiter) { 47 | hasFoundDelimiter = true 48 | continue 49 | } 50 | 51 | // Only merge a single full stop. 52 | if (toString(grandchild) !== '.') { 53 | continue 54 | } 55 | 56 | const previous = child.children[position - 1] 57 | const next = child.children[position + 1] 58 | 59 | if (previous && previous.type === 'WordNode') { 60 | const nextNext = child.children[position + 2] 61 | 62 | // Continue when the full stop is followed by a space and another full 63 | // stop, such as: `{.} .` 64 | if ( 65 | next && 66 | nextNext && 67 | next.type === 'WhiteSpaceNode' && 68 | toString(nextNext) === '.' 69 | ) { 70 | continue 71 | } 72 | 73 | // Remove the full stop from the sentence. 74 | child.children.splice(position, 1) 75 | 76 | // Add the punctuation mark at the end of the previous node. 77 | previous.children.push(grandchild) 78 | 79 | // Update position. 
80 | if (grandchild.position && previous.position) { 81 | previous.position.end = grandchild.position.end 82 | } 83 | /* Skip `previous` in the next iteration: it is the word the full stop was just merged into. */ 84 | position-- 85 | } else if (next && next.type === 'WordNode') { 86 | // Remove the full stop from the sentence. 87 | child.children.splice(position, 1) 88 | 89 | // Add the punctuation mark at the start of the next node. 90 | next.children.unshift(grandchild) 91 | // Move the word's start back to cover the full stop. 92 | if (grandchild.position && next.position) { 93 | next.position.start = grandchild.position.start 94 | } 95 | } 96 | } 97 | } 98 | } 99 | ) 100 | -------------------------------------------------------------------------------- /lib/plugin/patch-position.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('unist').Node} Node 3 | * @typedef {import('nlcst').Paragraph} Paragraph 4 | * @typedef {import('unist').Position} Position 5 | * @typedef {import('nlcst').Root} Root 6 | * @typedef {import('nlcst').Sentence} Sentence 7 | */ 8 | 9 | import {visitChildren} from 'unist-util-visit-children' 10 | 11 | // Patch the position on a parent node based on its first and last child. 12 | export const patchPosition = visitChildren( 13 | /** 14 | * @type {import('unist-util-visit-children').Visitor} 15 | */ 16 | function (child, index, node) { 17 | const siblings = node.children 18 | 19 | if ( 20 | child.position && 21 | index < 1 && 22 | /* c8 ignore next */ 23 | (!node.position || !node.position.start) 24 | ) { 25 | patch(node) 26 | node.position.start = child.position.start 27 | } 28 | 29 | if ( 30 | child.position && 31 | index === siblings.length - 1 && 32 | (!node.position || !node.position.end) 33 | ) { 34 | patch(node) 35 | node.position.end = child.position.end 36 | } 37 | } 38 | ) 39 | 40 | /** 41 | * @param {Node} node 42 | * @returns {asserts node is Node & {position: Position}} 43 | */ 44 | function patch(node) { 45 | if (!node.position) { 46 | // @ts-expect-error: fine, we’ll fill it later. 
47 | node.position = {} 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /lib/plugin/remove-empty-nodes.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | * @typedef {import('nlcst').Root} Root 4 | */ 5 | 6 | import {modifyChildren} from 'unist-util-modify-children' 7 | 8 | // Remove children that are parents with no children of their own. 9 | export const removeEmptyNodes = modifyChildren( 10 | /** 11 | * @type {import('unist-util-modify-children').Modifier} 12 | */ 13 | 14 | function (child, index, parent) { 15 | if ('children' in child && child.children.length === 0) { 16 | parent.children.splice(index, 1) 17 | 18 | // Next, iterate over the node *now* at the current position (which was the 19 | // next node). 20 | return index 21 | } 22 | } 23 | ) 24 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | (The MIT License) 2 | 3 | Copyright (c) 2014 Titus Wormer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "parse-latin", 3 | "version": "7.0.0", 4 | "description": "Latin-script (natural language) parser", 5 | "license": "MIT", 6 | "keywords": [ 7 | "nlcst", 8 | "latin", 9 | "script", 10 | "natural", 11 | "language", 12 | "parser" 13 | ], 14 | "repository": "wooorm/parse-latin", 15 | "bugs": "https://github.com/wooorm/parse-latin/issues", 16 | "funding": { 17 | "type": "github", 18 | "url": "https://github.com/sponsors/wooorm" 19 | }, 20 | "author": "Titus Wormer (https://wooorm.com)", 21 | "contributors": [ 22 | "Titus Wormer (https://wooorm.com)" 23 | ], 24 | "sideEffects": false, 25 | "type": "module", 26 | "exports": "./index.js", 27 | "files": [ 28 | "lib/", 29 | "index.d.ts", 30 | "index.js" 31 | ], 32 | "dependencies": { 33 | "@types/nlcst": "^2.0.0", 34 | "@types/unist": "^3.0.0", 35 | "nlcst-to-string": "^4.0.0", 36 | "unist-util-modify-children": "^4.0.0", 37 | "unist-util-visit-children": "^3.0.0", 38 | "vfile": "^6.0.0" 39 | }, 40 | "devDependencies": { 41 | "@types/node": "^20.0.0", 42 | "@types/regenerate": "^1.0.0", 43 | "@unicode/unicode-15.0.0": "^1.0.0", 44 | "c8": "^8.0.0", 45 | "is-hidden": "^2.0.0", 46 | "nlcst-test": "^4.0.0", 47 | "prettier": "^3.0.0", 48 | "regenerate": "^1.0.0", 49 | "remark-cli": "^11.0.0", 50 | "remark-preset-wooorm": "^9.0.0", 51 | "type-coverage": "^2.0.0", 52 | "typescript": "^5.0.0", 53 | "unist-util-remove-position": "^5.0.0", 54 | "xo": "^0.55.0" 55 | }, 56 | "scripts": { 57 | "prepack": "npm run generate && npm run build && npm run format", 58 | 
"generate": "node script/build-expressions.js", 59 | "build": "tsc --build --clean && tsc --build && type-coverage", 60 | "format": "remark . -qfo && prettier . -w --log-level warn && xo --fix", 61 | "test-api": "node --conditions development test/index.js", 62 | "test-coverage": "c8 --100 --reporter lcov npm run test-api", 63 | "test": "npm run build && npm run format && npm run test-coverage" 64 | }, 65 | "prettier": { 66 | "bracketSpacing": false, 67 | "semi": false, 68 | "singleQuote": true, 69 | "tabWidth": 2, 70 | "trailingComma": "none", 71 | "useTabs": false 72 | }, 73 | "remarkConfig": { 74 | "plugins": [ 75 | "remark-preset-wooorm" 76 | ] 77 | }, 78 | "typeCoverage": { 79 | "atLeast": 100, 80 | "detail": true, 81 | "ignoreCatch": true, 82 | "strict": true 83 | }, 84 | "xo": { 85 | "overrides": [ 86 | { 87 | "files": [ 88 | "script/**/*.js", 89 | "test/**/*.js" 90 | ], 91 | "rules": { 92 | "no-await-in-loop": "off" 93 | } 94 | } 95 | ], 96 | "prettier": true, 97 | "rules": { 98 | "max-depth": "off", 99 | "no-misleading-character-class": "off", 100 | "unicorn/prefer-at": "off" 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # parse-latin 2 | 3 | [![Build][build-badge]][build] 4 | [![Coverage][coverage-badge]][coverage] 5 | [![Downloads][downloads-badge]][downloads] 6 | [![Size][size-badge]][size] 7 | 8 | Natural language parser, for Latin-script languages, that produces [nlcst][]. 
9 | 10 | ## Contents 11 | 12 | * [What is this?](#what-is-this) 13 | * [When should I use this?](#when-should-i-use-this) 14 | * [Install](#install) 15 | * [Use](#use) 16 | * [API](#api) 17 | * [`ParseLatin()`](#parselatin) 18 | * [Algorithm](#algorithm) 19 | * [Types](#types) 20 | * [Compatibility](#compatibility) 21 | * [Security](#security) 22 | * [Related](#related) 23 | * [Contribute](#contribute) 24 | * [License](#license) 25 | 26 | ## What is this? 27 | 28 | This package exposes a parser that takes Latin-script natural language and 29 | produces a syntax tree. 30 | 31 | ## When should I use this? 32 | 33 | If you want to handle natural language as syntax trees manually, use this. 34 | 35 | Alternatively, you can use the retext plugin [`retext-latin`][retext-latin], 36 | which wraps this project to also parse natural language at a higher-level 37 | (easier) abstraction. 38 | 39 | Whether Old-English (“þā gewearþ þǣm hlāforde and þǣm hȳrigmannum wiþ ānum 40 | penninge”), Icelandic (“Hvað er að frétta”), French (“Où sont les toilettes?”), 41 | this project does a good job at tokenizing it. 42 | 43 | For English and Dutch, you can instead use [`parse-english`][parse-english] and 44 | [`parse-dutch`][parse-dutch]. 45 | 46 | You can somewhat use this for Latin-like scripts, such as Cyrillic (“привет”), 47 | Georgian (“გამარჯობა”), Armenian (“Բարեւ”), and such. 48 | 49 | ## Install 50 | 51 | This package is [ESM only][esm]. 
52 | In Node.js (version 16+), install with [npm][]: 53 | 54 | ```sh 55 | npm install parse-latin 56 | ``` 57 | 58 | In Deno with [`esm.sh`][esmsh]: 59 | 60 | ```js 61 | import {ParseLatin} from 'https://esm.sh/parse-latin@7' 62 | ``` 63 | 64 | In browsers with [`esm.sh`][esmsh]: 65 | 66 | ```html 67 | <script type="module"> 68 | import {ParseLatin} from 'https://esm.sh/parse-latin@7?bundle' 69 | </script> 70 | ``` 71 | 72 | ## Use 73 | 74 | ```js 75 | import {ParseLatin} from 'parse-latin' 76 | import {inspect} from 'unist-util-inspect' 77 | 78 | const tree = new ParseLatin().parse('A simple sentence.') 79 | 80 | console.log(inspect(tree)) 81 | ``` 82 | 83 | Yields: 84 | 85 | ```txt 86 | RootNode[1] (1:1-1:19, 0-18) 87 | └─0 ParagraphNode[1] (1:1-1:19, 0-18) 88 | └─0 SentenceNode[6] (1:1-1:19, 0-18) 89 | ├─0 WordNode[1] (1:1-1:2, 0-1) 90 | │ └─0 TextNode "A" (1:1-1:2, 0-1) 91 | ├─1 WhiteSpaceNode " " (1:2-1:3, 1-2) 92 | ├─2 WordNode[1] (1:3-1:9, 2-8) 93 | │ └─0 TextNode "simple" (1:3-1:9, 2-8) 94 | ├─3 WhiteSpaceNode " " (1:9-1:10, 8-9) 95 | ├─4 WordNode[1] (1:10-1:18, 9-17) 96 | │ └─0 TextNode "sentence" (1:10-1:18, 9-17) 97 | └─5 PunctuationNode "." (1:18-1:19, 17-18) 98 | ``` 99 | 100 | ## API 101 | 102 | This package exports the identifier [`ParseLatin`][api-parse-latin]. 103 | There is no default export. 104 | 105 | ### `ParseLatin()` 106 | 107 | Create a new parser. 108 | 109 | #### `ParseLatin#parse(value)` 110 | 111 | Turn natural language into a syntax tree. 112 | 113 | ###### Parameters 114 | 115 | * `value` (`string`, optional) 116 | — value to parse 117 | 118 | ###### Returns 119 | 120 | Tree ([`RootNode`][root]). 121 | 122 | ## Algorithm 123 | 124 | > 👉 **Note**: 125 | > The easiest way to see how `parse-latin` parses, is by using the 126 | > [online parser demo][demo], which shows the syntax tree corresponding to 127 | > the typed text. 
128 | 129 | `parse-latin` splits text into white space, punctuation, symbol, and word 130 | tokens: 131 | 132 | * “word” is one or more unicode letters or numbers 133 | * “white space” is one or more unicode white space characters 134 | * “punctuation” is one or more unicode punctuation characters 135 | * “symbol” is one or more of anything else 136 | 137 | Then, it manipulates and merges those tokens into a syntax tree, adding 138 | sentences and paragraphs where needed. 139 | 140 | * some punctuation marks are part of the word they occur in, such as 141 | `non-profit`, `she’s`, `G.I.`, `11:00`, `N/A`, `&c`, `nineteenth- and…` 142 | * some periods do not mark a sentence end, such as `1.`, `e.g.`, `id.` 143 | * although periods, question marks, and exclamation marks (sometimes) end a 144 | sentence, that end might not occur directly after the mark, such as `.)`, 145 | `."` 146 | * …and many more exceptions 147 | 148 | ## Types 149 | 150 | This package is fully typed with [TypeScript][]. 151 | It exports no additional types. 152 | 153 | ## Compatibility 154 | 155 | Projects maintained by me are compatible with maintained versions of Node.js. 156 | 157 | When I cut a new major release, I drop support for unmaintained versions of 158 | Node. 159 | This means I try to keep the current release line, `parse-latin@^7`, compatible 160 | with Node.js 16. 161 | 162 | ## Security 163 | 164 | This package is safe. 165 | 166 | ## Related 167 | 168 | * [`parse-english`](https://github.com/wooorm/parse-english) 169 | — English (natural language) parser 170 | * [`parse-dutch`](https://github.com/wooorm/parse-dutch) 171 | — Dutch (natural language) parser 172 | 173 | ## Contribute 174 | 175 | Yes please! 176 | See [How to Contribute to Open Source][contribute]. 
177 | 178 | ## License 179 | 180 | [MIT][license] © [Titus Wormer][author] 181 | 182 | 183 | 184 | [build-badge]: https://github.com/wooorm/parse-latin/workflows/main/badge.svg 185 | 186 | [build]: https://github.com/wooorm/parse-latin/actions 187 | 188 | [coverage-badge]: https://img.shields.io/codecov/c/github/wooorm/parse-latin.svg 189 | 190 | [coverage]: https://codecov.io/github/wooorm/parse-latin 191 | 192 | [downloads-badge]: https://img.shields.io/npm/dm/parse-latin.svg 193 | 194 | [downloads]: https://www.npmjs.com/package/parse-latin 195 | 196 | [size-badge]: https://img.shields.io/badge/dynamic/json?label=minzipped%20size&query=$.size.compressedSize&url=https://deno.bundlejs.com/?q=parse-latin 197 | 198 | [size]: https://bundlejs.com/?q=parse-latin 199 | 200 | [npm]: https://docs.npmjs.com/cli/install 201 | 202 | [demo]: https://wooorm.com/parse-latin/ 203 | 204 | [esm]: https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c 205 | 206 | [esmsh]: https://esm.sh 207 | 208 | [typescript]: https://www.typescriptlang.org 209 | 210 | [contribute]: https://opensource.guide/how-to-contribute/ 211 | 212 | [license]: license 213 | 214 | [author]: https://wooorm.com 215 | 216 | [nlcst]: https://github.com/syntax-tree/nlcst 217 | 218 | [root]: https://github.com/syntax-tree/nlcst#root 219 | 220 | [retext-latin]: https://github.com/retextjs/retext/tree/main/packages/retext-latin 221 | 222 | [parse-english]: https://github.com/wooorm/parse-english 223 | 224 | [parse-dutch]: https://github.com/wooorm/parse-dutch 225 | 226 | [api-parse-latin]: #parselatin 227 | -------------------------------------------------------------------------------- /script/build-expressions.js: -------------------------------------------------------------------------------- 1 | // To do: next major: use modern regex classes? 
2 | import fs from 'node:fs' 3 | /** @type {{default: Array}} */ 4 | // @ts-expect-error 5 | import BPWhiteSpace from '@unicode/unicode-15.0.0/Binary_Property/White_Space/code-points.js' 6 | /** @type {{default: Array}} */ 7 | // @ts-expect-error 8 | import combiningDiacriticalMarks from '@unicode/unicode-15.0.0/Block/Combining_Diacritical_Marks/code-points.js' 9 | /** @type {{default: Array}} */ 10 | // @ts-expect-error 11 | import L from '@unicode/unicode-15.0.0/General_Category/Letter/code-points.js' 12 | /** @type {{default: Array}} */ 13 | // @ts-expect-error 14 | import Ll from '@unicode/unicode-15.0.0/General_Category/Lowercase_Letter/code-points.js' 15 | /** @type {{default: Array}} */ 16 | // @ts-expect-error 17 | import M from '@unicode/unicode-15.0.0/General_Category/Mark/code-points.js' 18 | /** @type {{default: Array}} */ 19 | // @ts-expect-error 20 | import N from '@unicode/unicode-15.0.0/General_Category/Number/code-points.js' 21 | /** @type {{default: Array}} */ 22 | // @ts-expect-error 23 | import Pc from '@unicode/unicode-15.0.0/General_Category/Connector_Punctuation/code-points.js' 24 | /** @type {{default: Array}} */ 25 | // @ts-expect-error 26 | import Pd from '@unicode/unicode-15.0.0/General_Category/Dash_Punctuation/code-points.js' 27 | /** @type {{default: Array}} */ 28 | // @ts-expect-error 29 | import Pe from '@unicode/unicode-15.0.0/General_Category/Close_Punctuation/code-points.js' 30 | /** @type {{default: Array}} */ 31 | // @ts-expect-error 32 | import Pf from '@unicode/unicode-15.0.0/General_Category/Final_Punctuation/code-points.js' 33 | /** @type {{default: Array}} */ 34 | // @ts-expect-error 35 | import Pi from '@unicode/unicode-15.0.0/General_Category/Initial_Punctuation/code-points.js' 36 | /** @type {{default: Array}} */ 37 | // @ts-expect-error 38 | import Po from '@unicode/unicode-15.0.0/General_Category/Other_Punctuation/code-points.js' 39 | /** @type {{default: Array}} */ 40 | // @ts-expect-error 41 | import Ps from 
'@unicode/unicode-15.0.0/General_Category/Open_Punctuation/code-points.js' 42 | import regenerate from 'regenerate' 43 | 44 | const combiningDiacriticalMark = regenerate().add(combiningDiacriticalMarks) 45 | 46 | const combiningNonspacingMark = regenerate().add(M) 47 | 48 | const letter = regenerate().add(L) 49 | 50 | const letterLower = regenerate().add(Ll) 51 | 52 | const numerical = regenerate().add(N) 53 | 54 | const punctuation = regenerate() 55 | .add(Pc) 56 | .add(Pd) 57 | .add(Pe) 58 | .add(Pf) 59 | .add(Pi) 60 | .add(Po) 61 | .add(Ps) 62 | 63 | // Remove a few weirdly-classified symbols: 64 | // 65 | .remove('#') 66 | .remove('&') 67 | .remove('@') 68 | .remove('%') 69 | .remove('‰') 70 | .remove('‱') 71 | .remove('*') 72 | .remove('†') 73 | .remove('‡') 74 | .remove('※') 75 | 76 | const punctuationClosing = regenerate().add(Pe) 77 | 78 | const punctuationFinal = regenerate().add(Pf).add('"').add("'") 79 | 80 | const whiteSpace = regenerate().add(BPWhiteSpace) 81 | 82 | const word = regenerate() 83 | .add(combiningDiacriticalMark) 84 | .add(combiningNonspacingMark) 85 | .add(letter) 86 | .add(numerical) 87 | 88 | const terminalMarker = regenerate() 89 | .add('.') 90 | .add(0x20_3d) // Interrobang 91 | .add('?') 92 | .add('!') 93 | .add(0x20_26) // Horizontal ellipsis 94 | 95 | // Symbols part of surrounding words. 96 | const wordSymbolInner = regenerate() 97 | .add('-') 98 | .add('@') 99 | .add('?') 100 | .add('=') 101 | .add('.') 102 | .add(':') 103 | .add("'") 104 | .add('&') 105 | .add(0x20_19) // Right single quote 106 | .add(0x00_ad) // Soft hyphen 107 | .add(0x00_b7) // Middle dot 108 | .add(0x20_10) // Hyphen 109 | .add(0x20_11) // Non-breaking hyphen 110 | .add(0x20_27) // Hyphenation point 111 | 112 | // Symbols which can occur multiple times and still be part of surrounding 113 | // words. 
114 | const wordSymbolInnerMulti = regenerate().add('_') 115 | 116 | // Match closing or final punctuation, or terminal markers that should still be 117 | // included in the previous sentence, even though they follow the sentence’s 118 | // terminal marker. 119 | const reAffixSymbol = new RegExp( 120 | '^(' + 121 | punctuationClosing + 122 | '|' + 123 | punctuationFinal + 124 | '|' + 125 | terminalMarker + 126 | ')\\1*$' 127 | ) 128 | 129 | // Match one or more line endings, each optionally surrounded by spaces or tabs. 130 | const reNewLine = /^[ \t]*((\r?\n|\r)[\t ]*)+$/ 131 | 132 | // Match sentence-ending markers. 133 | const reTerminalMarker = new RegExp('^((?:' + terminalMarker + ')+)$') 134 | 135 | // Match punctuation marks part of surrounding words. 136 | const reWordSymbolInner = new RegExp( 137 | '^(' + 138 | '(?:' + 139 | wordSymbolInner + 140 | ')' + 141 | '|' + 142 | '(?:' + 143 | wordSymbolInnerMulti + 144 | ')+' + 145 | ')$' 146 | ) 147 | 148 | // Match punctuation marks. 149 | const rePunctuation = new RegExp(String(punctuation)) 150 | 151 | // Match numbers. 152 | const reNumerical = new RegExp('^(?:' + numerical + ')+$') 153 | 154 | // Match initial digit. 155 | const reDigitStart = /^\d/ 156 | 157 | // Match initial lowercase letter. 158 | const reLowerInitial = new RegExp('^(?:' + letterLower + ')') 159 | 160 | // Match a UTF-16 surrogate code unit (one half of an astral-plane character). 161 | const reSurrogates = /[\uD800-\uDFFF]/ 162 | 163 | // Match a word. 164 | const reWord = new RegExp(String(word)) 165 | 166 | // Match white space. 
167 | const reWhiteSpace = new RegExp(String(whiteSpace)) 168 | 169 | fs.writeFileSync( 170 | new URL('../lib/expressions.js', import.meta.url), 171 | [ 172 | '// This module is generated by `script/build-expressions.js`.', 173 | 'export const affixSymbol = ' + reAffixSymbol, 174 | 'export const newLine = ' + reNewLine, 175 | 'export const terminalMarker = ' + reTerminalMarker, 176 | 'export const wordSymbolInner = ' + reWordSymbolInner, 177 | 'export const numerical = ' + reNumerical, 178 | 'export const digitStart = ' + reDigitStart, 179 | 'export const lowerInitial = ' + reLowerInitial, 180 | 'export const surrogates = ' + reSurrogates, 181 | 'export const punctuation = ' + rePunctuation, 182 | 'export const word = ' + reWord, 183 | 'export const whiteSpace = ' + reWhiteSpace, 184 | '' 185 | ].join('\n') 186 | ) 187 | -------------------------------------------------------------------------------- /script/generate-fixture.js: -------------------------------------------------------------------------------- 1 | import fs from 'node:fs/promises' 2 | import process from 'node:process' 3 | import {ParseLatin} from 'parse-latin' 4 | /* Write a new fixture: parse a document and store the tree as `test/fixture/<name>.json`. */ 5 | const parser = new ParseLatin() 6 | /* Command-line arguments from index 2 onward: name, document, optional method. */ 7 | const parameters = process.argv.splice(2) 8 | /* NOTE(review): the usage text names `npm run fixture`, but package.json defines no such script; confirm how this is meant to be invoked. */ 9 | if (parameters.length < 2) { 10 | console.log('Usage:') 11 | console.log(' npm run fixture name document [method]') 12 | } else { 13 | const basename = parameters[0] 14 | const functionName = parameters[2] || 'parse' 15 | 16 | if ( 17 | functionName !== 'parse' && 18 | functionName !== 'tokenizeParagraph' && 19 | functionName !== 'tokenizeRoot' && 20 | functionName !== 'tokenizeSentence' 21 | ) { 22 | throw new Error('Expected valid function name') 23 | } 24 | 25 | const nlcst = parser[functionName](parameters[1]) 26 | 27 | await fs.writeFile( 28 | new URL('../test/fixture/' + basename + '.json', import.meta.url), 29 | JSON.stringify(nlcst, undefined, 2) + '\n' 30 | ) 31 | 32 | console.log('Wrote `' + basename + '`') 33 | } 34 | 
-------------------------------------------------------------------------------- /script/regenerate-fixtures.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @typedef {import('nlcst').Paragraph} Paragraph 3 | * @typedef {import('nlcst').Root} Root 4 | * @typedef {import('nlcst').Sentence} Sentence 5 | */ 6 | 7 | import fs from 'node:fs/promises' 8 | import {isHidden} from 'is-hidden' 9 | import {toString} from 'nlcst-to-string' 10 | import {ParseLatin} from 'parse-latin' 11 | /* Regenerate every non-hidden file in `test/fixture/`: re-parse its text and overwrite the file. */ 12 | const root = new URL('../test/fixture/', import.meta.url) 13 | const english = new ParseLatin() 14 | 15 | const files = await fs.readdir(root) 16 | const applicable = files.filter(function (d) { 17 | return !isHidden(d) 18 | }) 19 | let index = -1 20 | /* The tokenizer is chosen from the stored tree's type: the part before `Node`, prefixed with `tokenize`. */ 21 | while (++index < applicable.length) { 22 | const url = new URL(applicable[index], root) 23 | const doc = String(await fs.readFile(url)) 24 | /** @type {Paragraph | Root | Sentence} */ 25 | const tree = JSON.parse(doc) 26 | const name = /** @type {'Paragraph' | 'Root' | 'Sentence'} */ ( 27 | tree.type.slice(0, tree.type.indexOf('Node')) 28 | ) 29 | const nlcst = english[`tokenize${name}`](toString(tree)) 30 | 31 | await fs.writeFile(url, JSON.stringify(nlcst, undefined, 2) + '\n') 32 | } 33 | -------------------------------------------------------------------------------- /test/fixture/combining-double-breve.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Such", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 5, 25 | "offset": 4 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | 
"column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 5, 39 | "offset": 4 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 5, 50 | "offset": 4 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 6, 55 | "offset": 5 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "as", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 6, 69 | "offset": 5 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 8, 74 | "offset": 7 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 6, 83 | "offset": 5 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 8, 88 | "offset": 7 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 8, 99 | "offset": 7 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 9, 104 | "offset": 8 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "the", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 9, 118 | "offset": 8 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 12, 123 | "offset": 11 124 | } 125 | } 126 | } 127 | ], 128 | "position": { 129 | "start": { 130 | "line": 1, 131 | "column": 9, 132 | "offset": 8 133 | }, 134 | "end": { 135 | "line": 1, 136 | "column": 12, 137 | "offset": 11 138 | } 139 | } 140 | }, 141 | { 142 | "type": "WhiteSpaceNode", 143 | "value": " ", 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 12, 148 | "offset": 11 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 13, 153 | "offset": 12 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WordNode", 159 | "children": [ 160 | { 161 | "type": "TextNode", 162 | "value": "o͝o", 163 | "position": { 164 | "start": { 165 
| "line": 1, 166 | "column": 13, 167 | "offset": 12 168 | }, 169 | "end": { 170 | "line": 1, 171 | "column": 16, 172 | "offset": 15 173 | } 174 | } 175 | } 176 | ], 177 | "position": { 178 | "start": { 179 | "line": 1, 180 | "column": 13, 181 | "offset": 12 182 | }, 183 | "end": { 184 | "line": 1, 185 | "column": 16, 186 | "offset": 15 187 | } 188 | } 189 | }, 190 | { 191 | "type": "PunctuationNode", 192 | "value": ".", 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 16, 197 | "offset": 15 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 17, 202 | "offset": 16 203 | } 204 | } 205 | } 206 | ], 207 | "position": { 208 | "start": { 209 | "line": 1, 210 | "column": 1, 211 | "offset": 0 212 | }, 213 | "end": { 214 | "line": 1, 215 | "column": 17, 216 | "offset": 16 217 | } 218 | } 219 | } 220 | ], 221 | "position": { 222 | "start": { 223 | "line": 1, 224 | "column": 1, 225 | "offset": 0 226 | }, 227 | "end": { 228 | "line": 1, 229 | "column": 17, 230 | "offset": 16 231 | } 232 | } 233 | } 234 | ], 235 | "position": { 236 | "start": { 237 | "line": 1, 238 | "column": 1, 239 | "offset": 0 240 | }, 241 | "end": { 242 | "line": 1, 243 | "column": 17, 244 | "offset": 16 245 | } 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /test/fixture/combining-marks-double.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "He", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 3, 25 | "offset": 2 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 
| "line": 1, 38 | "column": 3, 39 | "offset": 2 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 3, 50 | "offset": 2 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 4, 55 | "offset": 3 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "scored", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 4, 69 | "offset": 3 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 10, 74 | "offset": 9 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 4, 83 | "offset": 3 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 10, 88 | "offset": 9 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 10, 99 | "offset": 9 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 11, 104 | "offset": 10 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "0️⃣", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 11, 118 | "offset": 10 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 14, 123 | "offset": 13 124 | } 125 | } 126 | } 127 | ], 128 | "position": { 129 | "start": { 130 | "line": 1, 131 | "column": 11, 132 | "offset": 10 133 | }, 134 | "end": { 135 | "line": 1, 136 | "column": 14, 137 | "offset": 13 138 | } 139 | } 140 | }, 141 | { 142 | "type": "WhiteSpaceNode", 143 | "value": " ", 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 14, 148 | "offset": 13 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 15, 153 | "offset": 14 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WordNode", 159 | "children": [ 160 | { 161 | "type": "TextNode", 162 | "value": "points", 163 | "position": { 164 | "start": { 165 | "line": 1, 166 | "column": 15, 167 | 
"offset": 14 168 | }, 169 | "end": { 170 | "line": 1, 171 | "column": 21, 172 | "offset": 20 173 | } 174 | } 175 | } 176 | ], 177 | "position": { 178 | "start": { 179 | "line": 1, 180 | "column": 15, 181 | "offset": 14 182 | }, 183 | "end": { 184 | "line": 1, 185 | "column": 21, 186 | "offset": 20 187 | } 188 | } 189 | }, 190 | { 191 | "type": "PunctuationNode", 192 | "value": ".", 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 21, 197 | "offset": 20 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 22, 202 | "offset": 21 203 | } 204 | } 205 | } 206 | ], 207 | "position": { 208 | "start": { 209 | "line": 1, 210 | "column": 1, 211 | "offset": 0 212 | }, 213 | "end": { 214 | "line": 1, 215 | "column": 22, 216 | "offset": 21 217 | } 218 | } 219 | } 220 | ], 221 | "position": { 222 | "start": { 223 | "line": 1, 224 | "column": 1, 225 | "offset": 0 226 | }, 227 | "end": { 228 | "line": 1, 229 | "column": 22, 230 | "offset": 21 231 | } 232 | } 233 | } 234 | ], 235 | "position": { 236 | "start": { 237 | "line": 1, 238 | "column": 1, 239 | "offset": 0 240 | }, 241 | "end": { 242 | "line": 1, 243 | "column": 22, 244 | "offset": 21 245 | } 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /test/fixture/combining-marks.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Ångström", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 11, 25 | "offset": 10 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 11, 39 | 
"offset": 10 40 | } 41 | } 42 | }, 43 | { 44 | "type": "PunctuationNode", 45 | "value": ".", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 11, 50 | "offset": 10 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 12, 55 | "offset": 11 56 | } 57 | } 58 | } 59 | ], 60 | "position": { 61 | "start": { 62 | "line": 1, 63 | "column": 1, 64 | "offset": 0 65 | }, 66 | "end": { 67 | "line": 1, 68 | "column": 12, 69 | "offset": 11 70 | } 71 | } 72 | } 73 | ], 74 | "position": { 75 | "start": { 76 | "line": 1, 77 | "column": 1, 78 | "offset": 0 79 | }, 80 | "end": { 81 | "line": 1, 82 | "column": 12, 83 | "offset": 11 84 | } 85 | } 86 | } 87 | ], 88 | "position": { 89 | "start": { 90 | "line": 1, 91 | "column": 1, 92 | "offset": 0 93 | }, 94 | "end": { 95 | "line": 1, 96 | "column": 12, 97 | "offset": 11 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /test/fixture/combining-tie-under.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "The", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 4, 25 | "offset": 3 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 4, 39 | "offset": 3 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 4, 50 | "offset": 3 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 5, 55 | "offset": 4 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": 
"TextNode", 64 | "value": "undertie", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 5, 69 | "offset": 4 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 13, 74 | "offset": 12 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 5, 83 | "offset": 4 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 13, 88 | "offset": 12 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 13, 99 | "offset": 12 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 14, 104 | "offset": 13 105 | } 106 | } 107 | }, 108 | { 109 | "type": "PunctuationNode", 110 | "value": "/", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 14, 115 | "offset": 13 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 15, 120 | "offset": 14 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WordNode", 126 | "children": [ 127 | { 128 | "type": "TextNode", 129 | "value": "vuz", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 15, 134 | "offset": 14 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 18, 139 | "offset": 17 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 15, 148 | "offset": 14 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 18, 153 | "offset": 17 154 | } 155 | } 156 | }, 157 | { 158 | "type": "PunctuationNode", 159 | "value": "‿", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 18, 164 | "offset": 17 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 19, 169 | "offset": 18 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "ave", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 19, 183 | "offset": 18 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 22, 188 | "offset": 21 189 | } 190 | } 191 | } 
192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 19, 197 | "offset": 18 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 22, 202 | "offset": 21 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": "/", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 22, 213 | "offset": 21 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 23, 218 | "offset": 22 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 23, 232 | "offset": 22 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 23, 246 | "offset": 22 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 23, 260 | "offset": 22 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/digit-only-sentence.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "123456", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 7, 25 | "offset": 6 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 7, 39 | "offset": 6 40 | } 41 | } 42 | } 43 | ], 44 | "position": { 45 | "start": { 46 | "line": 1, 47 | "column": 1, 48 | 
"offset": 0 49 | }, 50 | "end": { 51 | "line": 1, 52 | "column": 7, 53 | "offset": 6 54 | } 55 | } 56 | } 57 | ], 58 | "position": { 59 | "start": { 60 | "line": 1, 61 | "column": 1, 62 | "offset": 0 63 | }, 64 | "end": { 65 | "line": 1, 66 | "column": 7, 67 | "offset": 6 68 | } 69 | } 70 | } 71 | ], 72 | "position": { 73 | "start": { 74 | "line": 1, 75 | "column": 1, 76 | "offset": 0 77 | }, 78 | "end": { 79 | "line": 1, 80 | "column": 7, 81 | "offset": 6 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /test/fixture/ellipsis-sentence-end-spaces.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "To", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 3, 25 | "offset": 2 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 3, 39 | "offset": 2 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 3, 50 | "offset": 2 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 4, 55 | "offset": 3 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "be", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 4, 69 | "offset": 3 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 6, 74 | "offset": 5 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 4, 83 | "offset": 3 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 6, 88 | 
"offset": 5 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 6, 99 | "offset": 5 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 7, 104 | "offset": 6 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "continued", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 7, 118 | "offset": 6 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 16, 123 | "offset": 15 124 | } 125 | } 126 | } 127 | ], 128 | "position": { 129 | "start": { 130 | "line": 1, 131 | "column": 7, 132 | "offset": 6 133 | }, 134 | "end": { 135 | "line": 1, 136 | "column": 16, 137 | "offset": 15 138 | } 139 | } 140 | }, 141 | { 142 | "type": "PunctuationNode", 143 | "value": ".", 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 16, 148 | "offset": 15 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 17, 153 | "offset": 16 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 17, 164 | "offset": 16 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 18, 169 | "offset": 17 170 | } 171 | } 172 | }, 173 | { 174 | "type": "PunctuationNode", 175 | "value": ".", 176 | "position": { 177 | "start": { 178 | "line": 1, 179 | "column": 18, 180 | "offset": 17 181 | }, 182 | "end": { 183 | "line": 1, 184 | "column": 19, 185 | "offset": 18 186 | } 187 | } 188 | }, 189 | { 190 | "type": "WhiteSpaceNode", 191 | "value": " ", 192 | "position": { 193 | "start": { 194 | "line": 1, 195 | "column": 19, 196 | "offset": 18 197 | }, 198 | "end": { 199 | "line": 1, 200 | "column": 20, 201 | "offset": 19 202 | } 203 | } 204 | }, 205 | { 206 | "type": "PunctuationNode", 207 | "value": ".", 208 | "position": { 209 | "start": { 210 | "line": 1, 211 | "column": 20, 212 | "offset": 19 213 | }, 214 | 
"end": { 215 | "line": 1, 216 | "column": 21, 217 | "offset": 20 218 | } 219 | } 220 | } 221 | ], 222 | "position": { 223 | "start": { 224 | "line": 1, 225 | "column": 1, 226 | "offset": 0 227 | }, 228 | "end": { 229 | "line": 1, 230 | "column": 21, 231 | "offset": 20 232 | } 233 | } 234 | } 235 | ], 236 | "position": { 237 | "start": { 238 | "line": 1, 239 | "column": 1, 240 | "offset": 0 241 | }, 242 | "end": { 243 | "line": 1, 244 | "column": 21, 245 | "offset": 20 246 | } 247 | } 248 | } 249 | ], 250 | "position": { 251 | "start": { 252 | "line": 1, 253 | "column": 1, 254 | "offset": 0 255 | }, 256 | "end": { 257 | "line": 1, 258 | "column": 21, 259 | "offset": 20 260 | } 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /test/fixture/ellipsis-sentence-end-unicode.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "To", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 3, 25 | "offset": 2 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 3, 39 | "offset": 2 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 3, 50 | "offset": 2 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 4, 55 | "offset": 3 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "be", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 4, 69 | "offset": 3 70 | }, 71 | "end": { 72 | 
"line": 1, 73 | "column": 6, 74 | "offset": 5 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 4, 83 | "offset": 3 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 6, 88 | "offset": 5 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 6, 99 | "offset": 5 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 7, 104 | "offset": 6 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "continued", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 7, 118 | "offset": 6 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 16, 123 | "offset": 15 124 | } 125 | } 126 | } 127 | ], 128 | "position": { 129 | "start": { 130 | "line": 1, 131 | "column": 7, 132 | "offset": 6 133 | }, 134 | "end": { 135 | "line": 1, 136 | "column": 16, 137 | "offset": 15 138 | } 139 | } 140 | }, 141 | { 142 | "type": "PunctuationNode", 143 | "value": "…", 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 16, 148 | "offset": 15 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 17, 153 | "offset": 16 154 | } 155 | } 156 | } 157 | ], 158 | "position": { 159 | "start": { 160 | "line": 1, 161 | "column": 1, 162 | "offset": 0 163 | }, 164 | "end": { 165 | "line": 1, 166 | "column": 17, 167 | "offset": 16 168 | } 169 | } 170 | } 171 | ], 172 | "position": { 173 | "start": { 174 | "line": 1, 175 | "column": 1, 176 | "offset": 0 177 | }, 178 | "end": { 179 | "line": 1, 180 | "column": 17, 181 | "offset": 16 182 | } 183 | } 184 | } 185 | ], 186 | "position": { 187 | "start": { 188 | "line": 1, 189 | "column": 1, 190 | "offset": 0 191 | }, 192 | "end": { 193 | "line": 1, 194 | "column": 17, 195 | "offset": 16 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- 
/test/fixture/ellipsis-sentence-end.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "To", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 3, 25 | "offset": 2 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 3, 39 | "offset": 2 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 3, 50 | "offset": 2 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 4, 55 | "offset": 3 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "be", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 4, 69 | "offset": 3 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 6, 74 | "offset": 5 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 4, 83 | "offset": 3 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 6, 88 | "offset": 5 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 6, 99 | "offset": 5 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 7, 104 | "offset": 6 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "continued", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 7, 118 | "offset": 6 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 16, 123 | "offset": 15 124 | } 
125 | } 126 | } 127 | ], 128 | "position": { 129 | "start": { 130 | "line": 1, 131 | "column": 7, 132 | "offset": 6 133 | }, 134 | "end": { 135 | "line": 1, 136 | "column": 16, 137 | "offset": 15 138 | } 139 | } 140 | }, 141 | { 142 | "type": "PunctuationNode", 143 | "value": "...", 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 16, 148 | "offset": 15 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 19, 153 | "offset": 18 154 | } 155 | } 156 | } 157 | ], 158 | "position": { 159 | "start": { 160 | "line": 1, 161 | "column": 1, 162 | "offset": 0 163 | }, 164 | "end": { 165 | "line": 1, 166 | "column": 19, 167 | "offset": 18 168 | } 169 | } 170 | } 171 | ], 172 | "position": { 173 | "start": { 174 | "line": 1, 175 | "column": 1, 176 | "offset": 0 177 | }, 178 | "end": { 179 | "line": 1, 180 | "column": 19, 181 | "offset": 18 182 | } 183 | } 184 | } 185 | ], 186 | "position": { 187 | "start": { 188 | "line": 1, 189 | "column": 1, 190 | "offset": 0 191 | }, 192 | "end": { 193 | "line": 1, 194 | "column": 19, 195 | "offset": 18 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /test/fixture/ellipsis-sentence-start-unicode.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "PunctuationNode", 12 | "value": "…", 13 | "position": { 14 | "start": { 15 | "line": 1, 16 | "column": 1, 17 | "offset": 0 18 | }, 19 | "end": { 20 | "line": 1, 21 | "column": 2, 22 | "offset": 1 23 | } 24 | } 25 | }, 26 | { 27 | "type": "WordNode", 28 | "children": [ 29 | { 30 | "type": "TextNode", 31 | "value": "to", 32 | "position": { 33 | "start": { 34 | "line": 1, 35 | "column": 2, 36 | "offset": 1 37 | }, 38 | "end": { 39 | "line": 1, 40 | "column": 4, 41 | "offset": 3 42 | } 43 | } 44 | } 45 | ], 46 
| "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 2, 50 | "offset": 1 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 4, 55 | "offset": 3 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WhiteSpaceNode", 61 | "value": " ", 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 4, 66 | "offset": 3 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 5, 71 | "offset": 4 72 | } 73 | } 74 | }, 75 | { 76 | "type": "WordNode", 77 | "children": [ 78 | { 79 | "type": "TextNode", 80 | "value": "be", 81 | "position": { 82 | "start": { 83 | "line": 1, 84 | "column": 5, 85 | "offset": 4 86 | }, 87 | "end": { 88 | "line": 1, 89 | "column": 7, 90 | "offset": 6 91 | } 92 | } 93 | } 94 | ], 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 5, 99 | "offset": 4 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 7, 104 | "offset": 6 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WhiteSpaceNode", 110 | "value": " ", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 7, 115 | "offset": 6 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 8, 120 | "offset": 7 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WordNode", 126 | "children": [ 127 | { 128 | "type": "TextNode", 129 | "value": "continued", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 8, 134 | "offset": 7 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 17, 139 | "offset": 16 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 8, 148 | "offset": 7 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 17, 153 | "offset": 16 154 | } 155 | } 156 | }, 157 | { 158 | "type": "PunctuationNode", 159 | "value": ".", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 17, 164 | "offset": 16 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 18, 169 | "offset": 17 170 | } 171 | } 172 | } 173 | ], 174 | "position": { 175 | "start": { 176 | "line": 1, 177 | 
"column": 1, 178 | "offset": 0 179 | }, 180 | "end": { 181 | "line": 1, 182 | "column": 18, 183 | "offset": 17 184 | } 185 | } 186 | } 187 | ], 188 | "position": { 189 | "start": { 190 | "line": 1, 191 | "column": 1, 192 | "offset": 0 193 | }, 194 | "end": { 195 | "line": 1, 196 | "column": 18, 197 | "offset": 17 198 | } 199 | } 200 | } 201 | ], 202 | "position": { 203 | "start": { 204 | "line": 1, 205 | "column": 1, 206 | "offset": 0 207 | }, 208 | "end": { 209 | "line": 1, 210 | "column": 18, 211 | "offset": 17 212 | } 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /test/fixture/ellipsis-sentence-start.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "PunctuationNode", 12 | "value": "...", 13 | "position": { 14 | "start": { 15 | "line": 1, 16 | "column": 1, 17 | "offset": 0 18 | }, 19 | "end": { 20 | "line": 1, 21 | "column": 4, 22 | "offset": 3 23 | } 24 | } 25 | }, 26 | { 27 | "type": "WordNode", 28 | "children": [ 29 | { 30 | "type": "TextNode", 31 | "value": "to", 32 | "position": { 33 | "start": { 34 | "line": 1, 35 | "column": 4, 36 | "offset": 3 37 | }, 38 | "end": { 39 | "line": 1, 40 | "column": 6, 41 | "offset": 5 42 | } 43 | } 44 | } 45 | ], 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 4, 50 | "offset": 3 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 6, 55 | "offset": 5 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WhiteSpaceNode", 61 | "value": " ", 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 6, 66 | "offset": 5 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 7, 71 | "offset": 6 72 | } 73 | } 74 | }, 75 | { 76 | "type": "WordNode", 77 | "children": [ 78 | { 79 | "type": "TextNode", 80 | "value": "be", 81 | "position": { 82 | "start": { 83 
| "line": 1, 84 | "column": 7, 85 | "offset": 6 86 | }, 87 | "end": { 88 | "line": 1, 89 | "column": 9, 90 | "offset": 8 91 | } 92 | } 93 | } 94 | ], 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 7, 99 | "offset": 6 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 9, 104 | "offset": 8 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WhiteSpaceNode", 110 | "value": " ", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 9, 115 | "offset": 8 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 10, 120 | "offset": 9 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WordNode", 126 | "children": [ 127 | { 128 | "type": "TextNode", 129 | "value": "continued", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 10, 134 | "offset": 9 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 19, 139 | "offset": 18 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 10, 148 | "offset": 9 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 19, 153 | "offset": 18 154 | } 155 | } 156 | }, 157 | { 158 | "type": "PunctuationNode", 159 | "value": ".", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 19, 164 | "offset": 18 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 20, 169 | "offset": 19 170 | } 171 | } 172 | } 173 | ], 174 | "position": { 175 | "start": { 176 | "line": 1, 177 | "column": 1, 178 | "offset": 0 179 | }, 180 | "end": { 181 | "line": 1, 182 | "column": 20, 183 | "offset": 19 184 | } 185 | } 186 | } 187 | ], 188 | "position": { 189 | "start": { 190 | "line": 1, 191 | "column": 1, 192 | "offset": 0 193 | }, 194 | "end": { 195 | "line": 1, 196 | "column": 20, 197 | "offset": 19 198 | } 199 | } 200 | } 201 | ], 202 | "position": { 203 | "start": { 204 | "line": 1, 205 | "column": 1, 206 | "offset": 0 207 | }, 208 | "end": { 209 | "line": 1, 210 | "column": 20, 211 | "offset": 19 212 | } 213 | } 214 | } 215 | 
-------------------------------------------------------------------------------- /test/fixture/full-stop-followed-by-digit.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Of", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 3, 25 | "offset": 2 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 3, 39 | "offset": 2 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 3, 50 | "offset": 2 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 4, 55 | "offset": 3 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "PunctuationNode", 64 | "value": ".", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 4, 69 | "offset": 3 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 5, 74 | "offset": 4 75 | } 76 | } 77 | }, 78 | { 79 | "type": "TextNode", 80 | "value": "5", 81 | "position": { 82 | "start": { 83 | "line": 1, 84 | "column": 5, 85 | "offset": 4 86 | }, 87 | "end": { 88 | "line": 1, 89 | "column": 6, 90 | "offset": 5 91 | } 92 | } 93 | } 94 | ], 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 4, 99 | "offset": 3 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 6, 104 | "offset": 5 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WhiteSpaceNode", 110 | "value": " ", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 6, 115 | "offset": 5 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 
7, 120 | "offset": 6 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WordNode", 126 | "children": [ 127 | { 128 | "type": "TextNode", 129 | "value": "percent", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 7, 134 | "offset": 6 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 14, 139 | "offset": 13 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 7, 148 | "offset": 6 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 14, 153 | "offset": 13 154 | } 155 | } 156 | }, 157 | { 158 | "type": "PunctuationNode", 159 | "value": ".", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 14, 164 | "offset": 13 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 15, 169 | "offset": 14 170 | } 171 | } 172 | } 173 | ], 174 | "position": { 175 | "start": { 176 | "line": 1, 177 | "column": 1, 178 | "offset": 0 179 | }, 180 | "end": { 181 | "line": 1, 182 | "column": 15, 183 | "offset": 14 184 | } 185 | } 186 | } 187 | ], 188 | "position": { 189 | "start": { 190 | "line": 1, 191 | "column": 1, 192 | "offset": 0 193 | }, 194 | "end": { 195 | "line": 1, 196 | "column": 15, 197 | "offset": 14 198 | } 199 | } 200 | } 201 | ], 202 | "position": { 203 | "start": { 204 | "line": 1, 205 | "column": 1, 206 | "offset": 0 207 | }, 208 | "end": { 209 | "line": 1, 210 | "column": 15, 211 | "offset": 14 212 | } 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /test/fixture/implicit-sentence-end.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "One", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | 
"end": { 23 | "line": 1, 24 | "column": 4, 25 | "offset": 3 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 4, 39 | "offset": 3 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 4, 50 | "offset": 3 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 5, 55 | "offset": 4 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "sentence", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 5, 69 | "offset": 4 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 13, 74 | "offset": 12 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 5, 83 | "offset": 4 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 13, 88 | "offset": 12 89 | } 90 | } 91 | }, 92 | { 93 | "type": "PunctuationNode", 94 | "value": ".", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 13, 99 | "offset": 12 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 14, 104 | "offset": 13 105 | } 106 | } 107 | } 108 | ], 109 | "position": { 110 | "start": { 111 | "line": 1, 112 | "column": 1, 113 | "offset": 0 114 | }, 115 | "end": { 116 | "line": 1, 117 | "column": 14, 118 | "offset": 13 119 | } 120 | } 121 | }, 122 | { 123 | "type": "WhiteSpaceNode", 124 | "value": " ", 125 | "position": { 126 | "start": { 127 | "line": 1, 128 | "column": 14, 129 | "offset": 13 130 | }, 131 | "end": { 132 | "line": 1, 133 | "column": 15, 134 | "offset": 14 135 | } 136 | } 137 | }, 138 | { 139 | "type": "SentenceNode", 140 | "children": [ 141 | { 142 | "type": "WordNode", 143 | "children": [ 144 | { 145 | "type": "TextNode", 146 | "value": "Two", 147 | "position": { 148 | "start": { 149 | "line": 1, 150 | "column": 15, 151 | "offset": 14 152 | }, 153 | "end": { 154 | 
"line": 1, 155 | "column": 18, 156 | "offset": 17 157 | } 158 | } 159 | } 160 | ], 161 | "position": { 162 | "start": { 163 | "line": 1, 164 | "column": 15, 165 | "offset": 14 166 | }, 167 | "end": { 168 | "line": 1, 169 | "column": 18, 170 | "offset": 17 171 | } 172 | } 173 | }, 174 | { 175 | "type": "WhiteSpaceNode", 176 | "value": " ", 177 | "position": { 178 | "start": { 179 | "line": 1, 180 | "column": 18, 181 | "offset": 17 182 | }, 183 | "end": { 184 | "line": 1, 185 | "column": 19, 186 | "offset": 18 187 | } 188 | } 189 | }, 190 | { 191 | "type": "WordNode", 192 | "children": [ 193 | { 194 | "type": "TextNode", 195 | "value": "sentences", 196 | "position": { 197 | "start": { 198 | "line": 1, 199 | "column": 19, 200 | "offset": 18 201 | }, 202 | "end": { 203 | "line": 1, 204 | "column": 28, 205 | "offset": 27 206 | } 207 | } 208 | } 209 | ], 210 | "position": { 211 | "start": { 212 | "line": 1, 213 | "column": 19, 214 | "offset": 18 215 | }, 216 | "end": { 217 | "line": 1, 218 | "column": 28, 219 | "offset": 27 220 | } 221 | } 222 | } 223 | ], 224 | "position": { 225 | "start": { 226 | "line": 1, 227 | "column": 15, 228 | "offset": 14 229 | }, 230 | "end": { 231 | "line": 1, 232 | "column": 28, 233 | "offset": 27 234 | } 235 | } 236 | } 237 | ], 238 | "position": { 239 | "start": { 240 | "line": 1, 241 | "column": 1, 242 | "offset": 0 243 | }, 244 | "end": { 245 | "line": 1, 246 | "column": 28, 247 | "offset": 27 248 | } 249 | } 250 | } 251 | ], 252 | "position": { 253 | "start": { 254 | "line": 1, 255 | "column": 1, 256 | "offset": 0 257 | }, 258 | "end": { 259 | "line": 1, 260 | "column": 28, 261 | "offset": 27 262 | } 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /test/fixture/initialism-like.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": 
"SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Self", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 5, 25 | "offset": 4 26 | } 27 | } 28 | }, 29 | { 30 | "type": "PunctuationNode", 31 | "value": "-", 32 | "position": { 33 | "start": { 34 | "line": 1, 35 | "column": 5, 36 | "offset": 4 37 | }, 38 | "end": { 39 | "line": 1, 40 | "column": 6, 41 | "offset": 5 42 | } 43 | } 44 | }, 45 | { 46 | "type": "TextNode", 47 | "value": "contained", 48 | "position": { 49 | "start": { 50 | "line": 1, 51 | "column": 6, 52 | "offset": 5 53 | }, 54 | "end": { 55 | "line": 1, 56 | "column": 15, 57 | "offset": 14 58 | } 59 | } 60 | } 61 | ], 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 1, 66 | "offset": 0 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 15, 71 | "offset": 14 72 | } 73 | } 74 | }, 75 | { 76 | "type": "PunctuationNode", 77 | "value": ".", 78 | "position": { 79 | "start": { 80 | "line": 1, 81 | "column": 15, 82 | "offset": 14 83 | }, 84 | "end": { 85 | "line": 1, 86 | "column": 16, 87 | "offset": 15 88 | } 89 | } 90 | } 91 | ], 92 | "position": { 93 | "start": { 94 | "line": 1, 95 | "column": 1, 96 | "offset": 0 97 | }, 98 | "end": { 99 | "line": 1, 100 | "column": 16, 101 | "offset": 15 102 | } 103 | } 104 | } 105 | ], 106 | "position": { 107 | "start": { 108 | "line": 1, 109 | "column": 1, 110 | "offset": 0 111 | }, 112 | "end": { 113 | "line": 1, 114 | "column": 16, 115 | "offset": 15 116 | } 117 | } 118 | } 119 | ], 120 | "position": { 121 | "start": { 122 | "line": 1, 123 | "column": 1, 124 | "offset": 0 125 | }, 126 | "end": { 127 | "line": 1, 128 | "column": 16, 129 | "offset": 15 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /test/fixture/intelectual-copyright-symbol.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "SymbolNode", 12 | "value": "©", 13 | "position": { 14 | "start": { 15 | "line": 1, 16 | "column": 1, 17 | "offset": 0 18 | }, 19 | "end": { 20 | "line": 1, 21 | "column": 2, 22 | "offset": 1 23 | } 24 | } 25 | }, 26 | { 27 | "type": "WhiteSpaceNode", 28 | "value": " ", 29 | "position": { 30 | "start": { 31 | "line": 1, 32 | "column": 2, 33 | "offset": 1 34 | }, 35 | "end": { 36 | "line": 1, 37 | "column": 3, 38 | "offset": 2 39 | } 40 | } 41 | }, 42 | { 43 | "type": "WordNode", 44 | "children": [ 45 | { 46 | "type": "TextNode", 47 | "value": "John", 48 | "position": { 49 | "start": { 50 | "line": 1, 51 | "column": 3, 52 | "offset": 2 53 | }, 54 | "end": { 55 | "line": 1, 56 | "column": 7, 57 | "offset": 6 58 | } 59 | } 60 | } 61 | ], 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 3, 66 | "offset": 2 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 7, 71 | "offset": 6 72 | } 73 | } 74 | }, 75 | { 76 | "type": "WhiteSpaceNode", 77 | "value": " ", 78 | "position": { 79 | "start": { 80 | "line": 1, 81 | "column": 7, 82 | "offset": 6 83 | }, 84 | "end": { 85 | "line": 1, 86 | "column": 8, 87 | "offset": 7 88 | } 89 | } 90 | }, 91 | { 92 | "type": "WordNode", 93 | "children": [ 94 | { 95 | "type": "TextNode", 96 | "value": "Smith", 97 | "position": { 98 | "start": { 99 | "line": 1, 100 | "column": 8, 101 | "offset": 7 102 | }, 103 | "end": { 104 | "line": 1, 105 | "column": 13, 106 | "offset": 12 107 | } 108 | } 109 | } 110 | ], 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 8, 115 | "offset": 7 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 13, 120 | "offset": 12 121 | } 122 | } 123 | }, 124 | { 125 | "type": "PunctuationNode", 126 | "value": ".", 127 | "position": 
{ 128 | "start": { 129 | "line": 1, 130 | "column": 13, 131 | "offset": 12 132 | }, 133 | "end": { 134 | "line": 1, 135 | "column": 14, 136 | "offset": 13 137 | } 138 | } 139 | } 140 | ], 141 | "position": { 142 | "start": { 143 | "line": 1, 144 | "column": 1, 145 | "offset": 0 146 | }, 147 | "end": { 148 | "line": 1, 149 | "column": 14, 150 | "offset": 13 151 | } 152 | } 153 | } 154 | ], 155 | "position": { 156 | "start": { 157 | "line": 1, 158 | "column": 1, 159 | "offset": 0 160 | }, 161 | "end": { 162 | "line": 1, 163 | "column": 14, 164 | "offset": 13 165 | } 166 | } 167 | } 168 | ], 169 | "position": { 170 | "start": { 171 | "line": 1, 172 | "column": 1, 173 | "offset": 0 174 | }, 175 | "end": { 176 | "line": 1, 177 | "column": 14, 178 | "offset": 13 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /test/fixture/intelectual-service-mark.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "ABC", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 4, 25 | "offset": 3 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 4, 39 | "offset": 3 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 4, 50 | "offset": 3 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 5, 55 | "offset": 4 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "Law", 65 | "position": { 66 | 
"start": { 67 | "line": 1, 68 | "column": 5, 69 | "offset": 4 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 8, 74 | "offset": 7 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 5, 83 | "offset": 4 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 8, 88 | "offset": 7 89 | } 90 | } 91 | }, 92 | { 93 | "type": "SymbolNode", 94 | "value": "℠", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 8, 99 | "offset": 7 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 9, 104 | "offset": 8 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WhiteSpaceNode", 110 | "value": " ", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 9, 115 | "offset": 8 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 10, 120 | "offset": 9 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WordNode", 126 | "children": [ 127 | { 128 | "type": "TextNode", 129 | "value": "legal", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 10, 134 | "offset": 9 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 15, 139 | "offset": 14 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 10, 148 | "offset": 9 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 15, 153 | "offset": 14 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 15, 164 | "offset": 14 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 16, 169 | "offset": 15 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "services", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 16, 183 | "offset": 15 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 24, 188 | "offset": 23 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | 
"column": 16, 197 | "offset": 15 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 24, 202 | "offset": 23 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 24, 213 | "offset": 23 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 25, 218 | "offset": 24 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 25, 232 | "offset": 24 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 25, 246 | "offset": 24 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 25, 260 | "offset": 24 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/intelectual-trademark.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Mytrademark", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 12, 25 | "offset": 11 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 12, 39 | "offset": 11 40 | } 41 | } 42 | }, 43 | { 44 | "type": "SymbolNode", 45 | "value": "™", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 12, 50 | "offset": 11 51 
| }, 52 | "end": { 53 | "line": 1, 54 | "column": 13, 55 | "offset": 12 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WhiteSpaceNode", 61 | "value": " ", 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 13, 66 | "offset": 12 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 14, 71 | "offset": 13 72 | } 73 | } 74 | }, 75 | { 76 | "type": "WordNode", 77 | "children": [ 78 | { 79 | "type": "TextNode", 80 | "value": "is", 81 | "position": { 82 | "start": { 83 | "line": 1, 84 | "column": 14, 85 | "offset": 13 86 | }, 87 | "end": { 88 | "line": 1, 89 | "column": 16, 90 | "offset": 15 91 | } 92 | } 93 | } 94 | ], 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 14, 99 | "offset": 13 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 16, 104 | "offset": 15 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WhiteSpaceNode", 110 | "value": " ", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 16, 115 | "offset": 15 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 17, 120 | "offset": 16 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WordNode", 126 | "children": [ 127 | { 128 | "type": "TextNode", 129 | "value": "a", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 17, 134 | "offset": 16 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 18, 139 | "offset": 17 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 17, 148 | "offset": 16 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 18, 153 | "offset": 17 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 18, 164 | "offset": 17 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 19, 169 | "offset": 18 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "trademark", 179 | "position": { 180 | 
"start": { 181 | "line": 1, 182 | "column": 19, 183 | "offset": 18 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 28, 188 | "offset": 27 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 19, 197 | "offset": 18 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 28, 202 | "offset": 27 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 28, 213 | "offset": 27 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 29, 218 | "offset": 28 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 29, 232 | "offset": 28 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 29, 246 | "offset": 28 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 29, 260 | "offset": 28 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/latin-exception-al.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Gibberish", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 10, 25 | "offset": 9 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | 
}, 36 | "end": { 37 | "line": 1, 38 | "column": 10, 39 | "offset": 9 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 10, 50 | "offset": 9 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 11, 55 | "offset": 10 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "something", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 11, 69 | "offset": 10 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 20, 74 | "offset": 19 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 11, 83 | "offset": 10 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 20, 88 | "offset": 19 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 20, 99 | "offset": 19 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 21, 104 | "offset": 20 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "Al", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 21, 118 | "offset": 20 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 23, 123 | "offset": 22 124 | } 125 | } 126 | }, 127 | { 128 | "type": "PunctuationNode", 129 | "value": ".", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 23, 134 | "offset": 22 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 24, 139 | "offset": 23 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 21, 148 | "offset": 20 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 24, 153 | "offset": 23 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 24, 164 | "offset": 23 165 | 
}, 166 | "end": { 167 | "line": 1, 168 | "column": 25, 169 | "offset": 24 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "Gobbledygook", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 25, 183 | "offset": 24 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 37, 188 | "offset": 36 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 25, 197 | "offset": 24 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 37, 202 | "offset": 36 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 37, 213 | "offset": 36 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 38, 218 | "offset": 37 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 38, 232 | "offset": 37 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 38, 246 | "offset": 37 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 38, 260 | "offset": 37 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/latin-exception-ca.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Gibberish", 16 | "position": { 17 | "start": { 18 
| "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 10, 25 | "offset": 9 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 10, 39 | "offset": 9 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 10, 50 | "offset": 9 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 11, 55 | "offset": 10 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "something", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 11, 69 | "offset": 10 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 20, 74 | "offset": 19 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 11, 83 | "offset": 10 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 20, 88 | "offset": 19 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 20, 99 | "offset": 19 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 21, 104 | "offset": 20 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "Ca", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 21, 118 | "offset": 20 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 23, 123 | "offset": 22 124 | } 125 | } 126 | }, 127 | { 128 | "type": "PunctuationNode", 129 | "value": ".", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 23, 134 | "offset": 22 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 24, 139 | "offset": 23 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 21, 148 | "offset": 20 149 | }, 150 | "end": 
{ 151 | "line": 1, 152 | "column": 24, 153 | "offset": 23 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 24, 164 | "offset": 23 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 25, 169 | "offset": 24 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "Gobbledygook", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 25, 183 | "offset": 24 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 37, 188 | "offset": 36 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 25, 197 | "offset": 24 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 37, 202 | "offset": 36 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 37, 213 | "offset": 36 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 38, 218 | "offset": 37 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 38, 232 | "offset": 37 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 38, 246 | "offset": 37 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 38, 260 | "offset": 37 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/latin-exception-cf.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | 
"type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Gibberish", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 10, 25 | "offset": 9 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 10, 39 | "offset": 9 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 10, 50 | "offset": 9 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 11, 55 | "offset": 10 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "something", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 11, 69 | "offset": 10 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 20, 74 | "offset": 19 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 11, 83 | "offset": 10 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 20, 88 | "offset": 19 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 20, 99 | "offset": 19 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 21, 104 | "offset": 20 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "Cf", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 21, 118 | "offset": 20 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 23, 123 | "offset": 22 124 | } 125 | } 126 | }, 127 | { 128 | "type": "PunctuationNode", 129 | "value": ".", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 23, 134 
| "offset": 22 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 24, 139 | "offset": 23 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 21, 148 | "offset": 20 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 24, 153 | "offset": 23 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 24, 164 | "offset": 23 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 25, 169 | "offset": 24 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "Gobbledygook", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 25, 183 | "offset": 24 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 37, 188 | "offset": 36 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 25, 197 | "offset": 24 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 37, 202 | "offset": 36 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 37, 213 | "offset": 36 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 38, 218 | "offset": 37 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 38, 232 | "offset": 37 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 38, 246 | "offset": 37 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 38, 260 | "offset": 37 261 | } 262 | } 263 | } 264 | 
-------------------------------------------------------------------------------- /test/fixture/latin-exception-cp.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Gibberish", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 10, 25 | "offset": 9 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 10, 39 | "offset": 9 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 10, 50 | "offset": 9 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 11, 55 | "offset": 10 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "something", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 11, 69 | "offset": 10 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 20, 74 | "offset": 19 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 11, 83 | "offset": 10 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 20, 88 | "offset": 19 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 20, 99 | "offset": 19 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 21, 104 | "offset": 20 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "Cp", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 21, 118 
| "offset": 20 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 23, 123 | "offset": 22 124 | } 125 | } 126 | }, 127 | { 128 | "type": "PunctuationNode", 129 | "value": ".", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 23, 134 | "offset": 22 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 24, 139 | "offset": 23 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 21, 148 | "offset": 20 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 24, 153 | "offset": 23 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 24, 164 | "offset": 23 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 25, 169 | "offset": 24 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "Gobbledygook", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 25, 183 | "offset": 24 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 37, 188 | "offset": 36 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 25, 197 | "offset": 24 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 37, 202 | "offset": 36 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 37, 213 | "offset": 36 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 38, 218 | "offset": 37 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 | "line": 1, 231 | "column": 38, 232 | "offset": 37 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 38, 246 | 
"offset": 37 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 38, 260 | "offset": 37 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/latin-exception-ff.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Gibberish", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 10, 25 | "offset": 9 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 10, 39 | "offset": 9 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 10, 50 | "offset": 9 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 11, 55 | "offset": 10 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "something", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 11, 69 | "offset": 10 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 20, 74 | "offset": 19 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 11, 83 | "offset": 10 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 20, 88 | "offset": 19 89 | } 90 | } 91 | }, 92 | { 93 | "type": "WhiteSpaceNode", 94 | "value": " ", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 20, 99 | "offset": 19 100 | }, 101 | "end": { 102 | "line": 1, 103 
| "column": 21, 104 | "offset": 20 105 | } 106 | } 107 | }, 108 | { 109 | "type": "WordNode", 110 | "children": [ 111 | { 112 | "type": "TextNode", 113 | "value": "Ff", 114 | "position": { 115 | "start": { 116 | "line": 1, 117 | "column": 21, 118 | "offset": 20 119 | }, 120 | "end": { 121 | "line": 1, 122 | "column": 23, 123 | "offset": 22 124 | } 125 | } 126 | }, 127 | { 128 | "type": "PunctuationNode", 129 | "value": ".", 130 | "position": { 131 | "start": { 132 | "line": 1, 133 | "column": 23, 134 | "offset": 22 135 | }, 136 | "end": { 137 | "line": 1, 138 | "column": 24, 139 | "offset": 23 140 | } 141 | } 142 | } 143 | ], 144 | "position": { 145 | "start": { 146 | "line": 1, 147 | "column": 21, 148 | "offset": 20 149 | }, 150 | "end": { 151 | "line": 1, 152 | "column": 24, 153 | "offset": 23 154 | } 155 | } 156 | }, 157 | { 158 | "type": "WhiteSpaceNode", 159 | "value": " ", 160 | "position": { 161 | "start": { 162 | "line": 1, 163 | "column": 24, 164 | "offset": 23 165 | }, 166 | "end": { 167 | "line": 1, 168 | "column": 25, 169 | "offset": 24 170 | } 171 | } 172 | }, 173 | { 174 | "type": "WordNode", 175 | "children": [ 176 | { 177 | "type": "TextNode", 178 | "value": "Gobbledygook", 179 | "position": { 180 | "start": { 181 | "line": 1, 182 | "column": 25, 183 | "offset": 24 184 | }, 185 | "end": { 186 | "line": 1, 187 | "column": 37, 188 | "offset": 36 189 | } 190 | } 191 | } 192 | ], 193 | "position": { 194 | "start": { 195 | "line": 1, 196 | "column": 25, 197 | "offset": 24 198 | }, 199 | "end": { 200 | "line": 1, 201 | "column": 37, 202 | "offset": 36 203 | } 204 | } 205 | }, 206 | { 207 | "type": "PunctuationNode", 208 | "value": ".", 209 | "position": { 210 | "start": { 211 | "line": 1, 212 | "column": 37, 213 | "offset": 36 214 | }, 215 | "end": { 216 | "line": 1, 217 | "column": 38, 218 | "offset": 37 219 | } 220 | } 221 | } 222 | ], 223 | "position": { 224 | "start": { 225 | "line": 1, 226 | "column": 1, 227 | "offset": 0 228 | }, 229 | "end": { 230 
| "line": 1, 231 | "column": 38, 232 | "offset": 37 233 | } 234 | } 235 | } 236 | ], 237 | "position": { 238 | "start": { 239 | "line": 1, 240 | "column": 1, 241 | "offset": 0 242 | }, 243 | "end": { 244 | "line": 1, 245 | "column": 38, 246 | "offset": 37 247 | } 248 | } 249 | } 250 | ], 251 | "position": { 252 | "start": { 253 | "line": 1, 254 | "column": 1, 255 | "offset": 0 256 | }, 257 | "end": { 258 | "line": 1, 259 | "column": 38, 260 | "offset": 37 261 | } 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /test/fixture/non-alphabetic-sentence.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "SymbolNode", 12 | "value": "🐸", 13 | "position": { 14 | "start": { 15 | "line": 1, 16 | "column": 1, 17 | "offset": 0 18 | }, 19 | "end": { 20 | "line": 1, 21 | "column": 3, 22 | "offset": 2 23 | } 24 | } 25 | }, 26 | { 27 | "type": "PunctuationNode", 28 | "value": ".", 29 | "position": { 30 | "start": { 31 | "line": 1, 32 | "column": 3, 33 | "offset": 2 34 | }, 35 | "end": { 36 | "line": 1, 37 | "column": 4, 38 | "offset": 3 39 | } 40 | } 41 | } 42 | ], 43 | "position": { 44 | "start": { 45 | "line": 1, 46 | "column": 1, 47 | "offset": 0 48 | }, 49 | "end": { 50 | "line": 1, 51 | "column": 4, 52 | "offset": 3 53 | } 54 | } 55 | } 56 | ], 57 | "position": { 58 | "start": { 59 | "line": 1, 60 | "column": 1, 61 | "offset": 0 62 | }, 63 | "end": { 64 | "line": 1, 65 | "column": 4, 66 | "offset": 3 67 | } 68 | } 69 | } 70 | ], 71 | "position": { 72 | "start": { 73 | "line": 1, 74 | "column": 1, 75 | "offset": 0 76 | }, 77 | "end": { 78 | "line": 1, 79 | "column": 4, 80 | "offset": 3 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- 
/test/fixture/terminal-marker-new-line-multiple.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "Aha", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 4, 25 | "offset": 3 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 4, 39 | "offset": 3 40 | } 41 | } 42 | } 43 | ], 44 | "position": { 45 | "start": { 46 | "line": 1, 47 | "column": 1, 48 | "offset": 0 49 | }, 50 | "end": { 51 | "line": 1, 52 | "column": 4, 53 | "offset": 3 54 | } 55 | } 56 | } 57 | ], 58 | "position": { 59 | "start": { 60 | "line": 1, 61 | "column": 1, 62 | "offset": 0 63 | }, 64 | "end": { 65 | "line": 1, 66 | "column": 4, 67 | "offset": 3 68 | } 69 | } 70 | }, 71 | { 72 | "type": "WhiteSpaceNode", 73 | "value": "\n\n", 74 | "position": { 75 | "start": { 76 | "line": 1, 77 | "column": 4, 78 | "offset": 3 79 | }, 80 | "end": { 81 | "line": 3, 82 | "column": 1, 83 | "offset": 5 84 | } 85 | } 86 | }, 87 | { 88 | "type": "ParagraphNode", 89 | "children": [ 90 | { 91 | "type": "SentenceNode", 92 | "children": [ 93 | { 94 | "type": "WordNode", 95 | "children": [ 96 | { 97 | "type": "TextNode", 98 | "value": "oho", 99 | "position": { 100 | "start": { 101 | "line": 3, 102 | "column": 1, 103 | "offset": 5 104 | }, 105 | "end": { 106 | "line": 3, 107 | "column": 4, 108 | "offset": 8 109 | } 110 | } 111 | } 112 | ], 113 | "position": { 114 | "start": { 115 | "line": 3, 116 | "column": 1, 117 | "offset": 5 118 | }, 119 | "end": { 120 | "line": 3, 121 | "column": 4, 122 | "offset": 8 123 | } 124 | } 125 | } 126 | ], 127 
| "position": { 128 | "start": { 129 | "line": 3, 130 | "column": 1, 131 | "offset": 5 132 | }, 133 | "end": { 134 | "line": 3, 135 | "column": 4, 136 | "offset": 8 137 | } 138 | } 139 | } 140 | ], 141 | "position": { 142 | "start": { 143 | "line": 3, 144 | "column": 1, 145 | "offset": 5 146 | }, 147 | "end": { 148 | "line": 3, 149 | "column": 4, 150 | "offset": 8 151 | } 152 | } 153 | }, 154 | { 155 | "type": "WhiteSpaceNode", 156 | "value": "\n\n", 157 | "position": { 158 | "start": { 159 | "line": 3, 160 | "column": 4, 161 | "offset": 8 162 | }, 163 | "end": { 164 | "line": 5, 165 | "column": 1, 166 | "offset": 10 167 | } 168 | } 169 | }, 170 | { 171 | "type": "ParagraphNode", 172 | "children": [ 173 | { 174 | "type": "SentenceNode", 175 | "children": [ 176 | { 177 | "type": "WordNode", 178 | "children": [ 179 | { 180 | "type": "TextNode", 181 | "value": "uhu", 182 | "position": { 183 | "start": { 184 | "line": 5, 185 | "column": 1, 186 | "offset": 10 187 | }, 188 | "end": { 189 | "line": 5, 190 | "column": 4, 191 | "offset": 13 192 | } 193 | } 194 | } 195 | ], 196 | "position": { 197 | "start": { 198 | "line": 5, 199 | "column": 1, 200 | "offset": 10 201 | }, 202 | "end": { 203 | "line": 5, 204 | "column": 4, 205 | "offset": 13 206 | } 207 | } 208 | }, 209 | { 210 | "type": "PunctuationNode", 211 | "value": ".", 212 | "position": { 213 | "start": { 214 | "line": 5, 215 | "column": 4, 216 | "offset": 13 217 | }, 218 | "end": { 219 | "line": 5, 220 | "column": 5, 221 | "offset": 14 222 | } 223 | } 224 | } 225 | ], 226 | "position": { 227 | "start": { 228 | "line": 5, 229 | "column": 1, 230 | "offset": 10 231 | }, 232 | "end": { 233 | "line": 5, 234 | "column": 5, 235 | "offset": 14 236 | } 237 | } 238 | } 239 | ], 240 | "position": { 241 | "start": { 242 | "line": 5, 243 | "column": 1, 244 | "offset": 10 245 | }, 246 | "end": { 247 | "line": 5, 248 | "column": 5, 249 | "offset": 14 250 | } 251 | } 252 | }, 253 | { 254 | "type": "WhiteSpaceNode", 255 | "value": 
"\n", 256 | "position": { 257 | "start": { 258 | "line": 5, 259 | "column": 5, 260 | "offset": 14 261 | }, 262 | "end": { 263 | "line": 6, 264 | "column": 1, 265 | "offset": 15 266 | } 267 | } 268 | } 269 | ], 270 | "position": { 271 | "start": { 272 | "line": 1, 273 | "column": 1, 274 | "offset": 0 275 | }, 276 | "end": { 277 | "line": 6, 278 | "column": 1, 279 | "offset": 15 280 | } 281 | } 282 | } 283 | -------------------------------------------------------------------------------- /test/fixture/trailing-white-space-final-paragraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "ParagraphNode", 3 | "children": [ 4 | { 5 | "type": "SentenceNode", 6 | "children": [ 7 | { 8 | "type": "WordNode", 9 | "children": [ 10 | { 11 | "type": "TextNode", 12 | "value": "A", 13 | "position": { 14 | "start": { 15 | "line": 1, 16 | "column": 1, 17 | "offset": 0 18 | }, 19 | "end": { 20 | "line": 1, 21 | "column": 2, 22 | "offset": 1 23 | } 24 | } 25 | } 26 | ], 27 | "position": { 28 | "start": { 29 | "line": 1, 30 | "column": 1, 31 | "offset": 0 32 | }, 33 | "end": { 34 | "line": 1, 35 | "column": 2, 36 | "offset": 1 37 | } 38 | } 39 | }, 40 | { 41 | "type": "WhiteSpaceNode", 42 | "value": " ", 43 | "position": { 44 | "start": { 45 | "line": 1, 46 | "column": 2, 47 | "offset": 1 48 | }, 49 | "end": { 50 | "line": 1, 51 | "column": 3, 52 | "offset": 2 53 | } 54 | } 55 | }, 56 | { 57 | "type": "WordNode", 58 | "children": [ 59 | { 60 | "type": "TextNode", 61 | "value": "sentence", 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 3, 66 | "offset": 2 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 11, 71 | "offset": 10 72 | } 73 | } 74 | } 75 | ], 76 | "position": { 77 | "start": { 78 | "line": 1, 79 | "column": 3, 80 | "offset": 2 81 | }, 82 | "end": { 83 | "line": 1, 84 | "column": 11, 85 | "offset": 10 86 | } 87 | } 88 | }, 89 | { 90 | "type": "PunctuationNode", 91 | "value": ".", 92 | "position": { 
93 | "start": { 94 | "line": 1, 95 | "column": 11, 96 | "offset": 10 97 | }, 98 | "end": { 99 | "line": 1, 100 | "column": 12, 101 | "offset": 11 102 | } 103 | } 104 | } 105 | ], 106 | "position": { 107 | "start": { 108 | "line": 1, 109 | "column": 1, 110 | "offset": 0 111 | }, 112 | "end": { 113 | "line": 1, 114 | "column": 12, 115 | "offset": 11 116 | } 117 | } 118 | }, 119 | { 120 | "type": "WhiteSpaceNode", 121 | "value": " ", 122 | "position": { 123 | "start": { 124 | "line": 1, 125 | "column": 12, 126 | "offset": 11 127 | }, 128 | "end": { 129 | "line": 1, 130 | "column": 13, 131 | "offset": 12 132 | } 133 | } 134 | } 135 | ], 136 | "position": { 137 | "start": { 138 | "line": 1, 139 | "column": 1, 140 | "offset": 0 141 | }, 142 | "end": { 143 | "line": 1, 144 | "column": 13, 145 | "offset": 12 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /test/fixture/trailing-white-space-final-sentence.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "SentenceNode", 3 | "children": [ 4 | { 5 | "type": "WordNode", 6 | "children": [ 7 | { 8 | "type": "TextNode", 9 | "value": "A", 10 | "position": { 11 | "start": { 12 | "line": 1, 13 | "column": 1, 14 | "offset": 0 15 | }, 16 | "end": { 17 | "line": 1, 18 | "column": 2, 19 | "offset": 1 20 | } 21 | } 22 | } 23 | ], 24 | "position": { 25 | "start": { 26 | "line": 1, 27 | "column": 1, 28 | "offset": 0 29 | }, 30 | "end": { 31 | "line": 1, 32 | "column": 2, 33 | "offset": 1 34 | } 35 | } 36 | }, 37 | { 38 | "type": "WhiteSpaceNode", 39 | "value": " ", 40 | "position": { 41 | "start": { 42 | "line": 1, 43 | "column": 2, 44 | "offset": 1 45 | }, 46 | "end": { 47 | "line": 1, 48 | "column": 3, 49 | "offset": 2 50 | } 51 | } 52 | }, 53 | { 54 | "type": "WordNode", 55 | "children": [ 56 | { 57 | "type": "TextNode", 58 | "value": "sentence", 59 | "position": { 60 | "start": { 61 | "line": 1, 62 | "column": 3, 63 | 
"offset": 2 64 | }, 65 | "end": { 66 | "line": 1, 67 | "column": 11, 68 | "offset": 10 69 | } 70 | } 71 | } 72 | ], 73 | "position": { 74 | "start": { 75 | "line": 1, 76 | "column": 3, 77 | "offset": 2 78 | }, 79 | "end": { 80 | "line": 1, 81 | "column": 11, 82 | "offset": 10 83 | } 84 | } 85 | }, 86 | { 87 | "type": "PunctuationNode", 88 | "value": ".", 89 | "position": { 90 | "start": { 91 | "line": 1, 92 | "column": 11, 93 | "offset": 10 94 | }, 95 | "end": { 96 | "line": 1, 97 | "column": 12, 98 | "offset": 11 99 | } 100 | } 101 | }, 102 | { 103 | "type": "WhiteSpaceNode", 104 | "value": " ", 105 | "position": { 106 | "start": { 107 | "line": 1, 108 | "column": 12, 109 | "offset": 11 110 | }, 111 | "end": { 112 | "line": 1, 113 | "column": 13, 114 | "offset": 12 115 | } 116 | } 117 | } 118 | ], 119 | "position": { 120 | "start": { 121 | "line": 1, 122 | "column": 1, 123 | "offset": 0 124 | }, 125 | "end": { 126 | "line": 1, 127 | "column": 13, 128 | "offset": 12 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /test/fixture/trailing-white-space-final.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "A", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 2, 25 | "offset": 1 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 2, 39 | "offset": 1 40 | } 41 | } 42 | }, 43 | { 44 | "type": "WhiteSpaceNode", 45 | "value": " ", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 2, 50 | "offset": 1 51 | }, 52 | 
"end": { 53 | "line": 1, 54 | "column": 3, 55 | "offset": 2 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WordNode", 61 | "children": [ 62 | { 63 | "type": "TextNode", 64 | "value": "sentence", 65 | "position": { 66 | "start": { 67 | "line": 1, 68 | "column": 3, 69 | "offset": 2 70 | }, 71 | "end": { 72 | "line": 1, 73 | "column": 11, 74 | "offset": 10 75 | } 76 | } 77 | } 78 | ], 79 | "position": { 80 | "start": { 81 | "line": 1, 82 | "column": 3, 83 | "offset": 2 84 | }, 85 | "end": { 86 | "line": 1, 87 | "column": 11, 88 | "offset": 10 89 | } 90 | } 91 | }, 92 | { 93 | "type": "PunctuationNode", 94 | "value": ".", 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 11, 99 | "offset": 10 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 12, 104 | "offset": 11 105 | } 106 | } 107 | } 108 | ], 109 | "position": { 110 | "start": { 111 | "line": 1, 112 | "column": 1, 113 | "offset": 0 114 | }, 115 | "end": { 116 | "line": 1, 117 | "column": 12, 118 | "offset": 11 119 | } 120 | } 121 | } 122 | ], 123 | "position": { 124 | "start": { 125 | "line": 1, 126 | "column": 1, 127 | "offset": 0 128 | }, 129 | "end": { 130 | "line": 1, 131 | "column": 12, 132 | "offset": 11 133 | } 134 | } 135 | }, 136 | { 137 | "type": "WhiteSpaceNode", 138 | "value": " ", 139 | "position": { 140 | "start": { 141 | "line": 1, 142 | "column": 12, 143 | "offset": 11 144 | }, 145 | "end": { 146 | "line": 1, 147 | "column": 13, 148 | "offset": 12 149 | } 150 | } 151 | } 152 | ], 153 | "position": { 154 | "start": { 155 | "line": 1, 156 | "column": 1, 157 | "offset": 0 158 | }, 159 | "end": { 160 | "line": 1, 161 | "column": 13, 162 | "offset": 12 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /test/fixture/trailing-white-space-initial-paragraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "ParagraphNode", 3 | "children": [ 4 | { 5 | "type": "WhiteSpaceNode", 
6 | "value": "\n", 7 | "position": { 8 | "start": { 9 | "line": 1, 10 | "column": 1, 11 | "offset": 0 12 | }, 13 | "end": { 14 | "line": 2, 15 | "column": 1, 16 | "offset": 1 17 | } 18 | } 19 | }, 20 | { 21 | "type": "SentenceNode", 22 | "children": [ 23 | { 24 | "type": "WordNode", 25 | "children": [ 26 | { 27 | "type": "TextNode", 28 | "value": "A", 29 | "position": { 30 | "start": { 31 | "line": 2, 32 | "column": 1, 33 | "offset": 1 34 | }, 35 | "end": { 36 | "line": 2, 37 | "column": 2, 38 | "offset": 2 39 | } 40 | } 41 | } 42 | ], 43 | "position": { 44 | "start": { 45 | "line": 2, 46 | "column": 1, 47 | "offset": 1 48 | }, 49 | "end": { 50 | "line": 2, 51 | "column": 2, 52 | "offset": 2 53 | } 54 | } 55 | }, 56 | { 57 | "type": "WhiteSpaceNode", 58 | "value": " ", 59 | "position": { 60 | "start": { 61 | "line": 2, 62 | "column": 2, 63 | "offset": 2 64 | }, 65 | "end": { 66 | "line": 2, 67 | "column": 3, 68 | "offset": 3 69 | } 70 | } 71 | }, 72 | { 73 | "type": "WordNode", 74 | "children": [ 75 | { 76 | "type": "TextNode", 77 | "value": "sentence", 78 | "position": { 79 | "start": { 80 | "line": 2, 81 | "column": 3, 82 | "offset": 3 83 | }, 84 | "end": { 85 | "line": 2, 86 | "column": 11, 87 | "offset": 11 88 | } 89 | } 90 | } 91 | ], 92 | "position": { 93 | "start": { 94 | "line": 2, 95 | "column": 3, 96 | "offset": 3 97 | }, 98 | "end": { 99 | "line": 2, 100 | "column": 11, 101 | "offset": 11 102 | } 103 | } 104 | }, 105 | { 106 | "type": "PunctuationNode", 107 | "value": ".", 108 | "position": { 109 | "start": { 110 | "line": 2, 111 | "column": 11, 112 | "offset": 11 113 | }, 114 | "end": { 115 | "line": 2, 116 | "column": 12, 117 | "offset": 12 118 | } 119 | } 120 | } 121 | ], 122 | "position": { 123 | "start": { 124 | "line": 2, 125 | "column": 1, 126 | "offset": 1 127 | }, 128 | "end": { 129 | "line": 2, 130 | "column": 12, 131 | "offset": 12 132 | } 133 | } 134 | } 135 | ], 136 | "position": { 137 | "start": { 138 | "line": 1, 139 | "column": 1, 140 | 
"offset": 0 141 | }, 142 | "end": { 143 | "line": 2, 144 | "column": 12, 145 | "offset": 12 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /test/fixture/trailing-white-space-initial-sentence.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "SentenceNode", 3 | "children": [ 4 | { 5 | "type": "WhiteSpaceNode", 6 | "value": "\n", 7 | "position": { 8 | "start": { 9 | "line": 1, 10 | "column": 1, 11 | "offset": 0 12 | }, 13 | "end": { 14 | "line": 2, 15 | "column": 1, 16 | "offset": 1 17 | } 18 | } 19 | }, 20 | { 21 | "type": "WordNode", 22 | "children": [ 23 | { 24 | "type": "TextNode", 25 | "value": "A", 26 | "position": { 27 | "start": { 28 | "line": 2, 29 | "column": 1, 30 | "offset": 1 31 | }, 32 | "end": { 33 | "line": 2, 34 | "column": 2, 35 | "offset": 2 36 | } 37 | } 38 | } 39 | ], 40 | "position": { 41 | "start": { 42 | "line": 2, 43 | "column": 1, 44 | "offset": 1 45 | }, 46 | "end": { 47 | "line": 2, 48 | "column": 2, 49 | "offset": 2 50 | } 51 | } 52 | }, 53 | { 54 | "type": "WhiteSpaceNode", 55 | "value": " ", 56 | "position": { 57 | "start": { 58 | "line": 2, 59 | "column": 2, 60 | "offset": 2 61 | }, 62 | "end": { 63 | "line": 2, 64 | "column": 3, 65 | "offset": 3 66 | } 67 | } 68 | }, 69 | { 70 | "type": "WordNode", 71 | "children": [ 72 | { 73 | "type": "TextNode", 74 | "value": "sentence", 75 | "position": { 76 | "start": { 77 | "line": 2, 78 | "column": 3, 79 | "offset": 3 80 | }, 81 | "end": { 82 | "line": 2, 83 | "column": 11, 84 | "offset": 11 85 | } 86 | } 87 | } 88 | ], 89 | "position": { 90 | "start": { 91 | "line": 2, 92 | "column": 3, 93 | "offset": 3 94 | }, 95 | "end": { 96 | "line": 2, 97 | "column": 11, 98 | "offset": 11 99 | } 100 | } 101 | }, 102 | { 103 | "type": "PunctuationNode", 104 | "value": ".", 105 | "position": { 106 | "start": { 107 | "line": 2, 108 | "column": 11, 109 | "offset": 11 110 | }, 111 | "end": { 
112 | "line": 2, 113 | "column": 12, 114 | "offset": 12 115 | } 116 | } 117 | } 118 | ], 119 | "position": { 120 | "start": { 121 | "line": 1, 122 | "column": 1, 123 | "offset": 0 124 | }, 125 | "end": { 126 | "line": 2, 127 | "column": 12, 128 | "offset": 12 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /test/fixture/trailing-white-space-initial.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "WhiteSpaceNode", 6 | "value": "\n", 7 | "position": { 8 | "start": { 9 | "line": 1, 10 | "column": 1, 11 | "offset": 0 12 | }, 13 | "end": { 14 | "line": 2, 15 | "column": 1, 16 | "offset": 1 17 | } 18 | } 19 | }, 20 | { 21 | "type": "ParagraphNode", 22 | "children": [ 23 | { 24 | "type": "SentenceNode", 25 | "children": [ 26 | { 27 | "type": "WordNode", 28 | "children": [ 29 | { 30 | "type": "TextNode", 31 | "value": "A", 32 | "position": { 33 | "start": { 34 | "line": 2, 35 | "column": 1, 36 | "offset": 1 37 | }, 38 | "end": { 39 | "line": 2, 40 | "column": 2, 41 | "offset": 2 42 | } 43 | } 44 | } 45 | ], 46 | "position": { 47 | "start": { 48 | "line": 2, 49 | "column": 1, 50 | "offset": 1 51 | }, 52 | "end": { 53 | "line": 2, 54 | "column": 2, 55 | "offset": 2 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WhiteSpaceNode", 61 | "value": " ", 62 | "position": { 63 | "start": { 64 | "line": 2, 65 | "column": 2, 66 | "offset": 2 67 | }, 68 | "end": { 69 | "line": 2, 70 | "column": 3, 71 | "offset": 3 72 | } 73 | } 74 | }, 75 | { 76 | "type": "WordNode", 77 | "children": [ 78 | { 79 | "type": "TextNode", 80 | "value": "sentence", 81 | "position": { 82 | "start": { 83 | "line": 2, 84 | "column": 3, 85 | "offset": 3 86 | }, 87 | "end": { 88 | "line": 2, 89 | "column": 11, 90 | "offset": 11 91 | } 92 | } 93 | } 94 | ], 95 | "position": { 96 | "start": { 97 | "line": 2, 98 | "column": 3, 99 | "offset": 3 100 | }, 101 
| "end": { 102 | "line": 2, 103 | "column": 11, 104 | "offset": 11 105 | } 106 | } 107 | }, 108 | { 109 | "type": "PunctuationNode", 110 | "value": ".", 111 | "position": { 112 | "start": { 113 | "line": 2, 114 | "column": 11, 115 | "offset": 11 116 | }, 117 | "end": { 118 | "line": 2, 119 | "column": 12, 120 | "offset": 12 121 | } 122 | } 123 | } 124 | ], 125 | "position": { 126 | "start": { 127 | "line": 2, 128 | "column": 1, 129 | "offset": 1 130 | }, 131 | "end": { 132 | "line": 2, 133 | "column": 12, 134 | "offset": 12 135 | } 136 | } 137 | } 138 | ], 139 | "position": { 140 | "start": { 141 | "line": 2, 142 | "column": 1, 143 | "offset": 1 144 | }, 145 | "end": { 146 | "line": 2, 147 | "column": 12, 148 | "offset": 12 149 | } 150 | } 151 | } 152 | ], 153 | "position": { 154 | "start": { 155 | "line": 1, 156 | "column": 1, 157 | "offset": 0 158 | }, 159 | "end": { 160 | "line": 2, 161 | "column": 12, 162 | "offset": 12 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /test/fixture/white-space-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "WhiteSpaceNode", 6 | "value": "\n\n", 7 | "position": { 8 | "start": { 9 | "line": 1, 10 | "column": 1, 11 | "offset": 0 12 | }, 13 | "end": { 14 | "line": 3, 15 | "column": 1, 16 | "offset": 2 17 | } 18 | } 19 | } 20 | ], 21 | "position": { 22 | "start": { 23 | "line": 1, 24 | "column": 1, 25 | "offset": 0 26 | }, 27 | "end": { 28 | "line": 3, 29 | "column": 1, 30 | "offset": 2 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /test/fixture/word-initial-ampersand.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "RootNode", 3 | "children": [ 4 | { 5 | "type": "ParagraphNode", 6 | "children": [ 7 | { 8 | "type": "SentenceNode", 9 | "children": [ 
10 | { 11 | "type": "WordNode", 12 | "children": [ 13 | { 14 | "type": "TextNode", 15 | "value": "This", 16 | "position": { 17 | "start": { 18 | "line": 1, 19 | "column": 1, 20 | "offset": 0 21 | }, 22 | "end": { 23 | "line": 1, 24 | "column": 5, 25 | "offset": 4 26 | } 27 | } 28 | } 29 | ], 30 | "position": { 31 | "start": { 32 | "line": 1, 33 | "column": 1, 34 | "offset": 0 35 | }, 36 | "end": { 37 | "line": 1, 38 | "column": 5, 39 | "offset": 4 40 | } 41 | } 42 | }, 43 | { 44 | "type": "PunctuationNode", 45 | "value": ",", 46 | "position": { 47 | "start": { 48 | "line": 1, 49 | "column": 5, 50 | "offset": 4 51 | }, 52 | "end": { 53 | "line": 1, 54 | "column": 6, 55 | "offset": 5 56 | } 57 | } 58 | }, 59 | { 60 | "type": "WhiteSpaceNode", 61 | "value": " ", 62 | "position": { 63 | "start": { 64 | "line": 1, 65 | "column": 6, 66 | "offset": 5 67 | }, 68 | "end": { 69 | "line": 1, 70 | "column": 7, 71 | "offset": 6 72 | } 73 | } 74 | }, 75 | { 76 | "type": "WordNode", 77 | "children": [ 78 | { 79 | "type": "TextNode", 80 | "value": "that", 81 | "position": { 82 | "start": { 83 | "line": 1, 84 | "column": 7, 85 | "offset": 6 86 | }, 87 | "end": { 88 | "line": 1, 89 | "column": 11, 90 | "offset": 10 91 | } 92 | } 93 | } 94 | ], 95 | "position": { 96 | "start": { 97 | "line": 1, 98 | "column": 7, 99 | "offset": 6 100 | }, 101 | "end": { 102 | "line": 1, 103 | "column": 11, 104 | "offset": 10 105 | } 106 | } 107 | }, 108 | { 109 | "type": "PunctuationNode", 110 | "value": ",", 111 | "position": { 112 | "start": { 113 | "line": 1, 114 | "column": 11, 115 | "offset": 10 116 | }, 117 | "end": { 118 | "line": 1, 119 | "column": 12, 120 | "offset": 11 121 | } 122 | } 123 | }, 124 | { 125 | "type": "WhiteSpaceNode", 126 | "value": " ", 127 | "position": { 128 | "start": { 129 | "line": 1, 130 | "column": 12, 131 | "offset": 11 132 | }, 133 | "end": { 134 | "line": 1, 135 | "column": 13, 136 | "offset": 12 137 | } 138 | } 139 | }, 140 | { 141 | "type": "WordNode", 142 | 
"children": [ 143 | { 144 | "type": "SymbolNode", 145 | "value": "&", 146 | "position": { 147 | "start": { 148 | "line": 1, 149 | "column": 13, 150 | "offset": 12 151 | }, 152 | "end": { 153 | "line": 1, 154 | "column": 14, 155 | "offset": 13 156 | } 157 | } 158 | }, 159 | { 160 | "type": "TextNode", 161 | "value": "c", 162 | "position": { 163 | "start": { 164 | "line": 1, 165 | "column": 14, 166 | "offset": 13 167 | }, 168 | "end": { 169 | "line": 1, 170 | "column": 15, 171 | "offset": 14 172 | } 173 | } 174 | } 175 | ], 176 | "position": { 177 | "start": { 178 | "line": 1, 179 | "column": 13, 180 | "offset": 12 181 | }, 182 | "end": { 183 | "line": 1, 184 | "column": 15, 185 | "offset": 14 186 | } 187 | } 188 | }, 189 | { 190 | "type": "PunctuationNode", 191 | "value": ".", 192 | "position": { 193 | "start": { 194 | "line": 1, 195 | "column": 15, 196 | "offset": 14 197 | }, 198 | "end": { 199 | "line": 1, 200 | "column": 16, 201 | "offset": 15 202 | } 203 | } 204 | } 205 | ], 206 | "position": { 207 | "start": { 208 | "line": 1, 209 | "column": 1, 210 | "offset": 0 211 | }, 212 | "end": { 213 | "line": 1, 214 | "column": 16, 215 | "offset": 15 216 | } 217 | } 218 | } 219 | ], 220 | "position": { 221 | "start": { 222 | "line": 1, 223 | "column": 1, 224 | "offset": 0 225 | }, 226 | "end": { 227 | "line": 1, 228 | "column": 16, 229 | "offset": 15 230 | } 231 | } 232 | } 233 | ], 234 | "position": { 235 | "start": { 236 | "line": 1, 237 | "column": 1, 238 | "offset": 0 239 | }, 240 | "end": { 241 | "line": 1, 242 | "column": 16, 243 | "offset": 15 244 | } 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "checkJs": true, 4 | "customConditions": ["development"], 5 | "declaration": true, 6 | "emitDeclarationOnly": true, 7 | "exactOptionalPropertyTypes": true, 8 | "lib": ["es2022"], 9 | 
"module": "node16", 10 | "strict": true, 11 | "target": "es2022" 12 | }, 13 | "exclude": ["coverage/", "node_modules/"], 14 | "include": ["**/*.js"] 15 | } 16 | --------------------------------------------------------------------------------