├── stack.yaml ├── .github ├── FUNDING.yml └── workflows │ └── test-actions.yaml ├── scripts ├── showcase-svg │ ├── .gitignore │ ├── extract.xq │ ├── replace.xq │ ├── Makefile │ └── template.svg ├── ko-kr-stdict │ ├── .gitignore │ ├── README.rst │ └── main.py ├── haddock-prologue │ ├── omit-rich-elements.lua │ └── build.sh ├── deno │ ├── README.md │ └── test.ts └── Bundle-RequiredDlls.ps1 ├── demo ├── .gitignore ├── README.md ├── elm.json ├── github-corner.html └── src │ └── Markdown │ └── HtmlString.elm ├── test ├── data │ ├── 習慣音.ko-KR.html │ ├── 習慣音.ko-Kore.html │ ├── initial-sound-raw.ko-KR.html │ ├── initial-sound-raw.ko-Kore.html │ ├── 이런날.ko-Kore.html │ ├── 이런날.ko-KR.html │ ├── ellipsis.ko-Kore.html │ ├── ellipsis.ko-KR.html │ ├── 大韓民國憲法第十號前文.ko-KP.html │ ├── 大韓民國憲法第十號前文.ko-KR.html │ ├── 大韓民國憲法第十號前文.ko-Kore.html │ ├── preservation.ko-Kore.html │ └── preservation.ko-KR.html ├── Spec.hs ├── doctest.json ├── hlint.hs ├── hspec.hs ├── doctest.hs └── Text │ └── Seonbi │ ├── HangulSpec.hs │ ├── Html │ ├── PrinterSpec.hs │ ├── WrapperSpec.hs │ ├── PreservationSpec.hs │ ├── TextNormalizerSpec.hs │ ├── LangSpec.hs │ ├── ClipperSpec.hs │ └── ScannerSpec.hs │ ├── Unihan │ └── KHangulSpec.hs │ ├── ContentTypesSpec.hs │ └── FacadeSpec.hs ├── .gitattributes ├── .dockerignore ├── .gitignore ├── data └── ko-kr-stdict.tsv ├── .vscode ├── extensions.json └── settings.json ├── en.utf-8.add ├── src └── Text │ └── Seonbi │ ├── Unihan │ ├── README │ └── KHangul.hs │ ├── Html.hs │ ├── Html │ ├── Preservation.hs │ ├── Wrapper.hs │ ├── Entity.hs │ ├── TextNormalizer.hs │ ├── Printer.hs │ ├── Clipper.hs │ ├── TagStack.hs │ ├── Lang.hs │ ├── Scanner.hs │ └── Tag.hs │ ├── Hangul.hs │ ├── Trie.hs │ └── PairedTransformer.hs ├── .editorconfig ├── stack-ghc-8.10.yaml ├── stack-ghc-8.8.yaml ├── stack-ghc-9.2.yaml ├── stack-ghc-9.4.yaml ├── stack-ghc-9.0.yaml ├── bucket └── seonbi.json ├── fly.toml ├── Dockerfile ├── setup ├── README.md └── action.yaml ├── Setup.hs ├── package.yaml ├── README.md ├── app └── seonbi-api.hs └── CHANGES.md /stack.yaml: -------------------------------------------------------------------------------- 1 | stack-ghc-8.8.yaml -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: dahlia 2 | -------------------------------------------------------------------------------- /scripts/showcase-svg/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /demo/.gitignore: -------------------------------------------------------------------------------- 1 | elm-stuff/ 2 | index.html 3 | -------------------------------------------------------------------------------- /test/data/習慣音.ko-KR.html: -------------------------------------------------------------------------------- 1 |

허락하기 곤란하다.

2 | -------------------------------------------------------------------------------- /test/data/習慣音.ko-Kore.html: -------------------------------------------------------------------------------- 1 |

許諾하기 困難하다.

2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | data/*.tsv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /test/data/initial-sound-raw.ko-KR.html: -------------------------------------------------------------------------------- 1 |

가리

2 |

영리

3 | -------------------------------------------------------------------------------- /test/data/initial-sound-raw.ko-Kore.html: -------------------------------------------------------------------------------- 1 |

可利

2 |

營利

3 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover -optF --module-name=Spec #-} 2 | -------------------------------------------------------------------------------- /scripts/ko-kr-stdict/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.tsv 3 | *.zip 4 | .env/ 5 | .venv/ 6 | env/ 7 | venv/ 8 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | *.cabal 2 | *~ 3 | .dockerignore 4 | .git/ 5 | .gitignore 6 | .stack-work/ 7 | Dockerfile 8 | src/Text/Seonbi/kHangul.txt 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.cabal 2 | *.prof 3 | *~ 4 | .stack-work/ 5 | dist-newstyle/ 6 | out/ 7 | seonbi.iml 8 | src/Text/Seonbi/kHangul.txt 9 | stack*.yaml.lock 10 | -------------------------------------------------------------------------------- /data/ko-kr-stdict.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ae62c488fe95e5c8f3ec36d6c3c7b3a84a89530b14e167c95008289bba67a7f6 3 | size 4970476 4 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "EditorConfig.EditorConfig", 4 | "haskell.haskell", 5 | "streetsidesoftware.code-spell-checker" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /test/data/이런날.ko-Kore.html: -------------------------------------------------------------------------------- 1 |

아이들에게 하로의 乾燥한 學課로
2 | 해말간 倦怠가 깃들고、
3 | "矛盾" 두자를 理解치 못하도록
4 | 머리가 單純하였구나。

5 |

尹東柱 <이런날>

6 | -------------------------------------------------------------------------------- /test/data/이런날.ko-KR.html: -------------------------------------------------------------------------------- 1 |

아이들에게 하로의 건조한 학과로
2 | 해말간 권태가 깃들고,
3 | “모순” 두자를 이해치 못하도록
4 | 머리가 단순하였구나.

5 |

윤동주 〈이런날〉

6 | -------------------------------------------------------------------------------- /en.utf-8.add: -------------------------------------------------------------------------------- 1 | deno 2 | guillemets 3 | hanja 4 | inequal 5 | interpunct 6 | interpuncts 7 | phoneticize 8 | phoneticized 9 | punct 10 | seonbi 11 | Sino 12 | stdict 13 | submap 14 | typeclass 15 | typeclasses 16 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | Seonbi demo 2 | =========== 3 | 4 | 5 | 6 | This is a web app to demo Seonbi's options and behaviors, and written in 7 | the [Elm] language. 8 | 9 | [Elm]: https://elm-lang.org/ 10 | -------------------------------------------------------------------------------- /scripts/showcase-svg/extract.xq: -------------------------------------------------------------------------------- 1 | xquery version "3.0"; 2 | 3 | declare namespace svg = "http://www.w3.org/2000/svg"; 4 | declare namespace html = "http://www.w3.org/1999/xhtml"; 5 | 6 | /svg:svg/svg:foreignObject/html:div//html:p[@id="input"] 7 | -------------------------------------------------------------------------------- /test/doctest.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignore": [], 3 | "sourceFolders": [ 4 | "src", 5 | "src/Text", 6 | "src/Text/Seonbi", 7 | "src/Text/Seonbi/Html" 8 | ], 9 | "doctestOptions": [ 10 | "-XHaskell2010" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /test/data/ellipsis.ko-Kore.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | <동물기계>, 이쪽에선 꽤 유명한 고전임에도 불구하고 국내에는 두달 전에 번역되었다. (...) 4 | 그나저나 요새 책 사서 표지랑 목차만 읽는 것이 취미가 되어버린 것 같아서 반성 중임... 5 | 6 |
7 | -------------------------------------------------------------------------------- /test/data/ellipsis.ko-KR.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | 〈동물기계〉, 이쪽에선 꽤 유명한 고전임에도 불구하고 국내에는 두달 전에 번역되었다. (…) 4 | 그나저나 요새 책 사서 표지랑 목차만 읽는 것이 취미가 되어버린 것 같아서 반성 중임… 5 | 6 |
7 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Unihan/README: -------------------------------------------------------------------------------- 1 | The .txt data files in this directory are imported from the unihan-json project: 2 | 3 | https://github.com/dahlia/unihan-json/releases/tag/12.1.0 4 | 5 | If the version of these data become outdated please let me know or send a patch 6 | to update data files! 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | trim_trailing_whitespace = true 6 | insert_final_newline = true 7 | charset = utf-8 8 | indent_style = space 9 | indent_size = 4 10 | max_line_length = 80 11 | 12 | [*.yaml] 13 | indent_size = 2 14 | 15 | [{Makefile,**.mk}] 16 | indent_style = tab 17 | -------------------------------------------------------------------------------- /test/hlint.hs: -------------------------------------------------------------------------------- 1 | import Language.Haskell.HLint (hlint) 2 | import System.Exit (exitFailure, exitSuccess) 3 | 4 | arguments :: [String] 5 | arguments = ["app", "src", "test"] 6 | 7 | main :: IO () 8 | main = do 9 | hlints <- hlint arguments 10 | case hlints of 11 | [] -> exitSuccess 12 | _ -> exitFailure 13 | -------------------------------------------------------------------------------- /stack-ghc-8.10.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-18.28 2 | packages: 3 | - . 4 | extra-deps: 5 | - bytestring-trie-0.2.5.0 6 | - html-charset-0.1.0 7 | flags: 8 | seonbi: 9 | iconv: true 10 | allow-newer: true 11 | ghc-options: 12 | "$everything": -haddock 13 | "$locals": -Werror -fhide-source-paths 14 | require-stack-version: ">=2.7.0" 15 | -------------------------------------------------------------------------------- /stack-ghc-8.8.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-16.31 2 | packages: 3 | - . 4 | extra-deps: 5 | - bytestring-trie-0.2.5.0 6 | - html-charset-0.1.0 7 | flags: 8 | seonbi: 9 | iconv: true 10 | allow-newer: false 11 | ghc-options: 12 | "$everything": -haddock 13 | "$locals": -Werror -fhide-source-paths 14 | require-stack-version: ">=2.7.0" 15 | -------------------------------------------------------------------------------- /test/hspec.hs: -------------------------------------------------------------------------------- 1 | import Control.Monad 2 | import GHC.IO.Encoding 3 | import System.Info (os) 4 | 5 | import System.IO.CodePage (withCP65001) 6 | import Test.Hspec.Runner 7 | 8 | import qualified Spec 9 | 10 | main :: IO () 11 | main = withCP65001 $ do 12 | when (System.Info.os == "ming32") $ setLocaleEncoding utf8 13 | hspecWith defaultConfig Spec.spec 14 | -------------------------------------------------------------------------------- /stack-ghc-9.2.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-20.26 2 | packages: 3 | - . 
4 | extra-deps: 5 | - html-charset-0.1.0 6 | flags: 7 | seonbi: 8 | iconv: false # iconv seems unmaintained and only supports bytestring < 0.11 9 | allow-newer: false 10 | ghc-options: 11 | "$everything": -haddock 12 | "$locals": -Werror -fhide-source-paths 13 | require-stack-version: ">=2.7.0" 14 | -------------------------------------------------------------------------------- /stack-ghc-9.4.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-21.21 2 | packages: 3 | - . 4 | extra-deps: 5 | - cmark-0.6.1 6 | - html-charset-0.1.0 7 | - iconv-0.4.1.3 8 | flags: 9 | seonbi: 10 | iconv: false # iconv seems unmaintained and only supports bytestring < 0.11 11 | allow-newer: false 12 | ghc-options: 13 | "$everything": -haddock 14 | "$locals": -Werror -fhide-source-paths 15 | require-stack-version: ">=2.7.0" 16 | -------------------------------------------------------------------------------- /scripts/showcase-svg/replace.xq: -------------------------------------------------------------------------------- 1 | xquery version "3.0"; 2 | 3 | declare namespace html = "http://www.w3.org/1999/xhtml"; 4 | declare variable $ko-kr external; 5 | declare variable $ko-kp external; 6 | declare variable $ko-kore external; 7 | 8 | replace node //html:div/html:p[@id="placeholder-ko-kr"] 9 | with doc($ko-kr), 10 | replace node //html:div/html:p[@id="placeholder-ko-kp"] 11 | with doc($ko-kp), 12 | replace node //html:div/html:p[@id="placeholder-ko-kore"] 13 | with doc($ko-kore) 14 | -------------------------------------------------------------------------------- /stack-ghc-9.0.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-19.33 2 | packages: 3 | - . 4 | extra-deps: 5 | - aeson-1.5.6.0 6 | - html-charset-0.1.0 7 | flags: 8 | seonbi: 9 | iconv: true 10 | mintty: 11 | Win32-2-13-1: false # https://github.com/RyanGlScott/mintty/issues/4 12 | allow-newer: false 13 | ghc-options: 14 | "$everything": -haddock 15 | "$locals": -Werror -fhide-source-paths 16 | extra-include-dirs: 17 | - /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/ffi 18 | # https://gitlab.haskell.org/ghc/ghc/-/issues/20592#note_403426 19 | require-stack-version: ">=2.7.0" 20 | -------------------------------------------------------------------------------- /test/doctest.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | #if __GLASGOW_HASKELL__ >= 810 3 | main :: IO () 4 | main = do 5 | putStrLn "Temporarily, doctests are ignored for GHC >= 8.10 due to bugs:\n" 6 | putStrLn " https://github.com/sol/doctest/issues/301" 7 | #elif __GLASGOW_HASKELL__ >= 808 && defined(mingw32_HOST_OS) 8 | main :: IO () 9 | main = do 10 | putStr "Temporarily, doctests are ignored for GHC >= 8.8 on Windows " 11 | putStrLn "due to bugs:\n" 12 | putStrLn " https://github.com/sol/doctest/issues/300" 13 | #else 14 | {-# OPTIONS_GHC -F -pgmF doctest-discover -optF test/doctest.json #-} 15 | #endif 16 | -------------------------------------------------------------------------------- /bucket/seonbi.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.5.0", 3 | "description": "SmartyPants for Korean language", 4 | "homepage": "https://github.com/dahlia/seonbi", 5 | "license": "LGPL-2.1", 6 | "architecture": { 7 | "64bit": { 8 | "url": "https://github.com/dahlia/seonbi/releases/download/0.5.0/seonbi-0.5.0.win64.zip", 9 | "hash": 
"8103ff6d7a541827d55ac3d28bcb9182e8e5ceced87de24a63662f2a10e4c610" 10 | } 11 | }, 12 | "bin": ["seonbi.exe", "seonbi-api.exe"], 13 | "checkver": "github", 14 | "autoupdate": { 15 | "architecture": { 16 | "64bit": { 17 | "url": "https://github.com/dahlia/seonbi/releases/download/$version/seonbi-$version.win64.zip" 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /test/data/大韓民國憲法第十號前文.ko-KP.html: -------------------------------------------------------------------------------- 1 |
2 |

3 | 대한민국헌법 4 |

5 |

시행 1988년 2월 25일. 헌법 제10호, 1987년 10월 29일, 전부개정.

6 |

전문

7 |

유구한 력사와 전통에 빛나는 우리 대한국민은 3·1운동으로 건립된 8 | 대한민국림시정부의 법통과 불의에 항거한 4·19민주리념을 계승하고, 9 | 조국의 민주개혁과 평화적 통일의 사명에 입각하여 정의·인도와 동포애로써 10 | 민족의 단결을 공고히 하고, 모든 사회적 폐습과 불의를 타파하며, 11 | 자률과 조화를 바탕으로 자유민주적 기본질서를 더욱 확고히 하여 12 | 정치·경제·사회·문화의 모든 령역에 있어서 각인의 13 | 기회를 균등히 하고, 14 | 능력을 최고도로 발휘하게 하며, 자유와 권리에 따르는 책임과 의무를 완수하게 하여, 15 | 안으로는 국민생활의 균등한 향상을 기하고 밖으로는 항구적인 세계평화와 16 | 인류공영에 이바지함으로써 우리들과 우리들의 자손의 안전과 자유와 행복을 17 | 영원히 확보할 것을 다짐하면서 1948년 7월 12일에 제정되고 8차에 걸쳐 개정된 18 | 헌법을 이제 국회의 의결을 거쳐 국민투표에 의하여 개정한다.

19 |
20 | -------------------------------------------------------------------------------- /test/data/大韓民國憲法第十號前文.ko-KR.html: -------------------------------------------------------------------------------- 1 |
2 |

3 | 대한민국 헌법 4 |

5 |

시행 1988년 2월 25일. 헌법 제10호, 1987년 10월 29일, 전부개정.

6 |

전문

7 |

유구한 역사와 전통에 빛나는 우리 대한국민은 3·1운동으로 건립된 8 | 대한민국 임시 정부의 법통과 불의에 항거한 4·19민주이념을 계승하고, 9 | 조국의 민주개혁과 평화적 통일의 사명에 입각하여 정의·인도와 동포애로써 10 | 민족의 단결을 공고히 하고, 모든 사회적 폐습과 불의를 타파하며, 11 | 자율과 조화를 바탕으로 자유민주적 기본질서를 더욱 확고히 하여 12 | 정치·경제·사회·문화의 모든 영역에 있어서 각인의 13 | 기회를 균등히 하고, 14 | 능력을 최고도로 발휘하게 하며, 자유와 권리에 따르는 책임과 의무를 완수하게 하여, 15 | 안으로는 국민생활의 균등한 향상을 기하고 밖으로는 항구적인 세계평화와 16 | 인류공영에 이바지함으로써 우리들과 우리들의 자손의 안전과 자유와 행복을 17 | 영원히 확보할 것을 다짐하면서 1948년 7월 12일에 제정되고 8차에 걸쳐 개정된 18 | 헌법을 이제 국회의 의결을 거쳐 국민 투표에 의하여 개정한다.

19 |
20 | -------------------------------------------------------------------------------- /test/data/大韓民國憲法第十號前文.ko-Kore.html: -------------------------------------------------------------------------------- 1 |
2 |

3 | 大韓民國憲法 4 |

5 |

施行 1988年 2月 25日. 憲法 第10號, 1987年 10月 29日, 全部改正.

6 |

前文

7 |

悠久한 歷史와 傳統에 빛나는 우리 大韓國民은 3·1運動으로 建立된 8 | 大韓民國臨時政府의 法統과 不義에 抗拒한 4·19民主理念을 계승하고, 9 | 祖國의 民主改革과 平和的 統一의 使命에 입각하여 正義·人道와 同胞愛로써 10 | 民族의 團結을 공고히 하고, 모든 社會的 弊習과 不義를 타파하며, 11 | 自律과 調和를 바탕으로 自由民主的 基本秩序를 더욱 확고히 하여 12 | 政治·經濟·社會·文化의 모든 領域에 있어서 各人의 13 | 機會를 균등히 하고, 14 | 能力을 最高度로 발휘하게 하며, 自由와 權利에 따르는 責任과 義務를 완수하게 하여, 15 | 안으로는 國民生活의 균등한 향상을 기하고 밖으로는 항구적인 世界平和와 16 | 人類共榮에 이바지함으로써 우리들과 우리들의 子孫의 安全과 自由와 幸福을 17 | 영원히 확보할 것을 다짐하면서 1948年 7月 12日에 制定되고 8次에 걸쳐 改正된 18 | 憲法을 이제 國會의 議決을 거쳐 國民投票에 의하여 改正한다.

19 |
20 | -------------------------------------------------------------------------------- /scripts/haddock-prologue/omit-rich-elements.lua: -------------------------------------------------------------------------------- 1 | -- Remove the top-level heading since Haddock in itself prints it. 2 | function Header(elem) 3 | if elem.level > 1 then 4 | return elem 5 | end 6 | return {} 7 | end 8 | 9 | -- Removes linked images since Haddock cannot represent them. 10 | function Link(elem) 11 | children = elem.content 12 | if #children ~= 1 or children[1].tag ~= "Image" then 13 | return nil 14 | end 15 | return {} 16 | end 17 | 18 | -- Escape slashes in hrefs of emphasized links as Pandoc's Haddock target 19 | -- does not escape slashes for us. 20 | function Emph(elem) 21 | return pandoc.walk_inline(elem, { 22 | Link = function (elem) 23 | elem.target = string.gsub(elem.target, "/", "\\/") 24 | return elem 25 | end 26 | }) 27 | end 28 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.customDictionaries": { 3 | "workspace": { 4 | "name": "Workspace Dictionary", 5 | "description": "A custom dictionary for this poject.", 6 | "path": "${workspaceFolder}/en.utf-8.add", 7 | "addWords": true, 8 | "scope": "workspace" 9 | } 10 | }, 11 | "cSpell.dictionaries": [ 12 | "en_US", 13 | "filetypes", 14 | "html", 15 | "softwareTerms", 16 | "typescript" 17 | ], 18 | "cSpell.ignoreRegExpList": [ 19 | "/\\\\x[0-9A-Fa-f]{2}/", 20 | "/\\\\u[0-9A-Fa-f]{4}/", 21 | "/\\\\U[0-9A-Fa-f]{8}/" 22 | ], 23 | 24 | "deno.enable": false, 25 | "deno.enablePaths": [ 26 | "scripts/deno/" 27 | ], 28 | 29 | "haskell.serverEnvironment": { 30 | "STACK_YAML": "stack-ghc-9.4.yaml" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | app = "seonbi" 2 | kill_signal = "SIGINT" 3 | kill_timeout = 5 4 | processes = [] 5 | 6 | [build] 7 | image = "ghcr.io/dahlia/seonbi/bin:latest" 8 | 9 | [experimental] 10 | allowed_public_ports = [3800] 11 | auto_rollback = true 12 | cmd = ["seonbi-api", "--allow-origin=*"] 13 | 14 | [[services]] 15 | http_checks = [] 16 | internal_port = 3800 17 | processes = ["app"] 18 | protocol = "tcp" 19 | script_checks = [] 20 | 21 | [services.concurrency] 22 | hard_limit = 25 23 | soft_limit = 20 24 | type = "connections" 25 | 26 | [[services.ports]] 27 | force_https = true 28 | handlers = ["http"] 29 | port = 80 30 | 31 | [[services.ports]] 32 | handlers = ["tls", "http"] 33 | port = 443 34 | 35 | [[services.tcp_checks]] 36 | grace_period = "1s" 37 | interval = "15s" 38 | restart_limit = 0 39 | timeout = "2s" 40 | -------------------------------------------------------------------------------- /scripts/haddock-prologue/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Prerequisites: 3 | # - Pandoc 2.0+ 4 | # - yq 5 | # - Haskell Stack 6 | # - GNU sed 7 | set -e 8 | root="$(dirname "$0")/../.." 
9 | package="$root/package.yaml" 10 | readme="$root/README.md" 11 | pandoc_script="$(dirname "$0")/omit-rich-elements.lua" 12 | description="$(pandoc --lua-filter "$pandoc_script" -t haddock "$readme")" 13 | backup="$(mktemp)" 14 | cp "$package" "$backup" 15 | cwd="$(pwd)" 16 | exit_code=1 17 | { 18 | yq \ 19 | -y \ 20 | --arg description "$description" \ 21 | '.description = $description' \ 22 | "$backup" > "$package" 23 | cd "$root" 24 | stack haddock --no-haddock-deps 25 | cd "$(stack path --dist-dir)/doc/html/" 26 | hackage_url='https://hackage.haskell.org/package/\1/docs/' 27 | sed -i -E \ 28 | 's|\.\./(([A-Za-z][[:alnum:]]*-)+[0-9]+(\.[0-9]+)*)/|'"$hackage_url|g" \ 29 | ./*/*.html 30 | exit_code=0 31 | } || true 32 | cd "$cwd" 33 | cp "$backup" "$package" 34 | exit "$exit_code" 35 | -------------------------------------------------------------------------------- /demo/elm.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "application", 3 | "source-directories": [ 4 | "src" 5 | ], 6 | "elm-version": "0.19.1", 7 | "dependencies": { 8 | "direct": { 9 | "elm/browser": "1.0.1", 10 | "elm/core": "1.0.2", 11 | "elm/html": "1.0.0", 12 | "elm/http": "2.0.0", 13 | "elm/json": "1.1.3", 14 | "elm/regex": "1.0.0", 15 | "elm/url": "1.0.0", 16 | "hecrj/html-parser": "2.3.4", 17 | "pablohirafuji/elm-markdown": "2.0.5", 18 | "pablohirafuji/elm-syntax-highlight": "3.4.1", 19 | "rundis/elm-bootstrap": "5.1.0" 20 | }, 21 | "indirect": { 22 | "avh4/elm-color": "1.0.0", 23 | "elm/bytes": "1.0.8", 24 | "elm/file": "1.0.5", 25 | "elm/parser": "1.1.0", 26 | "elm/time": "1.0.0", 27 | "elm/virtual-dom": "1.0.2", 28 | "rtfeldman/elm-hex": "1.0.0" 29 | } 30 | }, 31 | "test-dependencies": { 32 | "direct": {}, 33 | "indirect": {} 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /test/data/preservation.ko-Kore.html: -------------------------------------------------------------------------------- 1 |

Contents in the preserved elements should not be touched.

2 |

To be transformed:

3 | 11 |

Not to be transformed:

12 | 26 | -------------------------------------------------------------------------------- /test/data/preservation.ko-KR.html: -------------------------------------------------------------------------------- 1 |

Contents in the preserved elements should not be touched.

2 |

To be transformed:

3 | 11 |

Not to be transformed:

12 | 26 | -------------------------------------------------------------------------------- /scripts/showcase-svg/Makefile: -------------------------------------------------------------------------------- 1 | SEONBI=seonbi 2 | XQILLA=xqilla 3 | 4 | build/showcase.svg: template.svg build/ko-kr.html build/ko-kp.html build/ko-kore.html 5 | cp template.svg build/showcase.svg 6 | $(XQILLA) \ 7 | -u \ 8 | -i build/showcase.svg \ 9 | -v ko-kr build/ko-kr.html \ 10 | -v ko-kp build/ko-kp.html \ 11 | -v ko-kore build/ko-kore.html \ 12 | replace.xq 13 | sed -i.bak \ 14 | 's|^\s\{0,\} Bool 16 | isPreservedTag tag' = 17 | case tag' of 18 | Code -> True 19 | Kbd -> True 20 | Pre -> True 21 | TextArea -> True 22 | _ -> 23 | case htmlTagKind tag' of 24 | Normal -> False 25 | EscapableRawText -> False 26 | _ -> True 27 | 28 | -- | 'True' if the given tag stack should be preserved from transformation. 29 | isPreservedTagStack :: HtmlTagStack -> Bool 30 | isPreservedTagStack = any isPreservedTag 31 | 32 | -- | 'True' if the given HTML entity should be preserved from transformation. 33 | isPreservedEntity :: HtmlEntity -> Bool 34 | isPreservedEntity HtmlComment {} = 35 | True 36 | isPreservedEntity HtmlStartTag { tagStack, tag } = 37 | isPreservedTag tag || isPreservedTagStack tagStack 38 | isPreservedEntity HtmlEndTag { tagStack, tag } = 39 | isPreservedTag tag || isPreservedTagStack tagStack 40 | isPreservedEntity entity = 41 | isPreservedTagStack $ tagStack entity 42 | -------------------------------------------------------------------------------- /.github/workflows/test-actions.yaml: -------------------------------------------------------------------------------- 1 | name: test-actions 2 | on: 3 | push: [] 4 | schedule: 5 | - cron: 59 14 * * * 6 | pull_request: [] 7 | 8 | jobs: 9 | test-setup-seonbi: 10 | strategy: 11 | matrix: 12 | os: 13 | - ubuntu-20.04 14 | - ubuntu-22.04 15 | - macos-12 # Intel 16 | - macos-13-xlarge # Apple silicon 17 | - windows-2019 18 | - windows-2022 19 | fail-fast: false 20 | runs-on: ${{ matrix.os }} 21 | steps: 22 | - uses: actions/checkout@v4 23 | # Test 1 24 | - id: setup1 25 | uses: ./setup 26 | with: 27 | seonbi-version: 0.3.* 28 | add-to-path: false 29 | - run: | 30 | set -e 31 | [[ "${{ steps.setup1.outputs.seonbi-version }}" = 0.3.* ]] 32 | [[ "${{ steps.setup1.outputs.seonbi-version }}" != "0.3.*" ]] 33 | ! command -v seonbi 34 | ! command -v seonbi-api 35 | shell: bash 36 | - run: >- 37 | ${{ steps.setup1.outputs.seonbi-path }} 38 | test/data/大韓民國憲法第十號前文.ko-Kore.html 39 | # Test 2 40 | - id: setup2 41 | uses: ./setup 42 | with: 43 | seonbi-version: 0.3.0 44 | - run: | 45 | set -e 46 | [[ "${{ steps.setup2.outputs.seonbi-version }}" = "0.3.0" ]] 47 | command -v seonbi 48 | command -v seonbi-api 49 | shell: bash 50 | - run: seonbi test/data/大韓民國憲法第十號前文.ko-Kore.html 51 | - run: >- 52 | ${{ steps.setup1.outputs.seonbi-path }} 53 | test/data/大韓民國憲法第十號前文.ko-Kore.html 54 | -------------------------------------------------------------------------------- /test/Text/Seonbi/HangulSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Text.Seonbi.HangulSpec (spec) where 3 | 4 | import Test.Hspec 5 | 6 | import Text.Seonbi.Hangul 7 | 8 | 9 | spec :: Spec 10 | spec = do 11 | specify "isHangulSyllable" $ do 12 | '가' `shouldSatisfy` isHangulSyllable 13 | '글' `shouldSatisfy` isHangulSyllable 14 | 'A' `shouldNotSatisfy` isHangulSyllable 15 | '?' 
`shouldNotSatisfy` isHangulSyllable 16 | '字' `shouldNotSatisfy` isHangulSyllable 17 | describe "toJamoTriple" $ do 18 | it "returns only initial cosonant and vowel if there is no batchim" $ 19 | toJamoTriple '가' `shouldBe` Just ('ᄀ', 'ᅡ', Nothing) 20 | it "returns all of triple if there is a batchim" $ do 21 | toJamoTriple '글' `shouldBe` Just ('ᄀ', 'ᅳ', Just 'ᆯ') 22 | toJamoTriple '를' `shouldBe` Just ('ᄅ', 'ᅳ', Just 'ᆯ') 23 | it "returns Nothing for non-Hangul letters" $ do 24 | toJamoTriple 'A' `shouldBe` Nothing 25 | toJamoTriple '?' `shouldBe` Nothing 26 | toJamoTriple '字' `shouldBe` Nothing 27 | specify "fromJamoTriple" $ do 28 | fromJamoTriple ('ᄀ', 'ᅡ', Nothing) `shouldBe` Just '가' 29 | fromJamoTriple ('ᄀ', 'ᅳ', Just 'ᆯ') `shouldBe` Just '글' 30 | fromJamoTriple ('ᄅ', 'ᅳ', Just 'ᆯ') `shouldBe` Just '를' 31 | fromJamoTriple ('ᄓ', 'ᅳ', Nothing) `shouldBe` Nothing 32 | fromJamoTriple ('ᄀ', 'ᅶ', Nothing) `shouldBe` Nothing 33 | fromJamoTriple ('ᄀ', 'ᅳ', Just 'ᅡ') `shouldBe` Nothing 34 | fromJamoTriple ('ᄀ', 'ᅳ', Just 'ᇇ') `shouldBe` Nothing 35 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Html/PrinterSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.PrinterSpec (spec) where 4 | 5 | import Data.Text.Lazy 6 | import Test.Hspec 7 | 8 | import Text.Seonbi.Html.Entity 9 | import Text.Seonbi.Html.Printer 10 | import Text.Seonbi.Html.Tag 11 | 12 | sample :: [HtmlEntity] 13 | sample = 14 | [ HtmlComment { tagStack = [], comment = " foo " } 15 | , HtmlStartTag { tagStack = [], tag = P, rawAttributes = " id=\"a\"" } 16 | , HtmlText { tagStack = [P], rawText = "Hello," } 17 | , HtmlStartTag { tagStack = [P], tag = BR, rawAttributes = "" } 18 | , HtmlEndTag { tagStack = [P], tag = BR } 19 | , HtmlText { tagStack = [P], rawText = "\n" } 20 | , HtmlStartTag { tagStack = [P], tag = Em, rawAttributes = "class=\"b\"" } 21 | , HtmlCdata { tagStack = [P, Em], text = "world" } 22 | , HtmlEndTag { tagStack = [P], tag = Em } 23 | , HtmlText { tagStack = [P], rawText = "!" } 24 | , HtmlEndTag { tagStack = [], tag = P } 25 | , HtmlStartTag { tagStack = [], tag = P, rawAttributes = "" } 26 | , HtmlEndTag { tagStack = [], tag = P } 27 | ] 28 | 29 | spec :: Spec 30 | spec = do 31 | specify "printHtml" $ 32 | printHtml sample `shouldBe` Data.Text.Lazy.concat 33 | [ "

Hello,
\n" 34 | , "!

" 35 | ] 36 | specify "printXhtml" $ 37 | printXhtml sample `shouldBe` Data.Text.Lazy.concat 38 | [ "

Hello,
\n" 39 | , "!

" 40 | ] 41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # To correctly make a statically-linked binary, we use Alpine Linux. 2 | # The distro entirely uses musl instead of glibc which is unfriendly to be 3 | # statically linked. 4 | FROM docker.io/alpine:3.19 AS build 5 | 6 | LABEL "org.opencontainers.image.title"="Seonbi" 7 | LABEL "org.opencontainers.image.licenses"="LGPL-2.1" 8 | 9 | RUN apk add --no-cache \ 10 | build-base=0.5-r3 \ 11 | bzip2-dev=1.0.8-r6 \ 12 | ghc=9.4.7-r1 \ 13 | libbz2=1.0.8-r6 \ 14 | xz=5.4.5-r0 \ 15 | zlib-dev=1.3.1-r0 \ 16 | zlib-static=1.3.1-r0 17 | 18 | RUN wget -q "https://github.com/commercialhaskell/stack/releases/download/v3.3.1/stack-3.3.1-linux-$(uname -m)-bin" \ 19 | && mv "stack-3.3.1-linux-$(uname -m)-bin" /usr/bin/stack \ 20 | && chmod +x /usr/bin/stack 21 | 22 | RUN stack config set system-ghc --global true 23 | 24 | # Add just the package.yaml file to capture dependencies 25 | COPY package.yaml /src/seonbi/package.yaml 26 | COPY stack-ghc-9.4.yaml /src/seonbi/stack.yaml 27 | 28 | WORKDIR /src/seonbi 29 | 30 | # Docker will cache this command as a layer, freeing us up to 31 | # modify source code without re-installing dependencies 32 | # (unless the .cabal file changes!) 33 | RUN stack setup --system-ghc 34 | RUN stack build \ 35 | --system-ghc \ 36 | --only-snapshot \ 37 | --flag seonbi:iconv \ 38 | --flag seonbi:static 39 | 40 | COPY . /src/seonbi 41 | RUN cp /src/seonbi/stack-ghc-9.4.yaml /src/seonbi/stack.yaml 42 | 43 | RUN stack build \ 44 | --system-ghc \ 45 | --flag seonbi:iconv \ 46 | --flag seonbi:static \ 47 | --copy-bins 48 | 49 | FROM docker.io/alpine:3.19 50 | 51 | COPY --from=build /root/.local/bin/seonbi* /usr/local/bin/ 52 | ENV LANG=en_US.UTF-8 53 | ENV LANGUAGE=en_US.UTF-8 54 | CMD ["seonbi"] 55 | -------------------------------------------------------------------------------- /scripts/deno/README.md: -------------------------------------------------------------------------------- 1 | [Seonbi] client library for Deno 2 | ================================ 3 | 4 | [![Latest version][Tag badge]][Deno module] 5 | 6 | *[Seonbi] is an HTML preprocessor that makes typographic/orthographic 7 | adjustments on Korean text. See the [website][Seonbi] for details.* 8 | 9 | This directory contains a simple client library which manages and communicates 10 | with Seonbi HTTP API server. The `transform()` function and `Seonbi` class 11 | automatically downloads the Seonbi executable binary and runs the server under 12 | the hood. 
13 | 14 | Here's an example code for one-shot transformation: 15 | 16 | ~~~~ typescript 17 | import { transform } from "https://deno.land/x/seonbi/mod.ts"; 18 | 19 | const input = "디노를 通해 쓰는 선비"; 20 | const output = transform(input); 21 | console.log(output); // 디노를 통해 쓰는 선비 22 | ~~~~ 23 | 24 | When there are multiple inputs to transform, makes a `Seonbi` instance and 25 | call its `transform()` method multiple times so that the server subprocess 26 | are not spawned needlessly more than once: 27 | 28 | 29 | ~~~~ typescript 30 | import { Seonbi } from "https://deno.land/x/seonbi/mod.ts"; 31 | 32 | const inputs = [ 33 | "序詩", 34 | "看板 없는 거리", 35 | "太初의 아침", 36 | "무서운 時間", 37 | "눈 오는 地圖", 38 | "별 헤는 밤", 39 | "슬픈 族屬", 40 | ]; 41 | const seonbi = new Seonbi(); 42 | const outputs = await Promise.all(inputs.map(input => seonbi.transform(input))); 43 | console.log(outputs); 44 | /* 45 | [ 46 | "서시", 47 | "간판 없는 거리", 48 | "태초의 아침", 49 | "무서운 시간", 50 | "눈 오는 지도", 51 | "별 헤는 밤", 52 | "슬픈 족속", 53 | ] 54 | */ 55 | ~~~~ 56 | 57 | [Seonbi]: https://github.com/dahlia/seonbi 58 | [Tag badge]: https://img.shields.io/github/v/tag/dahlia/seonbi 59 | [Deno module]: https://deno.land/x/seonbi 60 | -------------------------------------------------------------------------------- /scripts/Bundle-RequiredDlls.ps1: -------------------------------------------------------------------------------- 1 | [CmdletBinding()] param () 2 | 3 | Set-Variable ObjdumpPath -Option Constant -Value (stack path --compiler-bin ` 4 | | Split-Path -Parent ` 5 | | Join-Path -ChildPath "mingw" -AdditionalChildPath "bin", "objdump.exe") 6 | 7 | function Get-RequiredDlls { 8 | [OutputType([System.IO.FileInfo[]])] 9 | param ( 10 | [Parameter(Mandatory)] 11 | [System.IO.FileInfo]$ObjectPath, 12 | [Parameter(Mandatory)] 13 | [System.IO.FileInfo]$LibraryPath 14 | ) 15 | $dlls = & $ObjdumpPath -p $ObjectPath ` 16 | | Select-String "^`tDLL Name: (.*?`.[Dd][Ll]{2})$" -CaseSensitive ` 17 | | ForEach-Object { $_.Matches.Groups[1].Value } 18 | $dllsToBundle = Get-ChildItem -Filter *.dll -Recurse $LibraryPath ` 19 | | Where-Object { $dlls -contains $_.Name } 20 | if ($null -eq $dllsToBundle -or $dllsToBundle.Length -lt 1) { 21 | return @() 22 | } elseif ($dllsToBundle.GetType() -eq [System.IO.FileInfo]) { 23 | $dllsToBundle = @($dllsToBundle) 24 | } 25 | $dependencies = @() 26 | foreach ($dll in $dllsToBundle) { 27 | $dependencies += Get-RequiredDlls $dll $LibraryPath 28 | } 29 | if ($dependencies.Length -gt 0) { 30 | $dllsToBundle += $dependencies 31 | } 32 | $dllsToBundle = $dllsToBundle | Select-Object -Unique 33 | if ($dllsToBundle.GetType() -eq [System.IO.FileInfo]) { 34 | return @($dllsToBundle) 35 | } 36 | return $dllsToBundle 37 | } 38 | 39 | $localBinDir = stack path --local-install-root | Join-Path -ChildPath "bin" 40 | $objectPaths = Get-ChildItem -Filter *.exe $localBinDir 41 | $libraryPath = stack path --compiler-bin | Split-Path -Parent 42 | 43 | foreach ($obj in $objectPaths) { 44 | Get-RequiredDlls $obj $libraryPath | ForEach-Object { 45 | Write-Verbose $_ 46 | Copy-Item $_ $localBinDir 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /scripts/ko-kr-stdict/README.rst: -------------------------------------------------------------------------------- 1 | Extract hanja words from *Standard Korean Language Dictionary* (標準國語大辭典) 2 | =============================================================================== 3 | 4 | This Python script extracts Sino-Korean words from *Standard Korean 
Language 5 | Dictionary* (標準國語大辭典) published by NIKL (國立國語院) of South Korea. 6 | 7 | First of all, this script requires Python 3.6 or higher. Though it might work 8 | on older versions, I've never tested. I'm sure it won't work on Python 2. 9 | It also works well with PyPy3.5 v6.0 or higher, and is even about 2 times 10 | faster than CPython --- so I recommend PyPy more than CPython. 11 | 12 | Note that this script does not depend on any other than the Python standard 13 | library. 14 | 15 | NIKL has distributed *Standard Korean Language Dictionary* under CC BY-SA 16 | `since 11th March, 2019`__. The data can be downloaded from the 17 | `Standard Korean Language Dictionary`__ website --- although this website 18 | does not have English version and you need to make an account to download 19 | the data. Or, in short, you could download using ``curl`` in one-shot:: 20 | 21 | # Works as of January 2025. 22 | curl \ 23 | -LJ \ 24 | -X POST \ 25 | -F link_key=1404371 \ 26 | -F pageUnit=10 \ 27 | -F pageIndex=1 \ 28 | -o stdict.zip \ 29 | https://stdict.korean.go.kr/common/download.do 30 | 31 | The data is contained by a *.zip* archive, and if you extract it there are 32 | several XML data files. This script reads the *.zip* archive (not *.xml* files) 33 | and then prints the result in the TSV format that Seonbi can interpret:: 34 | 35 | ./main.py stdict.zip | sort > kr-stdict.tsv 36 | 37 | __ https://stdict.korean.go.kr/notice/noticeView.do?board_no=1129 38 | __ https://stdict.korean.go.kr/ 39 | -------------------------------------------------------------------------------- /demo/github-corner.html: -------------------------------------------------------------------------------- 1 | 4 | 33 | 34 | 46 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Unihan/KHangulSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Unihan.KHangulSpec (spec) where 4 | 5 | import Data.Either 6 | 7 | import Data.Aeson 8 | import Data.Map.Strict 9 | import Test.Hspec 10 | 11 | import Text.Seonbi.Unihan.KHangul 12 | 13 | spec :: Spec 14 | spec = do 15 | describe "kHangulData'" $ 16 | it "should be loaded" $ 17 | kHangulData' `shouldSatisfy` isRight 18 | describe "kHangulData" $ 19 | it "contains Hanja Hangul readings" $ 20 | Data.Map.Strict.lookup '識' kHangulData `shouldBe` Just 21 | [ ('식', HanjaReadingCitation KS_X_1001 [Education]) 22 | , ('지', HanjaReadingCitation KS_X_1001 [PersonalName]) 23 | ] 24 | describe "HanjaReadingCitation" $ 25 | specify "parseJSON" $ do 26 | decode "\"\"" `shouldBe` Just (HanjaReadingCitation NonStandard []) 27 | decode "\"E\"" `shouldBe` Just 28 | (HanjaReadingCitation NonStandard [Education]) 29 | decode "\"N\"" `shouldBe` Just 30 | (HanjaReadingCitation NonStandard [PersonalName]) 31 | decode "\"EN\"" `shouldBe` Just 32 | (HanjaReadingCitation NonStandard [Education, PersonalName]) 33 | decode "\"0\"" `shouldBe` Just (HanjaReadingCitation KS_X_1001 []) 34 | decode "\"1\"" `shouldBe` Just (HanjaReadingCitation KS_X_1002 []) 35 | decode "\"0E\"" `shouldBe` Just 36 | (HanjaReadingCitation KS_X_1001 [Education]) 37 | decode "\"1N\"" `shouldBe` Just 38 | (HanjaReadingCitation KS_X_1002 [PersonalName]) 39 | decode "\"2\"" `shouldBe` (Nothing :: Maybe HanjaReadingCitation) 40 | decode "\"00\"" `shouldBe` (Nothing :: Maybe HanjaReadingCitation) 41 | decode "\"0Z\"" `shouldBe` (Nothing :: Maybe 
HanjaReadingCitation) 42 | decode "0" `shouldBe` (Nothing :: Maybe HanjaReadingCitation) 43 | decode "null" `shouldBe` (Nothing :: Maybe HanjaReadingCitation) 44 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/Wrapper.hs: -------------------------------------------------------------------------------- 1 | module Text.Seonbi.Html.Wrapper 2 | ( isWrappedBy 3 | , isWrappedBy' 4 | , wrap 5 | ) where 6 | 7 | import Text.Seonbi.Html 8 | import Text.Seonbi.Html.TagStack 9 | 10 | -- | Wraps given entities with an element. 11 | wrap :: HtmlTagStack -> HtmlTag -> HtmlRawAttrs -> [HtmlEntity] -> [HtmlEntity] 12 | wrap baseStack tag' attributes entities = (:) 13 | (HtmlStartTag baseStack tag' attributes) 14 | [ e { tagStack = rebase' (tagStack e) } 15 | | e <- entities 16 | ] ++ [HtmlEndTag baseStack tag'] 17 | where 18 | newBaseStack :: HtmlTagStack 19 | newBaseStack = push tag' baseStack 20 | rebase' :: HtmlTagStack -> HtmlTagStack 21 | rebase' = rebase baseStack newBaseStack 22 | 23 | -- | A shortcut to 'isWrappedBy'' of wildcard attributes match. 24 | isWrappedBy :: [HtmlEntity] -> HtmlTag -> Bool 25 | isWrappedBy entities tag' = 26 | isWrappedBy' entities tag' Nothing 27 | 28 | -- | 'True' if the given @['HtmlEntity']@ is wrapped by a tag and attributes. 29 | -- E.g.: 30 | -- 31 | -- >>> :set -XOverloadedLists 32 | -- >>> :set -XOverloadedStrings 33 | -- >>> :{ 34 | -- let entities = 35 | -- [ HtmlStartTag [] Em " id=foo" 36 | -- , HtmlText [Em] "Hello" 37 | -- , HtmlEndTag [] Em 38 | -- ] :: [HtmlEntity] 39 | -- :} 40 | -- 41 | -- >>> isWrappedBy' entities Em $ Just " id=foo" 42 | -- True 43 | -- >>> isWrappedBy' entities Div $ Just " id=foo" 44 | -- False 45 | -- >>> isWrappedBy' entities Em $ Just " id=wrong" 46 | -- False 47 | -- 48 | -- In order to match to any attributes (wildcard match), give 'Nothing' to 49 | -- the third argument: 50 | -- 51 | -- >>> isWrappedBy' entities Em Nothing 52 | -- True 53 | -- >>> isWrappedBy' entities Span Nothing 54 | -- False 55 | -- 56 | -- Or you can use 'isWrappedBy' function which is a shortcut for that. 
57 | isWrappedBy' :: [HtmlEntity] -> HtmlTag -> Maybe HtmlRawAttrs -> Bool 58 | isWrappedBy' entities@(HtmlStartTag s t a : _) tag' attributes = 59 | case Prelude.last entities of 60 | HtmlEndTag s' t' -> 61 | t == tag' && t' == tag' && s == s' && maybe True (== a) attributes 62 | _ -> 63 | False 64 | isWrappedBy' _ _ _ = False 65 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Html/WrapperSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.WrapperSpec (spec) where 4 | 5 | import Test.Hspec 6 | 7 | import Text.Seonbi.Html.Entity 8 | import Text.Seonbi.Html.Tag 9 | import Text.Seonbi.Html.Wrapper 10 | 11 | spec :: Spec 12 | spec = 13 | specify "wrap" $ 14 | wrap [Div, Article] BlockQuote " class=\"q\"" 15 | [ HtmlStartTag 16 | { tagStack = [Div, Article] 17 | , tag = P 18 | , rawAttributes = "" 19 | } 20 | , HtmlText { tagStack = [Div, Article, P], rawText = "foo" } 21 | , HtmlStartTag 22 | { tagStack = [Div, Article, P] 23 | , tag = Em 24 | , rawAttributes = "" 25 | } 26 | , HtmlCdata { tagStack = [Div, Article, P, Em], text = "bar" } 27 | , HtmlEndTag { tagStack = [Div, Article, P], tag = Em } 28 | , HtmlComment { tagStack = [Div, Article, P], comment = " baz " } 29 | , HtmlEndTag { tagStack = [Div, Article], tag = P } 30 | ] `shouldBe` 31 | [ HtmlStartTag 32 | { tagStack = [Div, Article] 33 | , tag = BlockQuote 34 | , rawAttributes = " class=\"q\"" 35 | } 36 | , HtmlStartTag 37 | { tagStack = [Div, Article, BlockQuote] 38 | , tag = P 39 | , rawAttributes = "" 40 | } 41 | , HtmlText 42 | { tagStack = [Div, Article, BlockQuote, P] 43 | , rawText = "foo" 44 | } 45 | , HtmlStartTag 46 | { tagStack = [Div, Article, BlockQuote, P] 47 | , tag = Em 48 | , rawAttributes = "" 49 | } 50 | , HtmlCdata 51 | { tagStack = [Div, Article, BlockQuote, P, Em] 52 | , text = "bar" 53 | } 54 | , HtmlEndTag { tagStack = [Div, Article, BlockQuote, P], tag = Em } 55 | , HtmlComment 56 | { tagStack = [Div, Article, BlockQuote, P] 57 | , comment = " baz " 58 | } 59 | , HtmlEndTag { tagStack = [Div, Article, BlockQuote], tag = P } 60 | , HtmlEndTag { tagStack = [Div, Article], tag = BlockQuote } 61 | ] 62 | -------------------------------------------------------------------------------- /test/Text/Seonbi/ContentTypesSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Text.Seonbi.ContentTypesSpec (spec) where 3 | 4 | import Data.Text 5 | import qualified Data.Text.Lazy as LT 6 | import Data.Text.Lazy.Builder 7 | 8 | import Test.Hspec 9 | 10 | import Text.Seonbi.Html 11 | import Text.Seonbi.ContentTypes 12 | import qualified HTMLEntities.Builder 13 | import HTMLEntities.Decoder 14 | 15 | textReverser :: (Monad m, MonadFail m) => HtmlTransformer m 16 | textReverser entities = 17 | return $ reverseText <$> entities 18 | where 19 | reverseText :: HtmlEntity -> HtmlEntity 20 | reverseText e@HtmlText { rawText = t } = 21 | e { rawText = encode $ Data.Text.reverse $ decode t } 22 | reverseText e@HtmlCdata { text = t } = 23 | e { text = Data.Text.reverse t } 24 | reverseText e = 25 | e 26 | decode :: Text -> Text 27 | decode = LT.toStrict . toLazyText . htmlEncodedText 28 | encode :: Text -> Text 29 | encode = LT.toStrict . toLazyText . 
HTMLEntities.Builder.text 30 | 31 | spec :: Spec 32 | spec = do 33 | specify "asHtmlTransformer" $ do 34 | r <- asHtmlTransformer textReverser "

foo bar
baz

" 35 | r `shouldBe` "

oofrab
zab

" 36 | specify "asXhtmlTransformer" $ do 37 | r <- asXhtmlTransformer textReverser "

foo bar
baz

" 38 | r `shouldBe` "

oofrab
zab

" 39 | specify "asPlainTextTransformer" $ do 40 | r <- asPlainTextTransformer textReverser 41 | "

foo bar
baz

" 42 | r `shouldBe` ">p/rb<>me/me< oof>p<" 43 | specify "asCommonMarkTransformer" $ do 44 | r <- asCommonMarkTransformer textReverser 45 | "# Foo\n\nBar *Baz*\nQux\n\n> Quote tag\n" 46 | r `shouldBe` "# ooF\n\n raB*zaB*\nxuQ\n\n> etouQgat\n" 47 | specify "transformWithContentType" $ do 48 | let input = "*foo* bar
" 49 | h <- transformWithContentType "text/html" textReverser input 50 | h `shouldBe` " *oof*rab
" 51 | x <- transformWithContentType "application/xhtml+xml" textReverser input 52 | x `shouldBe` " *oof*rab
" 53 | p <- transformWithContentType "text/plain" textReverser input 54 | p `shouldBe` ">rb<>me/me< *oof*" 55 | m <- transformWithContentType "text/markdown" textReverser input 56 | m `shouldBe` "*oof* rab
\n" 57 | -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | `dahlia/seonbi/setup`: GitHub action to install [Seonbi] 2 | ======================================================== 3 | 4 | This action installs executables `seonbi` and `seonbi-api` during GitHub Actions 5 | workflow: 6 | 7 | ~~~ yaml 8 | - uses: dahlia/seonbi/setup@main 9 | ~~~ 10 | 11 | It installs the latest version of Seonbi by default. To explicitly specify 12 | the version to install, use the `seonbi-version` option:[^1] 13 | 14 | ~~~ yaml 15 | - uses: dahlia/seonbi/setup@main 16 | with: 17 | seonbi-version: 0.3.* 18 | ~~~ 19 | 20 | The wildcard in the version number chooses the latest released version. 21 | Also, `seonbi-version: 0.*` is equivalent to `seonbi-version: 0.*.*`, 22 | and `seonbi-version: *` is equivalent to `seonbi-version: *.*.*`. 23 | Therefore, `seonbi-version: *` means the latest version. 24 | 25 | to get the exact version number of the installed Seonbi from the later steps, 26 | use the `seonbi-version` output: 27 | 28 | ~~~ yaml 29 | - id: setup-seonbi 30 | uses: dahlia/seonbi/setup@main 31 | with: 32 | seonbi-version: * 33 | - run: | 34 | echo "Installed seonbi version:" \ 35 | "${{ steps.setup-seonbi.outputs.seonbi-version }}" 36 | shell: bash 37 | ~~~ 38 | 39 | To prevent the installed Seonbi from being added to the `PATH`, turn off 40 | the `add-to-path` option (which is turned on by default) and use 41 | the `seonbi-path` and `seonbi-api-path` outputs instead: 42 | 43 | ~~~ yaml 44 | - id: setup-seonbi 45 | uses: dahlia/seonbi/setup@main 46 | with: 47 | add-to-path: false 48 | - run: ${{ steps.setup-seonbi.outputs.seonbi-path }} README.md 49 | shell: bash 50 | ~~~ 51 | 52 | [^1]: Note that the action version and the Seonbi versions are distinct. 53 | However, it's recommended to match major and minor versions for both. 54 | 55 | [Seonbi]: https://github.com/dahlia/seonbi 56 | 57 | 58 | Input parameters 59 | ---------------- 60 | 61 | - `seonbi-version`: Version of executable binaries `seonbi` and `seonbi-api` 62 | to install. Note that asterisks can be used to choose the latest version, 63 | e.g., `1.2.*`, `1.*`, `*`. (Default: `*`.) 64 | - `add-to-path`: Whether to add the installed `seonbi` and `seonbi-api` to 65 | the `PATH`. Turned on by default. (Default: `true`.) 66 | 67 | 68 | Output parameters 69 | ----------------- 70 | 71 | - `seonbi-version`: Exact version number of the installed Seonbi. 72 | - `seonbi-path`: Absolute path of the installed executable `seonbi`. 73 | - `seonbi-api-path`: Absolute path of the installed executable `seonbi-api`. 
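The `seonbi-api-path` output can be used the same way to run the bundled HTTP API
server during a job. The step below is only a sketch under a couple of assumptions:
that `seonbi-api` starts without any arguments and that it listens on port 3800 by
default (inferred from the `fly.toml` in this repository); adjust it to the release
you actually install.

~~~ yaml
- id: setup-seonbi
  uses: dahlia/seonbi/setup@main
  with:
    add-to-path: false
- run: |
    # Start the API server in the background using the output path.
    "${{ steps.setup-seonbi.outputs.seonbi-api-path }}" &
    api_pid=$!
    sleep 2  # give the server a moment to come up
    # ... send requests to http://localhost:3800/ here (port assumed from fly.toml) ...
    kill "$api_pid"
  shell: bash
~~~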
74 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Html/PreservationSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.PreservationSpec (spec) where 4 | 5 | import Test.Hspec 6 | 7 | import Text.Seonbi.Html.Entity 8 | import Text.Seonbi.Html.Preservation 9 | import Text.Seonbi.Html.Tag 10 | 11 | spec :: Spec 12 | spec = do 13 | specify "isPreservedTag" $ do 14 | P `shouldNotSatisfy` isPreservedTag 15 | Em `shouldNotSatisfy` isPreservedTag 16 | Title `shouldNotSatisfy` isPreservedTag 17 | Canvas `shouldSatisfy` isPreservedTag 18 | Code `shouldSatisfy` isPreservedTag 19 | Kbd `shouldSatisfy` isPreservedTag 20 | Pre `shouldSatisfy` isPreservedTag 21 | Script `shouldSatisfy` isPreservedTag 22 | Style `shouldSatisfy` isPreservedTag 23 | Template `shouldSatisfy` isPreservedTag 24 | TextArea `shouldSatisfy` isPreservedTag 25 | specify "isPreservedTagStack" $ do 26 | [] `shouldNotSatisfy` isPreservedTagStack 27 | [P, Em] `shouldNotSatisfy` isPreservedTagStack 28 | [Html, Head, Title] `shouldNotSatisfy` isPreservedTagStack 29 | [Div, Script] `shouldSatisfy` isPreservedTagStack 30 | [Html, Head, Style] `shouldSatisfy` isPreservedTagStack 31 | [P, Kbd] `shouldSatisfy` isPreservedTagStack 32 | [Pre, Code] `shouldSatisfy` isPreservedTagStack 33 | [Template, P] `shouldSatisfy` isPreservedTagStack 34 | specify "shouldBePreserved" $ do 35 | HtmlStartTag [] P "" `shouldNotSatisfy` isPreservedEntity 36 | HtmlEndTag [] P `shouldNotSatisfy` isPreservedEntity 37 | HtmlText [] "" `shouldNotSatisfy` isPreservedEntity 38 | HtmlCdata [] "" `shouldNotSatisfy` isPreservedEntity 39 | HtmlComment [] " ... " `shouldSatisfy` isPreservedEntity 40 | HtmlStartTag [P] Em "" `shouldNotSatisfy` isPreservedEntity 41 | HtmlEndTag [P] Em `shouldNotSatisfy` isPreservedEntity 42 | HtmlText [P] "" `shouldNotSatisfy` isPreservedEntity 43 | HtmlCdata [P] "" `shouldNotSatisfy` isPreservedEntity 44 | HtmlComment [P] " ... " `shouldSatisfy` isPreservedEntity 45 | HtmlStartTag [P] Code "" `shouldSatisfy` isPreservedEntity 46 | HtmlEndTag [P] Code `shouldSatisfy` isPreservedEntity 47 | HtmlStartTag [Pre] Span "" `shouldSatisfy` isPreservedEntity 48 | HtmlEndTag [Pre] Span `shouldSatisfy` isPreservedEntity 49 | HtmlText [Pre] "" `shouldSatisfy` isPreservedEntity 50 | HtmlCdata [Pre] "" `shouldSatisfy` isPreservedEntity 51 | HtmlComment [Pre] " ... " `shouldSatisfy` isPreservedEntity 52 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Hangul.hs: -------------------------------------------------------------------------------- 1 | module Text.Seonbi.Hangul 2 | ( JamoTriple 3 | , fromJamoTriple 4 | , isHangulSyllable 5 | , toJamoTriple 6 | ) where 7 | 8 | -- $setup 9 | -- >>> import qualified Text.Show.Unicode 10 | -- >>> :set -interactive-print=Text.Show.Unicode.uprint 11 | 12 | -- | A triple of an initial consonant, a vowel, and an optional final consonant. 13 | type JamoTriple = (Char, Char, Maybe Char) 14 | 15 | -- | Checks if a character is a hangul letter and a complete syllable. 
16 | -- 17 | -- >>> isHangulSyllable '가' 18 | -- True 19 | -- >>> isHangulSyllable 'ㄱ' 20 | -- False 21 | isHangulSyllable :: Char -> Bool 22 | isHangulSyllable c = 23 | c >= '\xac00' && c <= '\xd7a3'; 24 | 25 | syllableBase :: Int 26 | syllableBase = 0xac00 27 | 28 | initialBase :: Int 29 | initialBase = 0x1100 30 | 31 | vowelBase :: Int 32 | vowelBase = 0x1161 33 | 34 | finalBase :: Int 35 | finalBase = 0x11a7 36 | 37 | vowelCount :: Int 38 | vowelCount = 21; 39 | 40 | finalCount :: Int 41 | finalCount = 28; 42 | 43 | -- | Takes a complete hangul syllable apart into consonants and a vowel. 44 | -- Returns 'Nothing' for non-hangul letters. 45 | -- 46 | -- >>> toJamoTriple '가' 47 | -- Just ('ᄀ','ᅡ',Nothing) 48 | -- >>> toJamoTriple '글' 49 | -- Just ('ᄀ','ᅳ',Just 'ᆯ') 50 | -- >>> toJamoTriple 'A' 51 | -- Nothing 52 | toJamoTriple :: Char -> Maybe JamoTriple 53 | toJamoTriple c 54 | | isHangulSyllable c = Just 55 | ( toEnum $ initialBase + ((syllable `div` finalCount) `div` vowelCount) 56 | , toEnum $ vowelBase + ((syllable `div` finalCount) `mod` vowelCount) 57 | , case syllable `mod` finalCount of 58 | 0 -> Nothing 59 | f -> Just $ toEnum (finalBase + f) 60 | ) 61 | | otherwise = Nothing 62 | where 63 | syllable :: Int 64 | syllable = fromEnum c - syllableBase 65 | 66 | -- | Composes hangul jamo triple into a hangul syllable. 67 | -- 68 | -- >>> fromJamoTriple ('ᄀ', 'ᅡ', Nothing) 69 | -- Just '가' 70 | -- >>> fromJamoTriple ('ᄀ', 'ᅳ', Just 'ᆯ') 71 | -- Just '글' 72 | fromJamoTriple :: JamoTriple -> Maybe Char 73 | fromJamoTriple (initial, vowel, final) 74 | | initialIndex < 0 = Nothing 75 | | initialIndex > 18 = Nothing 76 | | vowelIndex < 0 = Nothing 77 | | vowelIndex > 20 = Nothing 78 | | finalIndex < 0 = Nothing 79 | | finalIndex > 27 = Nothing 80 | | otherwise = Just $ toEnum $ syllableBase + 81 | (initialIndex * vowelCount + vowelIndex) * finalCount + finalIndex 82 | where 83 | initialIndex :: Int 84 | initialIndex = fromEnum initial - initialBase 85 | vowelIndex :: Int 86 | vowelIndex = fromEnum vowel - vowelBase 87 | finalIndex :: Int 88 | finalIndex = maybe 0 (\ f -> fromEnum f - finalBase) final 89 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/Entity.hs: -------------------------------------------------------------------------------- 1 | module Text.Seonbi.Html.Entity 2 | ( HtmlEntity (..) 3 | , HtmlRawAttrs 4 | ) where 5 | 6 | import Data.Text 7 | 8 | import Text.Seonbi.Html.Tag (HtmlTag) 9 | import Text.Seonbi.Html.TagStack (HtmlTagStack) 10 | 11 | -- | All element attributes in a string. 12 | type HtmlRawAttrs = Text 13 | 14 | -- | An event entity emitted by 'scanHtml'. 15 | data HtmlEntity 16 | -- | Represent a token which [opens an HTML element 17 | -- ](https://www.w3.org/TR/html5/syntax.html#start-tags). 18 | -- 19 | -- Note that 'rawAttributes' is not a parsed and structured data but a raw 20 | -- string as its name implies. 21 | -- 22 | -- The 'tagStack' doesn't include the corresponding opened 'tag'. 23 | = HtmlStartTag 24 | { -- | A stack of 'HtmlTag's that represents a hierarchy of a currently 25 | -- parsing position in an 'HtmlTag' tree. 26 | tagStack :: HtmlTagStack 27 | , tag :: HtmlTag 28 | , rawAttributes :: HtmlRawAttrs 29 | } 30 | -- | Represent a token which [closes an HTML element 31 | -- ](https://www.w3.org/TR/html5/syntax.html#end-tags). 32 | -- The 'tagStack' doesn't include the corresponding closed 'tag'. 
33 | | HtmlEndTag 34 | { -- | A stack of 'HtmlTag's that represents a hierarchy of a currently 35 | -- parsing position in an 'HtmlTag' tree. 36 | tagStack :: HtmlTagStack 37 | , tag :: HtmlTag 38 | } 39 | -- | Represent a token of a text node. Note that 'rawText' is not a parsed 40 | -- and structured data but a raw string as its name implies. There can be 41 | -- continuously more than one 'HtmlText' values can be emitted even if they 42 | -- are not separated by element openings or closings. 43 | | HtmlText 44 | { -- | A stack of 'HtmlTag's that represents a hierarchy of a currently 45 | -- parsing position in an 'HtmlTag' tree. 46 | tagStack :: HtmlTagStack 47 | , rawText :: Text 48 | } 49 | -- | Represent a token of a 50 | -- [CDATA section](https://www.w3.org/TR/html5/syntax.html#cdata-sections). 51 | | HtmlCdata 52 | { -- | A stack of 'HtmlTag's that represents a hierarchy of a currently 53 | -- parsing position in an 'HtmlTag' tree. 54 | tagStack :: HtmlTagStack 55 | , text :: Text 56 | } 57 | -- | Represent a token of an 58 | -- [HTML comment](https://www.w3.org/TR/html5/syntax.html#comments). 59 | | HtmlComment 60 | { -- | A stack of 'HtmlTag's that represents a hierarchy of a currently 61 | -- parsing position in an 'HtmlTag' tree. 62 | tagStack :: HtmlTagStack 63 | , comment :: Text 64 | } 65 | deriving (Eq, Ord, Show) 66 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | import Control.Monad 3 | import Prelude hiding (concat) 4 | import System.IO (Handle, IOMode (..), hClose, hSetEncoding, utf8, withFile) 5 | 6 | import Codec.Archive.Zip 7 | import Data.ByteString.Lazy (ByteString, hPut) 8 | import Data.Text 9 | import Data.Text.IO (hGetLine, hPutStrLn) 10 | import Distribution.Simple 11 | import Network.HTTP.Client 12 | import System.Directory 13 | import System.FilePath 14 | import System.IO.Temp 15 | 16 | unihanUrl :: String 17 | unihanUrl = "http://ftp.unicode.org/Public/11.0.0/ucd/Unihan.zip" 18 | 19 | kHangulPath :: FilePath 20 | kHangulPath = "src" "Text" "Seonbi" "kHangul.txt" 21 | 22 | main :: IO () 23 | main = do 24 | exist <- doesFileExist kHangulPath 25 | unless exist $ do 26 | data' <- downloadUnihan 27 | extractUnihanReadings data' $ \ txtPath -> do 28 | values <- withFile txtPath ReadMode (extractProp "kHangul") 29 | withFile kHangulPath WriteMode $ \ handle -> do 30 | hSetEncoding handle utf8 31 | forM_ values $ \ (char, value) -> 32 | hPutStrLn handle $ concat [char, "\t", value] 33 | defaultMain 34 | 35 | extractProp :: Text -> Handle -> IO [(Text, Text)] 36 | extractProp property handle = do 37 | hSetEncoding handle utf8 38 | line <- hGetLine handle 39 | case line of 40 | "" -> 41 | return [] 42 | line' -> 43 | case breakOn "\t" line' of 44 | (_, "") -> 45 | extractProp property handle 46 | (char, rest) 47 | | "U+" `isPrefixOf` char && "\t" `isPrefixOf` rest -> 48 | case breakOn "\t" $ Data.Text.tail rest of 49 | (_, "") -> 50 | extractProp property handle 51 | (prop, value) | prop == property -> 52 | ((char, value) :) <$> extractProp property handle 53 | _ -> 54 | extractProp property handle 55 | _ -> 56 | extractProp property handle 57 | 58 | 59 | extractUnihanReadings :: ByteString -> (FilePath -> IO a) -> IO a 60 | extractUnihanReadings data' callback = 61 | withSystemTempFile "Unihan.zip" $ \ zipPath handle -> do 62 | hPut handle data' 63 | hClose handle 64 | let entryName = 
"Unihan_Readings.txt" 65 | withSystemTempFile entryName $ \ txtPath handle' -> do 66 | hClose handle' 67 | sel <- mkEntrySelector entryName 68 | withArchive zipPath (saveEntry sel txtPath) 69 | callback txtPath 70 | 71 | downloadUnihan :: IO ByteString 72 | downloadUnihan = do 73 | mgr <- newManager defaultManagerSettings 74 | req <- parseRequest unihanUrl 75 | res <- httpLbs req mgr 76 | return $ responseBody res 77 | -------------------------------------------------------------------------------- /test/Text/Seonbi/FacadeSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.FacadeSpec (spec) where 4 | 5 | import Control.Monad 6 | import Data.Maybe (fromJust) 7 | 8 | import Data.Algorithm.Diff 9 | import Data.Text.Lazy 10 | import Data.Text.Lazy.IO 11 | import System.Directory 12 | import System.FilePath 13 | import Test.Hspec 14 | 15 | import Text.Seonbi.Facade 16 | 17 | dataDirPath :: FilePath 18 | dataDirPath = "test" "data" 19 | 20 | inputExtension :: String 21 | inputExtension = ".ko-Kore.html" 22 | 23 | outputExtensions :: Monad m => [(String, Configuration m a)] 24 | outputExtensions = 25 | [ (".ko-KR.html", ko_KR) 26 | , (".ko-KP.html", ko_KP) 27 | ] 28 | 29 | shouldHaveSameText :: HasCallStack => Text -> Text -> Expectation 30 | actual `shouldHaveSameText` expected = 31 | unless (actual == expected) (expectationFailure msg) 32 | where 33 | expectedLines :: [Text] 34 | expectedLines = Data.Text.Lazy.lines expected 35 | actualLines :: [Text] 36 | actualLines = Data.Text.Lazy.lines actual 37 | diffLines :: [Diff Text] 38 | diffLines = getDiff expectedLines actualLines 39 | diff :: Text 40 | diff = Data.Text.Lazy.unlines 41 | [ case d of 42 | First line -> "- " <> line 43 | Second line -> "+ " <> line 44 | Both line _ -> " " <> line 45 | | d <- diffLines 46 | ] 47 | msg :: String 48 | msg = "Two values are not equal:\n\n--- expected\n+++ actual\n\n" ++ 49 | unpack diff 50 | 51 | spec :: Spec 52 | spec = do 53 | testData <- runIO $ do 54 | files <- listDirectory dataDirPath 55 | let inputFiles = [f | f <- files, inputExtension `isExtensionOf` f] 56 | testFiles <- filterM 57 | (\(_, o, _) -> doesFileExist (dataDirPath o)) 58 | [ (i, dropExtension i -<.> oExt, oCfg) 59 | | i <- inputFiles 60 | , (oExt, oCfg) <- outputExtensions 61 | ] 62 | forM testFiles $ \ (input, output, cfg) -> do 63 | inputData <- Data.Text.Lazy.IO.readFile (dataDirPath input) 64 | outputData <- Data.Text.Lazy.IO.readFile (dataDirPath output) 65 | return (input, output, inputData, outputData, cfg) 66 | describe "transformHtmlLazyText" $ 67 | forM_ testData $ \ (iname, oname, input, output, cfg) -> 68 | specify (iname ++ " -> " ++ oname) $ do 69 | let noOpResult = fromJust $ transformHtmlLazyText noOp input 70 | noOpResult `shouldHaveSameText` input 71 | let cfgResult = fromJust $ transformHtmlLazyText cfg input 72 | cfgResult `shouldHaveSameText` output 73 | where 74 | noOp :: Monad m => Configuration m a 75 | noOp = Configuration 76 | { quote = Nothing 77 | , cite = Nothing 78 | , arrow = Nothing 79 | , ellipsis = False 80 | , emDash = False 81 | , stop = Nothing 82 | , hanja = Nothing 83 | , contentType = "text/html" 84 | , debugLogger = Nothing 85 | } 86 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/TextNormalizer.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE 
LambdaCase #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.TextNormalizer 4 | ( escapeHtmlEntities 5 | , normalizeCdata 6 | , normalizeText 7 | ) where 8 | 9 | import Control.Exception 10 | import Data.List 11 | 12 | import Data.Text hiding (groupBy, map) 13 | 14 | import Text.Seonbi.Html.Entity 15 | 16 | -- | As 'scanHtml' may emit two or more continuous 'HtmlText' fragments even 17 | -- if these can be represented as only one 'HtmlText' fragment, it makes 18 | -- postprocessing hard. 19 | -- 20 | -- The 'normalizeText' function concatenates such continuous 'HtmlText' 21 | -- fragments into one if possible so that postprocessing can be easy: 22 | -- 23 | -- >>> :set -XOverloadedStrings -XOverloadedLists 24 | -- >>> normalizeText [HtmlText [] "Hello, ", HtmlText [] "world!"] 25 | -- [HtmlText {tagStack = fromList [], rawText = "Hello, world!"}] 26 | -- 27 | -- It also transforms all 'HtmlCdata' fragments into an 'HtmlText' together. 28 | -- 29 | -- >>> :{ 30 | -- normalizeText [ HtmlText [] "foo " 31 | -- , HtmlCdata [] "", HtmlText [] " baz!" 32 | -- ] 33 | -- :} 34 | -- [HtmlText {tagStack = fromList [], rawText = "foo <bar> baz!"}] 35 | normalizeText :: [HtmlEntity] -> [HtmlEntity] 36 | normalizeText fragments = 37 | [ case map normalizeCdata frags of 38 | [f] -> 39 | f 40 | frags'@(HtmlText { tagStack = s }:_) -> 41 | HtmlText 42 | { tagStack = s 43 | , rawText = Data.Text.concat $ map rawText frags' 44 | } 45 | frags' -> 46 | throw $ AssertionFailed 47 | ("Unexpected error occured; grouping does not work well: " ++ 48 | show frags') 49 | | frags <- groupBy isSibling fragments 50 | ] 51 | where 52 | isSibling :: HtmlEntity -> HtmlEntity -> Bool 53 | isSibling HtmlText { tagStack = a } HtmlText { tagStack = b } = a == b 54 | isSibling HtmlText { tagStack = a } HtmlCdata { tagStack = b } = a == b 55 | isSibling HtmlCdata { tagStack = a } HtmlText { tagStack = b } = a == b 56 | isSibling HtmlCdata { tagStack = a } HtmlCdata { tagStack = b } = a == b 57 | isSibling _ _ = False 58 | 59 | -- | Transform a given 'HtmlCdata' node into an equivalent 'HtmlText' node. 60 | -- 61 | -- >>> import Text.Seonbi.Html.Tag 62 | -- >>> normalizeCdata HtmlCdata { tagStack = [P], text = "

" } 63 | -- HtmlText {tagStack = fromList [P], rawText = "<p id="foo">"} 64 | normalizeCdata :: HtmlEntity -> HtmlEntity 65 | normalizeCdata HtmlCdata { tagStack = s, text = t } = 66 | HtmlText { tagStack = s, rawText = escapeHtmlEntities t } 67 | normalizeCdata entity = entity 68 | 69 | -- | Escape special (control) characters into corresponding character entities 70 | -- in the given HTML text. 71 | -- 72 | -- >>> escapeHtmlEntities "" 73 | -- "<foo & "bar">" 74 | escapeHtmlEntities :: Text -> Text 75 | escapeHtmlEntities = 76 | Data.Text.concatMap $ \ case 77 | '<' -> "<" 78 | '>' -> ">" 79 | '&' -> "&" 80 | '"' -> """ 81 | c -> Data.Text.singleton c 82 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Html/TextNormalizerSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.TextNormalizerSpec (spec) where 4 | 5 | import Control.Monad 6 | 7 | import Test.Hspec 8 | 9 | import Text.Seonbi.Html.Entity 10 | import Text.Seonbi.Html.Tag 11 | import Text.Seonbi.Html.TagStack 12 | import Text.Seonbi.Html.TextNormalizer 13 | 14 | spec :: Spec 15 | spec = do 16 | specify "normalizeText" $ 17 | normalizeText 18 | [ HtmlText { tagStack = [], rawText = "foo " } 19 | , HtmlText { tagStack = [], rawText = "& bar" } 20 | , HtmlCdata { tagStack = [], text = " & baz " } 21 | , HtmlStartTag { tagStack = [], tag = P, rawAttributes = "" } 22 | , HtmlText { tagStack = [P], rawText = "qux " } 23 | , HtmlCdata { tagStack = [P], text = "& \"quux\"" } 24 | , HtmlEndTag { tagStack = [], tag = P } 25 | , HtmlCdata { tagStack = [], text = " " } 26 | ] `shouldBe` 27 | [ HtmlText { tagStack = [], rawText = "foo & bar & baz " } 28 | , HtmlStartTag { tagStack = [], tag = P, rawAttributes = "" } 29 | , HtmlText 30 | { tagStack = [P] 31 | , rawText = "qux & "quux"" 32 | } 33 | , HtmlEndTag { tagStack = [], tag = P } 34 | , HtmlText { tagStack = [], rawText = " <end>" } 35 | ] 36 | 37 | describe "normalizeCdata" $ do 38 | let s1 = [] :: HtmlTagStack 39 | let s2 = [Div, P] :: HtmlTagStack 40 | specify "HtmlStartTag" $ do 41 | let entity1 = HtmlStartTag 42 | { tagStack = s1 43 | , tag = P 44 | , rawAttributes = "" 45 | } 46 | normalizeCdata entity1 `shouldBe` entity1 47 | let entity2 = HtmlStartTag 48 | { tagStack = s2 49 | , tag = P 50 | , rawAttributes = " class=\"entity2\"" 51 | } 52 | normalizeCdata entity2 `shouldBe` entity2 53 | let stacks = [s1, s2] :: [HtmlTagStack] 54 | forM_ stacks $ \ s -> do 55 | specify ("HtmlEndTag: " ++ show s) $ do 56 | let e = HtmlEndTag { tagStack = s, tag = P } 57 | normalizeCdata e `shouldBe` e 58 | specify ("HtmlText: " ++ show s) $ do 59 | let e = HtmlText { tagStack = s, rawText = "foo & bar" } 60 | normalizeCdata e `shouldBe` e 61 | specify ("HtmlComment: " ++ show s) $ do 62 | let e = HtmlComment { tagStack = s, comment = "foo" } 63 | normalizeCdata e `shouldBe` e 64 | specify ("HtmlCdata: " ++ show s) $ do 65 | let e = HtmlCdata { tagStack = s, text = "

foo & bar

" } 66 | normalizeCdata e `shouldBe` 67 | HtmlText 68 | { tagStack = s 69 | , rawText = "<p>foo & bar</p>" 70 | } 71 | 72 | specify "escapeHtmlEntities" $ do 73 | escapeHtmlEntities "

" `shouldBe` 74 | "<p id="foo">" 75 | escapeHtmlEntities "AT&T" `shouldBe` 76 | "AT&T" 77 | -------------------------------------------------------------------------------- /scripts/deno/test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Configuration, 3 | DEFAULT_CONFIGURATION, 4 | Options, 5 | Seonbi, 6 | transform, 7 | } from "./mod.ts"; 8 | import { assertEquals } from "https://deno.land/std@0.106.0/testing/asserts.ts"; 9 | 10 | const hanjaInParens: Options = { 11 | contentType: "text/html", 12 | quote: "CurvedQuotes", 13 | cite: null, 14 | arrow: null, 15 | ellipsis: false, 16 | emDash: false, 17 | stop: null, 18 | hanja: { 19 | rendering: "HanjaInParentheses", 20 | reading: { 21 | initialSoundLaw: true, 22 | useDictionaries: ["kr-stdict"], 23 | dictionary: {}, 24 | }, 25 | }, 26 | }; 27 | 28 | const customDict: Options = { 29 | ...hanjaInParens, 30 | hanja: { 31 | rendering: "HanjaInParentheses", 32 | reading: { 33 | initialSoundLaw: true, 34 | useDictionaries: [], 35 | dictionary: { "言語": "말", "文字": "글" }, 36 | }, 37 | }, 38 | }; 39 | 40 | let config: Configuration = { 41 | ...DEFAULT_CONFIGURATION, 42 | process: { distType: "nightly" }, 43 | }; 44 | 45 | try { 46 | const binPath = Deno.env.get("SEONBI_API"); 47 | if (binPath != null && "process" in config) config.process = { binPath }; 48 | } catch (e) { 49 | if (!(e instanceof Deno.errors.PermissionDenied)) throw e; 50 | } 51 | 52 | try { 53 | const port = Deno.env.get("SEONBI_API_PORT"); 54 | if (port != null && port.match(/^[0-9]+$/) && "process" in config) { 55 | config.port = parseInt(port); 56 | } 57 | } catch (e) { 58 | if (!(e instanceof Deno.errors.PermissionDenied)) throw e; 59 | } 60 | 61 | try { 62 | const apiUrl = Deno.env.get("SEONBI_API_URL"); 63 | if (apiUrl != null) config = { apiUrl }; 64 | } catch (e) { 65 | if (!(e instanceof Deno.errors.PermissionDenied)) throw e; 66 | } 67 | 68 | Deno.test("transform()", async () => { 69 | const koKr = await transform("

言語와 文字

", config); 70 | assertEquals(koKr, "

언어와 문자

"); 71 | }); 72 | 73 | Deno.test("Seonbi#start()", async () => { 74 | const seonbi = new Seonbi(config); 75 | await seonbi.start(); 76 | try { 77 | for (let i = 0; i < 5; i++) { 78 | try { 79 | const response = await fetch(seonbi.apiUrl); 80 | assertEquals( 81 | { message: "Unsupported method: GET", success: false }, 82 | await response.json(), 83 | ); 84 | break; 85 | } catch (e) { 86 | if ( 87 | !(e instanceof TypeError) || 88 | e.message.indexOf("os error 61") < 0 && 89 | e.message.indexOf("os error 111") < 0 90 | ) { 91 | throw e; 92 | } 93 | 94 | return new Promise((r) => setTimeout(r, 1000)); 95 | } 96 | } 97 | } finally { 98 | await seonbi.stop(); 99 | } 100 | }); 101 | 102 | function withSeonbi(fn: (s: Seonbi) => Promise): () => Promise { 103 | return async () => { 104 | const seonbi = new Seonbi(config); 105 | await seonbi.start(); 106 | try { 107 | await fn(seonbi); 108 | } finally { 109 | await seonbi.stop(); 110 | } 111 | }; 112 | } 113 | 114 | function testWithSeonbi(label: string, fn: (s: Seonbi) => Promise): void { 115 | Deno.test(label, withSeonbi(fn)); 116 | } 117 | 118 | testWithSeonbi("Seonbi#transform()", async (seonbi: Seonbi) => { 119 | assertEquals( 120 | await seonbi.transform("

言語와 文字

"), 121 | "

언어와 문자

", 122 | ); 123 | assertEquals( 124 | await seonbi.transform("

言語와 文字

", hanjaInParens), 125 | "

언어(言語)와 문자(文字)

", 126 | ); 127 | assertEquals( 128 | await seonbi.transform("

言語와 文字

", customDict), 129 | "

말(言語)와 글(文字)

", 130 | ); 131 | }); 132 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Unihan/KHangul.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE LambdaCase #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | {-# LANGUAGE TypeSynonymInstances #-} 7 | module Text.Seonbi.Unihan.KHangul 8 | ( CharacterSet (..) 9 | , HanjaReadings 10 | , HanjaReadingCitation (..) 11 | , KHangulData 12 | , Purpose (..) 13 | , kHangulData 14 | , kHangulData' 15 | ) where 16 | 17 | import Data.Either 18 | 19 | import Data.Aeson 20 | import Data.Attoparsec.Text 21 | import Data.ByteString.Lazy (fromStrict) 22 | import Data.FileEmbed 23 | import Data.Map.Strict 24 | import Data.Set hiding (empty) 25 | import System.FilePath (takeDirectory, ()) 26 | 27 | -- $setup 28 | -- >>> import qualified Text.Show.Unicode 29 | -- >>> :set -interactive-print=Text.Show.Unicode.uprint 30 | 31 | -- | Maps all Hanja characters to their possible readings. 32 | type KHangulData = Map Char HanjaReadings 33 | 34 | -- | All readings of a Hanja character. 35 | type HanjaReadings = Map Char HanjaReadingCitation 36 | 37 | -- | Represents what standard a reading of character belongs to and a purpose 38 | -- of the reading. 39 | data HanjaReadingCitation = 40 | HanjaReadingCitation CharacterSet (Set Purpose) deriving (Eq, Ord, Show) 41 | 42 | -- | Represents character set standards for Korean writing system. 43 | data CharacterSet 44 | -- | KS X 1001 (정보 교환용 부호계). 45 | = KS_X_1001 46 | -- | KS X 1002 (정보 교환용 부호 확장 세트). 47 | | KS_X_1002 48 | -- | Represents that a Hanja character is not included in any Korean 49 | -- character set standards. 50 | | NonStandard 51 | deriving (Eq, Ord, Show) 52 | 53 | -- | Represents purposes of Hanja characters. 54 | data Purpose 55 | -- | Basic Hanja for educational use (漢文敎育用基礎漢字), a subset of 56 | -- Hanja defined in 1972 by a South Korean standard for educational use. 57 | = Education 58 | -- | Hanja for personal names (人名用漢字). 59 | | PersonalName 60 | deriving (Eq, Ord, Show) 61 | 62 | citationParser :: Parser HanjaReadingCitation 63 | citationParser = do 64 | charset' <- option NonStandard charset 65 | purposes <- many' purpose 66 | return $ HanjaReadingCitation charset' $ Data.Set.fromList purposes 67 | where 68 | charset :: Parser CharacterSet 69 | charset = do 70 | d <- digit 71 | case d of 72 | '0' -> return KS_X_1001 73 | '1' -> return KS_X_1002 74 | c -> fail ("Invalid kHangul character set code: " ++ show c) 75 | purpose :: Parser Purpose 76 | purpose = do 77 | l <- letter 78 | case l of 79 | 'E' -> return Education 80 | 'N' -> return PersonalName 81 | c -> fail ("Invalid kHangul purpose code: " ++ show c) 82 | 83 | instance FromJSON HanjaReadingCitation where 84 | parseJSON = 85 | withText "kHangul value (e.g., 0E, 1N, 0EN)" $ \ t -> 86 | case parseOnly (citationParser <* endOfInput) t of 87 | Right cite -> return cite 88 | Left msg -> fail msg 89 | 90 | kHangulData' :: Either String KHangulData 91 | kHangulData' = eitherDecode $ 92 | fromStrict $(embedFile $ takeDirectory __FILE__ "kHangul.json") 93 | 94 | -- | Data that map Hanja characters to their corresponding kHangul entries 95 | -- (i.e., Hanja readings and citations). 
96 | -- 97 | -- >>> import Data.Map.Strict as M 98 | -- >>> let Just entries = M.lookup '天' kHangulData 99 | -- >>> entries 100 | -- fromList [('천',HanjaReadingCitation KS_X_1001 (fromList [Education]))] 101 | kHangulData :: KHangulData 102 | kHangulData = fromRight empty kHangulData' 103 | 104 | {- HLINT ignore "Unused LANGUAGE pragma" -} 105 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/Printer.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE LambdaCase #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.Printer 4 | ( printHtml 5 | , printText 6 | , printXhtml 7 | ) where 8 | 9 | import Data.Char 10 | import Data.List 11 | 12 | import qualified Data.Text 13 | import Data.Text.Lazy 14 | import Data.Text.Lazy.Builder 15 | import HTMLEntities.Decoder 16 | 17 | import Text.Seonbi.Html.Entity 18 | import Text.Seonbi.Html.Tag 19 | 20 | -- $setup 21 | -- >>> :set -XOverloadedStrings 22 | -- >>> import Text.Seonbi.Html.Scanner 23 | -- >>> :set -interactive-print=Text.Show.Unicode.uprint 24 | 25 | -- | Print the list of 'HtmlEntity' into a lazy 'Text'. 26 | -- 27 | -- >>> let Done "" tokens = scanHtml "

Hello,
\nworld!

" 28 | -- >>> printHtml tokens 29 | -- "

Hello,
\nworld!

" 30 | printHtml :: [HtmlEntity] -> Text 31 | printHtml = printHtml' False 32 | 33 | -- | Similar to 'printHtml' except it renders void (self-closing) tags as 34 | -- like @
@ instead of @
@. 35 | -- 36 | -- >>> let Done "" tokens = scanHtml "

Hello,
\nworld!

" 37 | -- >>> printXhtml tokens 38 | -- "

Hello,
\nworld!

" 39 | -- 40 | -- Note that normal tags are not rendered as self-closed; only void tags 41 | -- according to HTML specification are: 42 | -- 43 | -- >>> let Done "" tokens' = scanHtml "


" 44 | -- >>> printXhtml tokens' 45 | -- "


" 46 | printXhtml :: [HtmlEntity] -> Text 47 | printXhtml = printHtml' True 48 | 49 | printHtml' :: Bool -> [HtmlEntity] -> Text 50 | printHtml' xhtml = 51 | Data.Text.Lazy.concat . Prelude.concatMap render . Data.List.groupBy isVoid 52 | where 53 | isVoid :: HtmlEntity -> HtmlEntity -> Bool 54 | isVoid (HtmlStartTag stck tg _) (HtmlEndTag stck' tg') = 55 | htmlTagKind tg == Void && stck == stck' && tg == tg' 56 | isVoid _ _ = False 57 | render :: [HtmlEntity] -> [Text] 58 | render [a@HtmlStartTag { tag = t, rawAttributes = at }, b@HtmlEndTag {}] = 59 | if isVoid a b 60 | then 61 | [ "<" 62 | , fromStrict (htmlTagName t) 63 | , renderAttrs at 64 | , if xhtml then "/>" else ">" 65 | ] 66 | else e a ++ e b 67 | render entities = Prelude.concatMap e entities 68 | e :: HtmlEntity -> [Text] 69 | e HtmlStartTag { tag = t, rawAttributes = a } = 70 | ["<", fromStrict (htmlTagName t), renderAttrs a, ">"] 71 | e HtmlEndTag { tag = t } = [""] 72 | e HtmlText { rawText = t } = [fromStrict t] 73 | e HtmlCdata { text = t } = [""] 74 | e HtmlComment { comment = c } = [""] 75 | renderAttrs :: Data.Text.Text -> Text 76 | renderAttrs "" = "" 77 | renderAttrs attrs 78 | | isSpace (Data.Text.head attrs) = fromStrict attrs 79 | | otherwise = ' ' `cons` fromStrict attrs 80 | 81 | -- | Print only the text contents (including CDATA sections) without tags 82 | -- into a lazy 'Text'. 83 | -- 84 | -- >>> let Done "" tokens = scanHtml "

Hello,
\nworld!

" 85 | -- >>> printText tokens 86 | -- "Hello,\nworld!" 87 | -- 88 | -- Entities are decoded: 89 | -- 90 | -- >>> let Done "" tokens = scanHtml "

<>"&

" 91 | -- >>> printText tokens 92 | -- "<>\"&" 93 | printText :: [HtmlEntity] -> Text 94 | printText [] = Data.Text.Lazy.empty 95 | printText (x:xs) = 96 | render x <> printText xs 97 | where 98 | render :: HtmlEntity -> Text 99 | render = \ case 100 | HtmlText { rawText = t } -> toLazyText $ htmlEncodedText t 101 | HtmlCdata { text = t } -> fromStrict t 102 | _ -> Data.Text.Lazy.empty 103 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Html/LangSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.LangSpec (spec) where 4 | 5 | import Test.Hspec 6 | 7 | import Text.Seonbi.Html 8 | import Text.Seonbi.Html.Lang 9 | 10 | source :: [HtmlEntity] 11 | source = 12 | [ HtmlStartTag 13 | { tagStack = [] 14 | , tag = P 15 | , rawAttributes = "id=\"foo\" lang=\"en\"" 16 | } 17 | , HtmlText { tagStack = [P], rawText = "English" } 18 | , HtmlEndTag { tagStack = [], tag = P } 19 | , HtmlStartTag { tagStack = [], tag = Div, rawAttributes = "" } 20 | , HtmlStartTag 21 | { tagStack = [Div] 22 | , tag = P 23 | , rawAttributes = "class=bar lang=ja" 24 | } 25 | , HtmlStartTag { tagStack = [Div, P], tag = B, rawAttributes = "" } 26 | , HtmlText { tagStack = [Div, P, B], rawText = "日本語" } 27 | , HtmlEndTag { tagStack = [Div, P], tag = B } 28 | , HtmlStartTag 29 | { tagStack = [Div, P] 30 | , tag = Span 31 | , rawAttributes = "lang='yue-Hant'" 32 | } 33 | , HtmlText { tagStack = [Div, P, Span], rawText = "與" } 34 | , HtmlStartTag { tagStack = [Div, P, Span], tag = B, rawAttributes = "" } 35 | , HtmlText { tagStack = [Div, P, Span, B], rawText = "與粵語" } 36 | , HtmlEndTag { tagStack = [Div, P, Span], tag = B } 37 | , HtmlEndTag { tagStack = [Div, P], tag = Span } 38 | , HtmlEndTag { tagStack = [Div], tag = P } 39 | , HtmlEndTag { tagStack = [], tag = Div } 40 | ] 41 | 42 | annotated :: [LangHtmlEntity] 43 | annotated = 44 | [ LangHtmlEntity 45 | (Just "en") 46 | HtmlStartTag 47 | { tagStack = [] 48 | , tag = P 49 | , rawAttributes = "id=\"foo\" lang=\"en\"" 50 | } 51 | , LangHtmlEntity 52 | (Just "en") 53 | HtmlText { tagStack = [P], rawText = "English" } 54 | , LangHtmlEntity (Just "en") HtmlEndTag { tagStack = [], tag = P } 55 | , LangHtmlEntity 56 | Nothing 57 | HtmlStartTag { tagStack = [], tag = Div, rawAttributes = "" } 58 | , LangHtmlEntity 59 | (Just "ja") 60 | HtmlStartTag 61 | { tagStack = [Div] 62 | , tag = P 63 | , rawAttributes = "class=bar lang=ja" 64 | } 65 | , LangHtmlEntity 66 | (Just "ja") 67 | HtmlStartTag { tagStack = [Div, P], tag = B, rawAttributes = "" } 68 | , LangHtmlEntity 69 | (Just "ja") 70 | HtmlText { tagStack = [Div, P, B], rawText = "日本語" } 71 | , LangHtmlEntity (Just "ja") HtmlEndTag { tagStack = [Div, P], tag = B } 72 | , LangHtmlEntity 73 | (Just "yue-hant") 74 | HtmlStartTag 75 | { tagStack = [Div, P] 76 | , tag = Span 77 | , rawAttributes = "lang='yue-Hant'" 78 | } 79 | , LangHtmlEntity 80 | (Just "yue-hant") 81 | HtmlText { tagStack = [Div, P, Span], rawText = "與" } 82 | , LangHtmlEntity 83 | (Just "yue-hant") 84 | HtmlStartTag { tagStack = [Div, P, Span], tag = B, rawAttributes = "" } 85 | , LangHtmlEntity 86 | (Just "yue-hant") 87 | HtmlText { tagStack = [Div, P, Span, B], rawText = "與粵語" } 88 | , LangHtmlEntity 89 | (Just "yue-hant") 90 | HtmlEndTag { tagStack = [Div, P, Span], tag = B } 91 | , LangHtmlEntity 92 | (Just "yue-hant") 93 | HtmlEndTag { tagStack = [Div, P], tag = Span } 
94 | , LangHtmlEntity (Just "ja") HtmlEndTag { tagStack = [Div] , tag = P } 95 | , LangHtmlEntity Nothing HtmlEndTag { tagStack = [], tag = Div } 96 | ] 97 | 98 | spec :: Spec 99 | spec = do 100 | specify "extractLang" $ do 101 | extractLang "" `shouldBe` Nothing 102 | extractLang "lang=en" `shouldBe` Just "en" 103 | extractLang "lang=en-US" `shouldBe` Just "en-us" 104 | extractLang "lang='ko-KR'" `shouldBe` Just "ko-kr" 105 | extractLang "lang=\"zh-Hant\"" `shouldBe` Just "zh-hant" 106 | extractLang "lang=\"yue-Hans-HK\"" `shouldBe` Just "yue-hans-hk" 107 | extractLang "id=\"foo\" lang=\"en\"" `shouldBe` Just "en" 108 | extractLang "id=\"foo\" lang=zh-CN class=bar" `shouldBe` Just "zh-cn" 109 | specify "annotateWithLang" $ do 110 | annotateWithLang [] `shouldBe` [] 111 | annotateWithLang source `shouldBe` annotated 112 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/Clipper.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE LambdaCase #-} 2 | module Text.Seonbi.Html.Clipper 3 | ( clipPrefixText 4 | , clipSuffixText 5 | , clipText 6 | ) where 7 | 8 | import Control.Monad 9 | import Data.List (dropWhileEnd) 10 | 11 | import Data.Text 12 | 13 | import Text.Seonbi.Html 14 | 15 | -- | Clip the given prefix text and suffix text from the HTML fragments. 16 | -- It simply is composed of 'clipPrefixText' and 'clipSuffixText' functions. 17 | -- It returns 'Nothing' if any of a prefix and a suffix does not match. 18 | clipText :: Text -> Text -> [HtmlEntity] -> Maybe [HtmlEntity] 19 | clipText prefix suffix = 20 | clipSuffixText suffix <=< clipPrefixText prefix 21 | 22 | -- | Clip the given prefix text from the HTML fragments. If its first 23 | -- text element does not have the same prefix, or the first element is not 24 | -- an 'HtmlText' node, or the list of HTML fragments have nothing at all, 25 | -- it returns 'Nothing'. 26 | -- 27 | -- >>> :set -XOverloadedLists 28 | -- >>> :set -XOverloadedStrings 29 | -- >>> clipPrefixText "foo" [HtmlText [] "bar", HtmlStartTag [] P ""] 30 | -- Nothing 31 | -- >>> clipPrefixText "foo" [HtmlStartTag [] P "", HtmlText [] "foo"] 32 | -- Nothing 33 | -- >>> clipPrefixText "foo" [] 34 | -- Nothing 35 | -- 36 | -- If the first element is an 'HtmlText' node, and its 'rawText' contains 37 | -- the common prefix text, it returns a 'Just' value holding a list of 38 | -- HTML fragments with the common prefix removed. 39 | -- 40 | -- >>> clipPrefixText "foo" [HtmlText [] "foobar", HtmlStartTag [] P ""] 41 | -- Just [HtmlText {... 
"bar"},HtmlStartTag {...}] 42 | -- >>> clipPrefixText "foo" [HtmlText [] "foo", HtmlStartTag [] P ""] 43 | -- Just [HtmlStartTag {..., tag = P, ...}] 44 | -- 45 | -- A given text is treated as a raw text, which means even if some HTML 46 | -- entities refer to the same characters it may fails to match unless 47 | -- they share the exactly same representation, e.g.: 48 | -- 49 | -- >>> clipPrefixText "&" [HtmlText [] "&"] 50 | -- Nothing 51 | -- 52 | -- In the same manner, it doesn't find a prefix from 'HtmlCdata', e.g.: 53 | -- 54 | -- >>> clipPrefixText "foo" [HtmlCdata [] "foo", HtmlStartTag [] P ""] 55 | -- Nothing 56 | -- 57 | -- In order to remove a prefix from both 'HtmlText' and 'HtmlCdata', 58 | -- apply 'normalizeText' first so that all 'HtmlCdata' entities are transformed 59 | -- to equivalent 'HtmlText' entities: 60 | -- 61 | -- >>> import Text.Seonbi.Html.TextNormalizer (normalizeText) 62 | -- >>> let normalized = normalizeText [HtmlCdata [] "foo", HtmlStartTag [] P ""] 63 | -- >>> clipPrefixText "foo" normalized 64 | -- Just [HtmlStartTag {..., tag = P, ...}] 65 | -- 66 | -- Plus, it works even if HTML fragments contain some 'HtmlComment' entities, 67 | -- but these are not touched at all, e.g.: 68 | -- 69 | -- >>> clipPrefixText "bar" [HtmlComment [] "foo", HtmlText [] "barbaz"] 70 | -- Just [HtmlComment {... "foo"},HtmlText {... "baz"}] 71 | clipPrefixText :: Text -> [HtmlEntity] -> Maybe [HtmlEntity] 72 | clipPrefixText prefix [] 73 | | Data.Text.null prefix = Just [] 74 | | otherwise = Nothing 75 | clipPrefixText prefix (x@HtmlComment {} : xs) = 76 | (x :) <$> clipPrefixText prefix xs 77 | clipPrefixText prefix (x@HtmlText { rawText = rawText' } : xs) 78 | | prefix == rawText' = Just xs 79 | | prefix `isPrefixOf` rawText' = Just $ 80 | x { rawText = Data.Text.drop (Data.Text.length prefix) rawText' } : xs 81 | | otherwise = Nothing 82 | clipPrefixText _ _ = Nothing 83 | 84 | -- | Clip the given suffix text from the HTML fragments, in the same manner 85 | -- to 'clipPrefixText'. 86 | clipSuffixText :: Text -> [HtmlEntity] -> Maybe [HtmlEntity] 87 | clipSuffixText suffix [] 88 | | Data.Text.null suffix = Just [] 89 | | otherwise = Nothing 90 | clipSuffixText suffix entities = 91 | case Prelude.last entities' of 92 | e@HtmlText { rawText = rawText' } 93 | | suffix == rawText' -> Just (init' ++ comments) 94 | | suffix `isSuffixOf` rawText' -> 95 | let 96 | sLen = Data.Text.length suffix 97 | rtLen = Data.Text.length rawText' 98 | clipped = Data.Text.take (rtLen - sLen) rawText' 99 | in 100 | Just (init' ++ e { rawText = clipped } : comments) 101 | | otherwise -> Nothing 102 | _ -> Nothing 103 | where 104 | entities' :: [HtmlEntity] 105 | entities' = (`Data.List.dropWhileEnd` entities) $ \ case 106 | HtmlComment {} -> True 107 | _ -> False 108 | init' :: [HtmlEntity] 109 | init' = Prelude.init entities' 110 | comments :: [HtmlEntity] 111 | comments = Prelude.drop (Prelude.length entities') entities 112 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Trie.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE RankNTypes #-} 4 | {-# LANGUAGE TypeFamilies #-} 5 | -- | A trie from 'Text' keys to values. 
6 | module Text.Seonbi.Trie 7 | ( Trie 8 | , elems 9 | , empty 10 | , fromList 11 | , insert 12 | , keys 13 | , lookup 14 | , member 15 | , mergeBy 16 | , null 17 | , singleton 18 | , size 19 | , toList 20 | , unionL 21 | , unionR 22 | ) where 23 | 24 | import Prelude hiding (lookup, null) 25 | 26 | import Control.Monad (ap) 27 | import qualified GHC.Exts 28 | 29 | import Data.ByteString (ByteString) 30 | import Data.Text hiding (empty, null, singleton) 31 | import Data.Text.Encoding (encodeUtf8, decodeUtf8) 32 | import qualified Data.Trie as BTrie 33 | 34 | -- | A trie from 'Text' keys to 'a' values. 35 | newtype Trie a 36 | = Trie (BTrie.Trie a) 37 | deriving (Eq, Show) 38 | 39 | encodeKey :: Text -> ByteString 40 | encodeKey = encodeUtf8 41 | 42 | decodeKey :: ByteString -> Text 43 | decodeKey = decodeUtf8 44 | 45 | -- | The empty trie. 46 | empty :: Trie a 47 | empty = Trie BTrie.empty 48 | 49 | -- | Checks if the trie is empty. 50 | null :: Trie a -> Bool 51 | null (Trie btrie) = BTrie.null btrie 52 | 53 | -- | Constructs a singleton trie. 54 | singleton :: Text -> a -> Trie a 55 | singleton key value = Trie $ BTrie.singleton (encodeKey key) value 56 | 57 | -- | Gets the number of elements in the trie. 58 | size :: Trie a -> Int 59 | size (Trie btrie) = BTrie.size btrie 60 | 61 | fromList' :: [(Text, a)] -> Trie a 62 | fromList' list = Trie $ BTrie.fromList [(encodeKey k, v) | (k, v) <- list] 63 | 64 | toList' :: Trie a -> [(Text, a)] 65 | toList' (Trie btrie) = [(decodeKey k, v) | (k, v) <- BTrie.toList btrie] 66 | 67 | -- | Converts a list of associated pairs into a trie. For duplicate keys, 68 | -- values earlier in the list shadow later ones. 69 | fromList :: [(Text, a)] -> Trie a 70 | fromList = fromList' 71 | 72 | -- | Converts a trie into a list of associated pairs. Keys will be ordered. 73 | toList :: Trie a -> [(Text, a)] 74 | toList = toList' 75 | 76 | -- | Lists all keys in the trie. Keys will be ordered. 77 | keys :: Trie a -> [Text] 78 | keys (Trie btrie) = Prelude.map decodeKey $ BTrie.keys btrie 79 | 80 | -- | Lists all values in the trie. Values are ordered by their associated keys. 81 | elems :: Trie a -> [a] 82 | elems (Trie btrie) = BTrie.elems btrie 83 | 84 | -- | Gets the value associated with a key if it exists. 85 | lookup :: Text -> Trie a -> Maybe a 86 | lookup key (Trie btrie) = BTrie.lookup (encodeKey key) btrie 87 | 88 | -- | Checks if a key has a value in the trie. 89 | member :: Text -> Trie a -> Bool 90 | member key (Trie btrie) = BTrie.member (encodeKey key) btrie 91 | 92 | -- | Inserts a new key into the trie. 93 | insert 94 | :: Text 95 | -- ^ A new key to insert. If there is already the same key in the trie, 96 | -- the existing value is overwritten by the new value. 97 | -> a 98 | -- ^ A value associated to the key. 99 | -> Trie a 100 | -- ^ An existing trie. 101 | -> Trie a 102 | -- ^ The new trie with the inserted key. 103 | insert key value (Trie btrie) = Trie $ BTrie.insert (encodeKey key) value btrie 104 | 105 | -- | Combines two tries, using a function to resolve collisions. This can only 106 | -- define the space of functions between union and symmetric difference but, 107 | -- with those two, all set operations can be defined (albeit inefficiently). 108 | mergeBy :: (a -> a -> Maybe a) -> Trie a -> Trie a -> Trie a 109 | mergeBy f (Trie a) (Trie b) = Trie $ BTrie.mergeBy f a b 110 | 111 | -- | Combines two tries, resolving conflicts by choosing the value from the 112 | -- left (former) trie. 
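--
-- For instance (an illustrative sketch, assuming @OverloadedStrings@; it is
-- not one of this module's doctests):
--
-- > unionL (fromList [("가", 1)]) (fromList [("가", 2), ("나", 3)])
-- >     == fromList [("가", 1), ("나", 3)]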
113 | unionL :: Trie a -> Trie a -> Trie a 114 | unionL (Trie left) (Trie right) = Trie $ BTrie.unionL left right 115 | 116 | -- | Combines two tries, resolving conflicts by choosing the value from the 117 | -- right (latter) trie. 118 | unionR :: Trie a -> Trie a -> Trie a 119 | unionR (Trie left) (Trie right) = Trie $ BTrie.unionR left right 120 | 121 | instance Functor Trie where 122 | fmap f (Trie btrie) = Trie $ fmap f btrie 123 | 124 | instance Foldable Trie where 125 | foldMap f (Trie btrie) = foldMap f btrie 126 | 127 | instance Traversable Trie where 128 | traverse f (Trie btrie) = Trie <$> traverse f btrie 129 | 130 | instance Applicative Trie where 131 | pure = singleton "" 132 | (<*>) = ap 133 | 134 | instance Monad Trie where 135 | Trie btrie >>= f = Trie $ btrie >>= (\ v -> case f v of { Trie b -> b }) 136 | 137 | instance (Semigroup a) => Semigroup (Trie a) where 138 | (Trie a) <> (Trie b) = Trie (a <> b) 139 | 140 | instance (Monoid a) => Monoid (Trie a) where 141 | mempty = Trie mempty 142 | 143 | instance GHC.Exts.IsList (Trie a) where 144 | type Item (Trie a) = (Text, a) 145 | fromList = fromList' 146 | toList = toList' 147 | -------------------------------------------------------------------------------- /setup/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup Seonbi 2 | description: Set up a specific version of Seonbi and add it to the PATH. 3 | author: Hong Minhee 4 | branding: 5 | icon: package 6 | color: gray-dark 7 | inputs: 8 | seonbi-version: 9 | description: >- 10 | Version of Seonbi to install. Note that asterisks can be used to 11 | choose the latest version, e.g., 1.2.*, 1.*, *. 12 | default: "*" 13 | add-to-path: 14 | description: >- 15 | Whether to add the installed seonbi and seonbi-api to the PATH. Turned 16 | on by default. 17 | default: true 18 | outputs: 19 | seonbi-version: 20 | description: Exact version number of the installed Seonbi. 21 | value: ${{ steps.prepare.outputs.seonbi-version }} 22 | seonbi-path: 23 | description: Absolute path of the installed executable seonbi. 24 | value: ${{ steps.prepare.outputs.seonbi-path }} 25 | seonbi-api-path: 26 | description: Absolute path of the installed executable seonbi-api. 
27 | value: ${{ steps.prepare.outputs.seonbi-api-path }} 28 | runs: 29 | using: composite 30 | steps: 31 | - id: prepare 32 | shell: python 33 | run: | 34 | from __future__ import print_function 35 | import fnmatch 36 | import json 37 | import os 38 | import os.path 39 | try: from urllib import request as urllib2 40 | except ImportError: import urllib2 41 | import tempfile 42 | 43 | suffixes = { 44 | ('Linux', 'X64'): 'linux-x86_64.tar.bz2', 45 | ('Linux', 'ARM64'): 'linux-arm64.tar.bz2', 46 | ('macOS', 'X64'): 'macos-x86_64.tar.bz2', 47 | ('macOS', 'ARM64'): 'macos-arm64.tar.bz2', 48 | ('Windows', 'X64'): 'win64.zip', 49 | } 50 | os_ = os.environ['RUNNER_OS'] 51 | arch = os.environ['RUNNER_ARCH'] 52 | try: 53 | suffix = suffixes[os_, arch] 54 | except KeyError: 55 | print( 56 | "::error title=Unsupported OS and architecture::Seonbi doesn't", 57 | 'support {0}/{1}'.format(os_, arch) 58 | ) 59 | raise SystemExit(1) 60 | 61 | # TODO: paging 62 | req = urllib2.Request( 63 | 'https://api.github.com/repos/dahlia/seonbi/releases?per_page=100', 64 | headers={'Authorization': 'Bearer ' + os.environ['GH_TOKEN']} 65 | ) 66 | res = urllib2.urlopen(req) 67 | tags = json.load(res) 68 | tags.sort( 69 | key=lambda tag: tuple(map(int, tag['tag_name'].split('.'))), 70 | reverse=True 71 | ) 72 | res.close() 73 | version_pattern = os.environ['SEONBI_VERSION'].strip() 74 | for tag in tags: 75 | if not fnmatch.fnmatch(tag['tag_name'], version_pattern): 76 | continue 77 | for asset in tag['assets']: 78 | if asset['name'] == 'seonbi-{0}.{1}'.format(tag['tag_name'], suffix): 79 | print('::set-output name=seonbi-version::' + tag['tag_name']) 80 | print( 81 | '::set-output name=download-url::' + asset['browser_download_url'] 82 | ) 83 | break 84 | else: 85 | continue 86 | break 87 | else: 88 | print( 89 | '::error title=Unsupported platform::Seonbi', version_pattern, 90 | 'does not support', os_, '&', arch + '.' 
91 | ) 92 | 93 | dir_path = tempfile.mkdtemp('seonbi', dir=os.environ.get('RUNNER_TEMP')) 94 | seonbi_path = os.path.join( 95 | dir_path, 96 | 'seonbi.exe' if os_ == 'Windows' else 'seonbi' 97 | ) 98 | seonbi_api_path = os.path.join( 99 | dir_path, 100 | 'seonbi-api.exe' if os_ == 'Windows' else 'seonbi-api' 101 | ) 102 | print('::set-output name=dir-path::' + dir_path) 103 | print('::set-output name=seonbi-path::' + seonbi_path) 104 | print('::set-output name=seonbi-api-path::' + seonbi_api_path) 105 | env: 106 | GH_TOKEN: ${{ github.token }} 107 | SEONBI_VERSION: ${{ inputs.seonbi-version }} 108 | # Linux & macOS 109 | - if: runner.os != 'Windows' 110 | shell: bash 111 | run: | 112 | set -e 113 | wget "$DOWNLOAD_URL" 114 | tar xvfj "$(basename "$DOWNLOAD_URL")" 115 | chmod +x seonbi seonbi-api 116 | if [[ "$ADD_TO_PATH" = "true" ]]; then 117 | pwd >> "$GITHUB_PATH" 118 | fi 119 | env: 120 | DOWNLOAD_URL: ${{ steps.prepare.outputs.download-url }} 121 | DIR_PATH: ${{ steps.prepare.outputs.dir-path }} 122 | ADD_TO_PATH: ${{ inputs.add-to-path }} 123 | working-directory: ${{ steps.prepare.outputs.dir-path }} 124 | # Windows 125 | - if: runner.os == 'Windows' 126 | shell: pwsh 127 | run: | 128 | Invoke-WebRequest ` 129 | $env:DOWNLOAD_URL ` 130 | -OutFile $env:DOWNLOAD_URL.Split("/")[-1] 131 | 7z x $env:DOWNLOAD_URL.Split("/")[-1] 132 | if (ConvertFrom-Json $env:ADD_TO_PATH) { 133 | Add-Content ` 134 | -Path $env:GITHUB_PATH ` 135 | -Value "$(Get-Location)" 136 | } 137 | env: 138 | DOWNLOAD_URL: ${{ steps.prepare.outputs.download-url }} 139 | DIR_PATH: ${{ steps.prepare.outputs.dir-path }} 140 | ADD_TO_PATH: ${{ inputs.add-to-path }} 141 | working-directory: ${{ steps.prepare.outputs.dir-path }} 142 | -------------------------------------------------------------------------------- /scripts/showcase-svg/template.svg: -------------------------------------------------------------------------------- 1 | 3 | 4 |
5 | 9 | 121 |
122 |     선비Seonbi transforms: 
124 |
125 |

悠久한 歷史와 傳統에 빛나는 우리 大韓國民은 3·1運動으로 126 | 建立된 大韓民國臨時政府의 法統과 不義에 抗拒한 4·19民主理念을 계승하고, 127 | 祖國의 民主改革과 平和的 統一의 使命에 입각하여 正義·人道와 同胞愛로써 128 | 民族의 團結을 공고히 하고, 모든 社會的 弊習과 不義를 타파하며, 129 | 自律과 調和를 바탕으로 自由民主的 基本秩序를 더욱 확고히 하여 130 | 政治·經濟·社會·文化의 모든 領域에 있어서 各人의 機會를 균등히 하고, 131 | 能力을 最高度로 발휘하게 하며, 自由와 權利에 따르는 責任과 義務를 132 | 완수하게 하여, 안으로는 國民生活의 균등한 향상을 기하고 밖으로는 133 | 항구적인 世界平和와 人類共榮에 이바지함으로써 우리들과 우리들의 子孫의 134 | 安全과 自由와 幸福을 영원히 확보할 것을 다짐하면서 1948年 7月 12日에 135 | 制定되고 8次에 걸쳐 改正된 憲法을 이제 國會의 議決을 거쳐 國民投票에 136 | 의하여 改正한다.

137 |
138 |
South Korean orthography
139 |
140 |

PLACEHOLDER: ko-kr

141 |
142 |
North Korean orthography
143 |
144 |

PLACEHOLDER: ko-kp

145 |
146 |
Mixed script with 147 | <ruby>
148 |
149 |

PLACEHOLDER: ko-Kore

150 |
151 |
152 |
153 | 154 | 155 | -------------------------------------------------------------------------------- /package.yaml: -------------------------------------------------------------------------------- 1 | name: seonbi 2 | version: 0.6.0 3 | synopsis: SmartyPants for Korean language 4 | category: Text 5 | author: Hong Minhee 6 | maintainer: Hong Minhee 7 | copyright: "\xa9 2018\u20132023 Hong Minhee" 8 | license: LGPL-2.1 9 | homepage: https://github.com/dahlia/seonbi 10 | bug-reports: https://github.com/dahlia/seonbi/issues 11 | git: git://github.com/dahlia/seonbi.git 12 | description: 13 | Please see the README.md on GitHub at . 14 | extra-source-files: 15 | - src/Text/Seonbi/Unihan/*.json 16 | - CHANGES.md 17 | - README.md 18 | data-dir: data 19 | data-files: 20 | - '*.tsv' 21 | build-type: Custom 22 | custom-setup: 23 | dependencies: 24 | - base 25 | - bytestring 26 | - Cabal 27 | - directory >= 1 && < 2 28 | - filepath 29 | - http-client >= 0.5 && < 0.8 30 | - temporary >= 1.2 && < 1.4 31 | - text 32 | - zip >= 1.1 && < 3.0 33 | dependencies: 34 | - aeson >= 1.3.1 && < 3 35 | - base >= 4.12 && < 5 36 | - bytestring 37 | - containers 38 | - html-entities >= 1 && < 2 39 | - text 40 | flags: 41 | static: 42 | description: Static link 43 | manual: true 44 | default: false 45 | iconv: 46 | description: Use iconv; however it is ignored on Windows 47 | manual: true 48 | default: false 49 | embed-dictionary: 50 | description: Embed dictionary rather than load from file 51 | manual: true 52 | default: false 53 | when: 54 | - condition: os(darwin) 55 | else: 56 | ghc-options: 57 | - -Wall 58 | - -fprint-explicit-kinds 59 | then: 60 | ghc-options: 61 | - -Wall 62 | - -fprint-explicit-kinds 63 | - -optP-Wno-nonportable-include-path 64 | # The above option works around https://github.com/haskell/cabal/issues/4739 65 | library: 66 | source-dirs: src 67 | dependencies: 68 | - attoparsec >= 0.12 && < 1 69 | - bytestring-trie >= 0.2.5 && < 0.3 70 | - cassava >= 0.5 && < 0.6 71 | - cmark >= 0.6 && < 1 72 | - data-default >= 0.2 && < 1 73 | - filepath >= 1 && < 2 74 | - file-embed >= 0.0.10 && < 0.0.16 75 | - http-media >= 0.8 && < 1 76 | when: 77 | - condition: flag(static) || flag(embed-dictionary) 78 | then: 79 | cpp-options: 80 | - -DEMBED_DICTIONARY 81 | else: 82 | cpp-options: 83 | - -DNO_EMBED_DICTIONARY 84 | executables: 85 | seonbi: 86 | main: seonbi.hs 87 | source-dirs: app 88 | when: 89 | - condition: flag(iconv) && !os(windows) 90 | else: 91 | dependencies: &executable-seonbi-dependencies 92 | cases: ">= 0.1.3.2 && < 0.1.5" 93 | code-page: ">= 0.2 && < 0.3" 94 | html-charset: ">= 0.1 && < 0.2" 95 | optparse-applicative: ">= 0.14 && < 0.18" 96 | seonbi: ">= 0" 97 | then: 98 | dependencies: 99 | <<: *executable-seonbi-dependencies 100 | iconv: ">= 0.4 && < 0.5" 101 | cpp-options: 102 | - -DICONV 103 | - &executable-ghc-options 104 | condition: flag(static) 105 | then: 106 | when: 107 | - condition: os(darwin) || os(windows) 108 | then: 109 | ghc-options: 110 | - -Wall 111 | - -fwarn-incomplete-uni-patterns 112 | - -threaded 113 | - -rtsopts 114 | - -with-rtsopts=-N 115 | # Static link 116 | - -static 117 | - -optc-Os 118 | else: 119 | ghc-options: 120 | - -Wall 121 | - -fwarn-incomplete-uni-patterns 122 | - -threaded 123 | - -rtsopts 124 | - -with-rtsopts=-N 125 | # Static link 126 | - -static 127 | - -optl-static 128 | - -optl-pthread 129 | - -optc-Os 130 | - -fPIC 131 | ld-options: 132 | - -static 133 | else: 134 | ghc-options: 135 | - -Wall 136 | - -fwarn-incomplete-uni-patterns 137 | - -threaded 
138 | - -rtsopts 139 | - -with-rtsopts=-N 140 | seonbi-api: 141 | main: seonbi-api.hs 142 | source-dirs: app 143 | dependencies: 144 | - http-types >= 0.12 && < 0.13 145 | - optparse-applicative >= 0.14 && < 0.18 146 | - seonbi 147 | - wai >= 3.2 && < 3.4 148 | - warp >= 3.2 && < 3.4 149 | when: 150 | - *executable-ghc-options 151 | tests: 152 | doctest: 153 | main: doctest.hs 154 | source-dirs: test 155 | other-modules: [] 156 | ghc-options: 157 | - -threaded 158 | dependencies: 159 | - doctest 160 | - doctest-discover 161 | - QuickCheck 162 | - seonbi 163 | - unicode-show 164 | spec: 165 | main: hspec.hs 166 | source-dirs: test 167 | ghc-options: 168 | - -threaded 169 | - -rtsopts 170 | - -with-rtsopts=-N 171 | dependencies: 172 | - code-page >= 0.2 && < 0.3 173 | - Diff >= 0.3.4 && < 0.5 174 | - directory >= 1 && < 2 175 | - filepath >= 1 && < 2 176 | - hspec >= 2.4.8 && < 3 177 | - hspec-discover >= 2.4.8 && < 3 178 | - interpolatedstring-perl6 >= 1.0.1 && < 2 179 | - random >= 1.1 && < 1.3 180 | - seonbi 181 | - text 182 | hlint: 183 | main: hlint.hs 184 | source-dirs: test 185 | other-modules: [] 186 | ghc-options: 187 | - -threaded 188 | dependencies: 189 | - hlint >= 2.1.7 && < 3.6 190 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/TagStack.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies #-} 2 | module Text.Seonbi.Html.TagStack 3 | ( HtmlTagStack 4 | , Text.Seonbi.Html.TagStack.any 5 | , descendsFrom 6 | , Text.Seonbi.Html.TagStack.elem 7 | , depth 8 | , empty 9 | , fromList 10 | , last 11 | , pop 12 | , push 13 | , rebase 14 | , toList 15 | ) where 16 | 17 | import Prelude hiding (last) 18 | 19 | import Data.List hiding (last) 20 | import GHC.Exts (IsList (..)) 21 | 22 | import Text.Seonbi.Html.Tag 23 | 24 | -- | Represents a hierarchy of a currently parsing position in an 'HtmlTag' 25 | -- tree. 26 | -- 27 | -- For example, if an 'scanHtml' has read "@\\\foo\ bar@" 28 | -- it is represented as @'HtmlTagStack' ['B', 'A']@. 29 | -- 30 | -- Note that the tags are stored in reverse order, from the deepest to 31 | -- the shallowest, to make inserting a more deeper tag efficient. 32 | newtype HtmlTagStack = HtmlTagStack [HtmlTag] deriving (Eq, Ord) 33 | 34 | instance IsList HtmlTagStack where 35 | type Item HtmlTagStack = HtmlTag 36 | fromList = HtmlTagStack . reverse 37 | toList (HtmlTagStack tags) = reverse tags 38 | 39 | instance Show HtmlTagStack where 40 | show tags = "fromList " ++ show (toList tags) 41 | 42 | -- | An empty stack. 43 | empty :: HtmlTagStack 44 | empty = HtmlTagStack [] 45 | 46 | -- | Count the depth of a stack. 47 | -- 48 | -- >>> :set -XOverloadedLists 49 | -- >>> depth empty 50 | -- 0 51 | -- >>> depth [Div, Article, P, Em] 52 | -- 4 53 | depth :: HtmlTagStack -> Int 54 | depth (HtmlTagStack stack) = Data.List.length stack 55 | 56 | -- | Get the deepest tag from a 'HtmlTagStack'. 57 | -- 58 | -- >>> :set -XOverloadedLists 59 | -- >>> let stack = [Div, Article, P, Em] :: HtmlTagStack 60 | -- >>> last stack 61 | -- Just Em 62 | -- >>> last [] 63 | -- Nothing 64 | last :: HtmlTagStack -> Maybe HtmlTag 65 | last (HtmlTagStack []) = Nothing 66 | last (HtmlTagStack (tag:_)) = Just tag 67 | 68 | -- | Build a new stack from a stack by replacing its bottom with a new base. 
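--
-- (Informally: the tags are stored from deepest to shallowest, so the bottom
-- of the stack is the suffix of the internal list; when @base@ matches that
-- suffix it is swapped for @newBase@, and otherwise the stack is returned
-- unchanged, as the examples below show.)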
69 | -- 70 | -- >>> :set -XOverloadedLists 71 | -- >>> rebase [Article, BlockQuote] [Div] [Article, BlockQuote, P, Em] 72 | -- fromList [Div,P,Em] 73 | -- 74 | -- If there are no such bottom elements, it replaces nothing. 75 | -- 76 | -- >>> rebase [Div, Article, BlockQuote] [Div] [Article, BlockQuote, P, Em] 77 | -- fromList [Article,BlockQuote,P,Em] 78 | rebase :: HtmlTagStack -> HtmlTagStack -> HtmlTagStack -> HtmlTagStack 79 | rebase (HtmlTagStack base) (HtmlTagStack newBase) stack@(HtmlTagStack l) 80 | | base `isSuffixOf` l = HtmlTagStack $ 81 | take (depth stack - length base) l ++ newBase 82 | | otherwise = stack 83 | 84 | -- | Push one deeper @tag@ to a 'HtmlTagStack'. 85 | -- 86 | -- >>> push A empty 87 | -- fromList [A] 88 | -- >>> push B (push A empty) 89 | -- fromList [A,B] 90 | push :: HtmlTag -> HtmlTagStack -> HtmlTagStack 91 | push tag (HtmlTagStack tags) = 92 | HtmlTagStack (tag : tags) 93 | 94 | -- | Pop the deepest @tag@ from a 'HtmlTagStack'. 95 | -- 96 | -- >>> :set -XOverloadedLists 97 | -- >>> pop Em [A, B, Em] 98 | -- fromList [A,B] 99 | -- 100 | -- It may pop a @tag@ in the middle if a @tag@ looking for is not the deepest: 101 | -- 102 | -- >>> pop B [A, B, Em] 103 | -- fromList [A,Em] 104 | -- 105 | -- It does not affect to the input if there is no such @tag@ in the input: 106 | -- 107 | -- >>> pop P [A, B, Em] 108 | -- fromList [A,B,Em] 109 | -- >>> pop A empty 110 | -- fromList [] 111 | pop :: HtmlTag -> HtmlTagStack -> HtmlTagStack 112 | pop tag (HtmlTagStack tags'@(t : ags)) = 113 | if t == tag 114 | then HtmlTagStack ags 115 | else 116 | let 117 | (head', rest) = span (/= tag) tags' 118 | tail' = case uncons rest of 119 | Just (_, tail'') -> tail'' 120 | Nothing -> [] 121 | in 122 | HtmlTagStack (head' ++ tail') 123 | pop _ (HtmlTagStack []) = empty 124 | 125 | -- | Check if a node ('HtmlEntity') that a 'HtmlTagStack' (the first argument) 126 | -- refers is contained by a node that another 'HtmlTagStack' (the second 127 | -- argument), or they are sibling at least. 128 | -- 129 | -- >>> :set -XOverloadedLists 130 | -- >>> descendsFrom [Div, P, A, Em] [Div, P, A] 131 | -- True 132 | -- >>> descendsFrom [Div, P, A] [Div, P, A] 133 | -- True 134 | -- >>> descendsFrom [Div, P, Em] [Div, P, A] 135 | -- False 136 | -- >>> descendsFrom [Div, P] [Div, P, A] 137 | -- False 138 | descendsFrom :: HtmlTagStack -> HtmlTagStack -> Bool 139 | HtmlTagStack a `descendsFrom` HtmlTagStack b = 140 | b `isSuffixOf` a 141 | 142 | -- | Determine whether any element of the tag stack satisfies the predicate. 143 | -- 144 | -- >>> :set -XOverloadedLists 145 | -- >>> Text.Seonbi.Html.TagStack.any ((== Void) . htmlTagKind) [Div, P, Script] 146 | -- False 147 | -- >>> Text.Seonbi.Html.TagStack.any ((== Void) . htmlTagKind) [BR, P, Script] 148 | -- True 149 | any :: (HtmlTag -> Bool) -> HtmlTagStack -> Bool 150 | any fn (HtmlTagStack stack) = 151 | Prelude.any fn stack 152 | 153 | -- | Determine whether the element occurs in the tag stack. 
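--
-- (A simple linear scan over the stack, i.e. /O(depth)/.)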
154 | -- 155 | -- >>> :set -XOverloadedLists 156 | -- >>> A `Text.Seonbi.Html.TagStack.elem` [A, B, Code] 157 | -- True 158 | -- >>> Em `Text.Seonbi.Html.TagStack.elem` [A, B, Code] 159 | -- False 160 | elem :: HtmlTag -> HtmlTagStack -> Bool 161 | elem tag (HtmlTagStack stack) = tag `Prelude.elem` stack 162 | -------------------------------------------------------------------------------- /src/Text/Seonbi/PairedTransformer.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | module Text.Seonbi.PairedTransformer 3 | ( PairedTransformer (..) 4 | , transformPairs 5 | ) where 6 | 7 | import Data.Text hiding (break, reverse) 8 | 9 | import Text.Seonbi.Html 10 | 11 | -- | Settings for 'transformPairs'. 12 | data PairedTransformer match = PairedTransformer 13 | { ignoresTagStack :: HtmlTagStack -> Bool 14 | , matchStart :: [match] -> Text -> Maybe (match, Text, Text, Text) 15 | , matchEnd :: Text -> Maybe (match, Text, Text, Text) 16 | , areMatchesPaired :: match -> match -> Bool 17 | , transformPair :: match -> match -> [HtmlEntity] -> [HtmlEntity] 18 | } 19 | 20 | -- | Some transformations should be done only if a start and an end are paired 21 | -- like parentheses. These even usually can be nested. Even if there is 22 | -- a start and an end they should not be paired unless they are sibling in 23 | -- an HTML tree. 24 | -- 25 | -- These kinds of scanning are easily turned highly stateful and imperative, 26 | -- hence hard to debug. This base class provides the common logic between 27 | -- these kinds of paired transformations so that an implementation class fill 28 | -- several abstract methods triggered by the state machine. 29 | transformPairs :: forall m . PairedTransformer m -> [HtmlEntity] -> [HtmlEntity] 30 | transformPairs (PairedTransformer ignores start end arePaired transform) = 31 | iter [] . normalizeText 32 | where 33 | iter :: [Unclosed m] -> [HtmlEntity] -> [HtmlEntity] 34 | iter [] [] = [] 35 | iter stack [] = unstack stack 36 | iter stack (x@HtmlText { tagStack = ts, rawText = txt } : xs) = 37 | case (startMatch, endMatch) of 38 | (Just captured, Nothing) -> 39 | roll stack captured ts xs 40 | (Nothing, Just captured@(m, _, _, _)) 41 | | Prelude.any ((`arePaired` m) . match) stack -> 42 | unroll stack captured ts xs 43 | (Just captured@(_, pre, _, _), Just captured'@(m', pre', _, _)) -> 44 | if Data.Text.length pre >= Data.Text.length pre' && 45 | Prelude.any ((`arePaired` m') . 
match) stack 46 | then unroll stack captured' ts xs 47 | else roll stack captured ts xs 48 | (Nothing, _) -> 49 | case stack of 50 | [] -> x : iter stack xs 51 | s : ss -> iter (s { buffer = x : buffer s } : ss) xs 52 | where 53 | startMatch :: Maybe (m, Text, Text, Text) 54 | startMatch = start (reverse $ fmap match stack) txt 55 | endMatch :: Maybe (m, Text, Text, Text) 56 | endMatch = end txt 57 | iter (s@Unclosed {} : ss) (x : xs) = 58 | iter (s { buffer = x : buffer s } : ss) xs 59 | iter [] (x : xs) = x : iter [] xs 60 | roll :: [Unclosed m] 61 | -> (m, Text, Text, Text) 62 | -> HtmlTagStack 63 | -> [HtmlEntity] 64 | -> [HtmlEntity] 65 | roll [] (startMatch, pre, t, post) tagStack_ entities = 66 | prependText tagStack_ pre $ iter 67 | [Unclosed startMatch [HtmlText tagStack_ t]] 68 | (normalizeText (prependText tagStack_ post entities)) 69 | roll (s : ss) (startMatch, pre, t, post) tagStack_ entities = iter 70 | ( Unclosed startMatch [HtmlText tagStack_ t] 71 | : s { buffer = prependText tagStack_ pre $ buffer s } 72 | : ss 73 | ) 74 | (normalizeText (prependText tagStack_ post entities)) 75 | unroll :: [Unclosed m] 76 | -> (m, Text, Text, Text) 77 | -> HtmlTagStack 78 | -> [HtmlEntity] 79 | -> [HtmlEntity] 80 | unroll stack (endMatch, pre, t, post) tagStack_ es = 81 | case remainStack of 82 | [] -> unrolled ++ iter [] remainEntities 83 | s : ss -> iter 84 | (s { buffer = reverse unrolled ++ buffer s } : ss) 85 | remainEntities 86 | where 87 | prependText' :: Text -> [HtmlEntity] -> [HtmlEntity] 88 | prependText' = prependText tagStack_ 89 | unrolled :: [HtmlEntity] 90 | remainStack :: [Unclosed m] 91 | (unrolled, remainStack) = case findPair endMatch stack of 92 | (_, []) -> 93 | ([HtmlText tagStack_ (pre `append` t)], []) 94 | (stack', s@Unclosed { match = startMatch } : ss) -> 95 | let 96 | buf = prependText' pre (unstack' stack' ++ buffer s) 97 | buf' = prependText' t buf 98 | buf'' = reverse buf' 99 | transformed = if Prelude.any (ignores . tagStack) buf'' 100 | then buf'' 101 | else transform startMatch endMatch buf'' 102 | in 103 | (transformed, ss) 104 | remainEntities :: [HtmlEntity] 105 | remainEntities = prependText' post es 106 | findPair :: m -> [Unclosed m] -> ([Unclosed m], [Unclosed m]) 107 | findPair m = break (arePaired m . match) 108 | unstack :: [Unclosed m] -> [HtmlEntity] 109 | unstack = reverse . unstack' 110 | unstack' :: [Unclosed m] -> [HtmlEntity] 111 | unstack' [] = [] 112 | unstack' (Unclosed { buffer = b } : ss) = b ++ unstack' ss 113 | prependText :: HtmlTagStack -> Text -> [HtmlEntity] -> [HtmlEntity] 114 | prependText tagStack_ txt 115 | | Data.Text.null txt = id 116 | | otherwise = (HtmlText tagStack_ txt :) 117 | 118 | data Unclosed match = Unclosed 119 | { match :: match 120 | , buffer :: [HtmlEntity] -- in reverse order 121 | } 122 | -------------------------------------------------------------------------------- /demo/src/Markdown/HtmlString.elm: -------------------------------------------------------------------------------- 1 | module Markdown.HtmlString exposing (render) 2 | 3 | import List 4 | import Markdown.Block exposing (..) 5 | import Markdown.Inline exposing (..) 
6 | import Maybe exposing (andThen, withDefault) 7 | import String 8 | 9 | 10 | escape : String -> String 11 | escape = 12 | String.replace "&" "&" 13 | >> String.replace "<" "<" 14 | >> String.replace ">" ">" 15 | >> String.replace "\"" """ 16 | 17 | 18 | render : List (Block b i) -> String 19 | render blocks = 20 | List.map renderBlock blocks |> String.concat 21 | 22 | 23 | renderBlock : Block b i -> String 24 | renderBlock block = 25 | case block of 26 | BlankLine text -> 27 | escape text 28 | 29 | ThematicBreak -> 30 | "\n
\n" 31 | 32 | Heading _ level inlines -> 33 | "\n" 36 | ++ renderInlines inlines 37 | ++ "\n" 40 | 41 | CodeBlock _ code -> 42 | "
" ++ escape code ++ "
\n" 43 | 44 | Paragraph _ text -> 45 | "

" ++ renderInlines text ++ "

\n" 46 | 47 | BlockQuote blocks -> 48 | "
\n" ++ render blocks ++ "
\n" 49 | 50 | List list items -> 51 | let 52 | ( open, close ) = 53 | case list.type_ of 54 | Unordered -> 55 | ( "
    ", "
" ) 56 | 57 | Ordered start -> 58 | ( "
    " 59 | , "
" 60 | ) 61 | 62 | renderItem = 63 | \blocks -> 64 | "
  • " ++ render blocks ++ "
  • \n" 65 | in 66 | open 67 | ++ "\n" 68 | ++ String.concat (List.map renderItem items) 69 | ++ close 70 | ++ "\n" 71 | 72 | PlainInlines inlines -> 73 | renderInlines inlines 74 | 75 | Markdown.Block.Custom _ blocks -> 76 | render blocks 77 | 78 | 79 | renderInlines : List (Inline i) -> String 80 | renderInlines inlines = 81 | List.map renderInline inlines 82 | |> String.concat 83 | 84 | 85 | renderInline : Inline i -> String 86 | renderInline inline = 87 | case inline of 88 | Text text -> 89 | escape text 90 | 91 | HardLineBreak -> 92 | "
    \n" 93 | 94 | CodeInline text -> 95 | "" ++ escape text ++ "" 96 | 97 | Link href title label -> 98 | "
    andThen (\t -> Just <| " title=\"" ++ t ++ "\"") 103 | |> withDefault "" 104 | ) 105 | ++ ">" 106 | ++ (List.map renderInline label |> String.concat) 107 | ++ "" 108 | 109 | Image src title alt -> 110 | " andThen (\t -> Just <| " title=\"" ++ t ++ "\"") 115 | |> withDefault "" 116 | ) 117 | ++ " alt=\"" 118 | ++ (List.map simplifyInline alt |> String.concat) 119 | ++ "\">" 120 | 121 | HtmlInline tag attrs inlines -> 122 | renderHtmlInline tag attrs inlines 123 | 124 | Emphasis 1 inlines -> 125 | "" ++ renderInlines inlines ++ "" 126 | 127 | Emphasis _ inlines -> 128 | "" ++ renderInlines inlines ++ "" 129 | 130 | Markdown.Inline.Custom _ inlines -> 131 | renderInlines inlines 132 | 133 | 134 | simplifyInlines : List (Inline i) -> String 135 | simplifyInlines inlines = 136 | List.map simplifyInline inlines |> String.concat 137 | 138 | 139 | simplifyInline : Inline i -> String 140 | simplifyInline inline = 141 | case inline of 142 | Text text -> 143 | escape text 144 | 145 | HardLineBreak -> 146 | "\n" 147 | 148 | CodeInline text -> 149 | escape text 150 | 151 | Link _ _ label -> 152 | simplifyInlines label 153 | 154 | Image _ _ alt -> 155 | simplifyInlines alt 156 | 157 | HtmlInline _ _ inlines -> 158 | simplifyInlines inlines 159 | 160 | Emphasis _ inlines -> 161 | simplifyInlines inlines 162 | 163 | Markdown.Inline.Custom _ inlines -> 164 | simplifyInlines inlines 165 | 166 | 167 | renderHtmlInline : 168 | String 169 | -> List ( String, Maybe String ) 170 | -> List (Inline i) 171 | -> String 172 | renderHtmlInline tag attrs inlines = 173 | let 174 | attrsString = 175 | String.concat <| List.map renderAttr attrs 176 | 177 | renderAttr = 178 | \( attr, value ) -> 179 | case value of 180 | Just v -> 181 | " " ++ attr ++ "=\"" ++ escape v ++ "\"" 182 | 183 | Nothing -> 184 | " " ++ attr 185 | in 186 | "<" 187 | ++ tag 188 | ++ attrsString 189 | ++ ">" 190 | ++ renderInlines inlines 191 | ++ "" 194 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/Lang.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Text.Seonbi.Html.Lang 3 | ( LangHtmlEntity (..) 4 | , LanguageTag 5 | , annotateWithLang 6 | , extractLang 7 | , isKorean 8 | , isNeverKorean 9 | ) where 10 | 11 | import Control.Applicative 12 | import Data.Char (isSpace) 13 | import Data.Maybe 14 | 15 | import Data.Attoparsec.Text 16 | import Data.Text 17 | 18 | import Text.Seonbi.Html.Entity 19 | import Text.Seonbi.Html.Tag (HtmlTag) 20 | 21 | -- | Represents a language tag. Although it is defined as an alias for 'Text', 22 | -- it can be structured in the future. Do not use its contents directly. 23 | type LanguageTag = Text 24 | 25 | -- | Extracts the language tag from the given raw HTML attributes if it has 26 | -- @lang@ attribute. 27 | -- 28 | -- >>> extractLang "" 29 | -- Nothing 30 | -- >>> extractLang "lang=en" 31 | -- Just "en" 32 | -- >>> extractLang "lang=\"ko-KR\"" 33 | -- Just "ko-kr" 34 | -- >>> extractLang " lang='ko-Hang'" 35 | -- Just "ko-hang" 36 | extractLang 37 | :: HtmlRawAttrs 38 | -- ^ A raw HTML attributes to extract the language tag from. 39 | -> Maybe LanguageTag 40 | -- ^ A language tag extracted from the given raw HTML attributes. 41 | -- If the given raw HTML attributes does not have @lang@ attribute or 42 | -- its value is invalid, 'Nothing' is returned. 43 | extractLang attrs = 44 | case parseOnly parser' attrs of 45 | Right (Just lang') -> 46 | let lt = toLower . strip . 
normalizeEntities $ lang' 47 | in if Data.Text.null lt then Nothing else Just lt 48 | _ -> Nothing 49 | where 50 | parser' :: Parser (Maybe Text) 51 | parser' = do 52 | skipSpace 53 | attrs' <- langAttr `sepBy` space 54 | skipSpace 55 | return $ listToMaybe $ catMaybes attrs' 56 | langAttr :: Parser (Maybe Text) 57 | langAttr = do 58 | (isLang, cont) <- attrIsLang 59 | value <- if cont then attrValue else return "" 60 | return (if isLang then Just value else Nothing) 61 | attrIsLang :: Parser (Bool, Bool) 62 | attrIsLang = choice 63 | [ asciiCI "lang=" >> return (True, True) 64 | , do { _ <- takeWhile1 (/= '=') 65 | ; eq <- optional (char '=') 66 | ; return (False, isJust eq) 67 | } 68 | ] 69 | attrValue :: Parser Text 70 | attrValue = choice 71 | [ do { skip (== '"'); v <- takeTill (== '"'); skip (== '"'); return v } 72 | , do { skip (== '\'') 73 | ; v <- takeTill (== '\'') 74 | ; skip (== '\''); return v 75 | } 76 | , takeWhile1 (not . isSpace) 77 | ] 78 | normalizeEntities :: Text -> Text 79 | normalizeEntities 80 | = Data.Text.replace "‐" "-" 81 | . Data.Text.replace "‐" "-" 82 | . Data.Text.replace "‐" "-" 83 | . Data.Text.replace "‐" "-" 84 | . Data.Text.replace "‐" "-" 85 | 86 | -- | Annotates 'HtmlEntity' with the 'lang' tag extracted from it or its 87 | -- ancestors. 88 | data LangHtmlEntity = LangHtmlEntity 89 | { -- | The @lang@ tag extracted from the HTML 'entity' or its ancestors. 90 | lang :: Maybe LanguageTag 91 | -- | The annotated HTML 'entity'. 92 | , entity :: HtmlEntity 93 | } deriving (Show, Eq) 94 | 95 | -- | Annotates the given HTML entities with the language tag extracted from 96 | -- their @lang@ attributes. If a parent entity has @lang@ attribute, its 97 | -- all descendants are annotated with the same language tag. 98 | annotateWithLang :: [HtmlEntity] -> [LangHtmlEntity] 99 | annotateWithLang = 100 | annotate [] 101 | where 102 | annotate :: [(HtmlTag, Maybe LanguageTag)] 103 | -> [HtmlEntity] 104 | -> [LangHtmlEntity] 105 | annotate _ [] = [] 106 | annotate stack (x@HtmlStartTag { tag = tag', rawAttributes = attrs } : xs) = 107 | LangHtmlEntity thisLang x : annotate nextStack xs 108 | where 109 | parentLang :: Maybe LanguageTag 110 | parentLang = case stack of 111 | (_, l):_ -> l 112 | _ -> Nothing 113 | thisLang :: Maybe LanguageTag 114 | thisLang = extractLang attrs <|> parentLang 115 | nextStack :: [(HtmlTag, Maybe LanguageTag)] 116 | nextStack = (tag', thisLang) : stack 117 | annotate stack (x@HtmlEndTag { tag = tag' } : xs) = 118 | LangHtmlEntity thisLang x : annotate nextStack xs 119 | where 120 | (nextStack, thisLang) = case stack of 121 | [] -> ([], Nothing) 122 | s@((t, lang'):ys) -> 123 | (if t == tag' then ys else s, lang') 124 | annotate stack (x : xs) = 125 | LangHtmlEntity parentLang x : annotate stack xs 126 | where 127 | parentLang :: Maybe LanguageTag 128 | parentLang = case stack of 129 | (_, l):_ -> l 130 | _ -> Nothing 131 | 132 | -- | Determines whether the given language tag refers to any kind of Korean. 
133 | -- 134 | -- >>> isKorean "ko" 135 | -- True 136 | -- >>> isKorean "ko-KR" 137 | -- True 138 | -- >>> isKorean "kor-Hang" 139 | -- True 140 | -- >>> isKorean "en" 141 | -- False 142 | -- >>> isKorean "en-KR" 143 | -- False 144 | isKorean :: LanguageTag -> Bool 145 | isKorean lang' = 146 | l == "ko" || l == "kor" || 147 | "ko-" `isPrefixOf` l || 148 | "kor-" `isPrefixOf` l 149 | where 150 | l :: Text 151 | l = toLower lang' 152 | 153 | -- | Determines whether the given language tag undoubtedly does not refer 154 | -- to any kind of Korean. 155 | -- 156 | -- >>> isNeverKorean $ Just "ko" 157 | -- False 158 | -- >>> isNeverKorean $ Just "ko-KR" 159 | -- False 160 | -- >>> isNeverKorean Nothing 161 | -- False 162 | -- >>> isNeverKorean $ Just "en" 163 | -- True 164 | isNeverKorean :: Maybe LanguageTag -> Bool 165 | isNeverKorean Nothing = False 166 | isNeverKorean (Just lang') = not (isKorean lang') 167 | -------------------------------------------------------------------------------- /src/Text/Seonbi/Html/Scanner.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE TupleSections #-} 3 | module Text.Seonbi.Html.Scanner 4 | ( Result (..) 5 | , scanHtml 6 | ) where 7 | 8 | import Data.Char 9 | import Prelude hiding (takeWhile) 10 | 11 | import Data.Attoparsec.Text.Lazy 12 | import Data.Map.Strict 13 | import qualified Data.Text 14 | import qualified Data.Text.Lazy 15 | 16 | import Text.Seonbi.Html.Entity 17 | import Text.Seonbi.Html.Tag 18 | import Text.Seonbi.Html.TagStack 19 | 20 | htmlFragments :: Parser [HtmlEntity] 21 | htmlFragments = do 22 | result <- option [] $ fragments Text.Seonbi.Html.TagStack.empty 23 | txt <- htmlText Text.Seonbi.Html.TagStack.empty 24 | endOfInput 25 | return $ case txt of 26 | HtmlText { rawText = "" } -> result 27 | _ -> result ++ [txt] 28 | 29 | fragments :: HtmlTagStack -> Parser [HtmlEntity] 30 | fragments tagStack' = do 31 | txt <- htmlText tagStack' 32 | (entities, nextStack) <- htmlEntity tagStack' 33 | nextChunk <- option [] $ fragments nextStack 34 | let chunks = entities ++ nextChunk 35 | return $ case txt of 36 | HtmlText { rawText = "" } -> chunks 37 | txt' -> txt' : chunks 38 | 39 | htmlText :: HtmlTagStack -> Parser HtmlEntity 40 | htmlText tagStack' = do 41 | texts <- many' textFragment 42 | return $ mkText $ Data.Text.concat texts 43 | where 44 | mkText :: Data.Text.Text -> HtmlEntity 45 | mkText txt = HtmlText { tagStack = tagStack', rawText = txt } 46 | 47 | textFragment :: Parser Data.Text.Text 48 | textFragment = choice 49 | [ takeWhile1 (/= '<') 50 | , do 51 | a <- char '<' 52 | b <- satisfy $ \ c -> 53 | not (c == '!' || c == '/' || isAsciiUpper c || isAsciiLower c) 54 | return $ Data.Text.pack [a, b] 55 | ] 56 | 57 | htmlEntity :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack) 58 | htmlEntity tagStack' = choice 59 | [ htmlComment tagStack' 60 | , cdata tagStack' 61 | , startTag tagStack' 62 | , endTag tagStack' 63 | -- fallback: 64 | , (, tagStack') . (: []) . HtmlText tagStack' . 
Data.Text.singleton 65 | <$> anyChar 66 | ] 67 | 68 | -- https://www.w3.org/TR/html5/syntax.html#comments 69 | htmlComment :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack) 70 | htmlComment tagStack' = do 71 | _ <- string "" 84 | return 85 | ( [ HtmlComment 86 | { tagStack = tagStack' 87 | , comment = Data.Text.concat contents 88 | } 89 | ] 90 | , tagStack' 91 | ) 92 | 93 | -- https://www.w3.org/TR/html5/syntax.html#cdata-sections 94 | cdata :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack) 95 | cdata tagStack' = do 96 | _ <- string "' 106 | return $ Data.Text.snoc a b 107 | ] 108 | _ <- string "]]>" 109 | return 110 | ( [HtmlCdata { tagStack = tagStack', text = Data.Text.concat contents }] 111 | , tagStack' 112 | ) 113 | 114 | -- https://www.w3.org/TR/html5/syntax.html#start-tags 115 | startTag :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack) 116 | startTag tagStack' = do 117 | _ <- char '<' 118 | tag' <- htmlTag 119 | attributes <- many' $ choice 120 | [ do 121 | s <- char '"' 122 | c <- takeWhile (/= '"') 123 | e <- char '"' 124 | return (Data.Text.cons s $ Data.Text.snoc c e) 125 | , do 126 | s <- char '\'' 127 | c <- takeWhile (/= '\'') 128 | e <- char '\'' 129 | return (Data.Text.cons s $ Data.Text.snoc c e) 130 | , takeWhile1 $ \ c -> c /= '"' && c /= '\'' && c /= '/' && c /= '>' 131 | ] 132 | selfClosing <- option ' ' $ char '/' 133 | _ <- char '>' 134 | let (trailingEntities, nextTagStack) = 135 | if selfClosing == '/' || htmlTagKind tag' == Void 136 | then ([HtmlEndTag { tagStack = tagStack', tag = tag' }], tagStack') 137 | else ([], push tag' tagStack') 138 | return 139 | ( HtmlStartTag 140 | { tagStack = tagStack' 141 | , tag = tag' 142 | , rawAttributes = Data.Text.concat attributes 143 | } : trailingEntities 144 | , nextTagStack 145 | ) 146 | 147 | -- https://www.w3.org/TR/html5/syntax.html#end-tags 148 | endTag :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack) 149 | endTag tagStack' = do 150 | _ <- string "' 153 | return $ case htmlTagKind tag' of 154 | Void -> ([], tagStack') 155 | _ -> 156 | let 157 | nextTagStack = pop tag' tagStack' 158 | in 159 | ( [HtmlEndTag { tagStack = nextTagStack, tag = tag' }] 160 | , nextTagStack 161 | ) 162 | 163 | htmlTag :: Parser HtmlTag 164 | htmlTag = do 165 | name <- tagName 166 | case Data.Map.Strict.lookup (Data.Text.toLower name) htmlTagNames of 167 | Just t -> return t 168 | _ -> fail ("failed to parse; invalid tag: " ++ Data.Text.unpack name) 169 | 170 | tagName :: Parser Data.Text.Text 171 | tagName = do 172 | first <- satisfy $ \ c -> isAsciiUpper c || isAsciiLower c 173 | rest <- takeWhile $ \ c -> isAsciiUpper c || isAsciiLower c || isDigit c 174 | return $ Data.Text.cons first rest 175 | 176 | scanHtml :: Data.Text.Lazy.Text -> Result [HtmlEntity] 177 | scanHtml = parse htmlFragments 178 | -------------------------------------------------------------------------------- /test/Text/Seonbi/Html/ClipperSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLists #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text.Seonbi.Html.ClipperSpec (spec) where 4 | 5 | import Control.Monad 6 | 7 | import Data.Text 8 | import Test.Hspec 9 | 10 | import Text.Seonbi.Html.Clipper 11 | import Text.Seonbi.Html.Entity 12 | import Text.Seonbi.Html.Tag 13 | 14 | spec :: Spec 15 | spec = do 16 | describe "clipPrefixText" $ do 17 | it "returns Nothing if entities are empty and a prefix is not empty" $ 18 | clipPrefixText "foo" [] `shouldBe` Nothing 19 | it "returns 
Nothing if the first entity is not an HtmlText" $ 20 | forM_ (["", "foo"] :: [Text]) $ \ prefix -> do 21 | clipPrefixText prefix 22 | [ HtmlStartTag [] P "" 23 | , HtmlText [P] "foo" 24 | , HtmlEndTag [] P 25 | ] `shouldBe` Nothing 26 | clipPrefixText prefix 27 | [ HtmlComment [] "foo" 28 | , HtmlStartTag [] P "" 29 | , HtmlText [P] "foo" 30 | , HtmlEndTag [] P 31 | ] `shouldBe` Nothing 32 | clipPrefixText prefix [HtmlEndTag [] P] `shouldBe` Nothing 33 | clipPrefixText prefix [HtmlCdata [] "foo"] `shouldBe` Nothing 34 | it "returns Just [] if entities are empty and a prefix is empty too" $ 35 | clipPrefixText "" [] `shouldBe` Just [] 36 | it "returns entities with the prefix text dropped" $ do 37 | clipPrefixText "foo" 38 | [ HtmlText [] "foobar" 39 | , HtmlStartTag [] P "" 40 | , HtmlText [P] "foo" 41 | , HtmlEndTag [] P 42 | ] 43 | `shouldBe` Just 44 | [ HtmlText [] "bar" 45 | , HtmlStartTag [] P "" 46 | , HtmlText [P] "foo" 47 | , HtmlEndTag [] P 48 | ] 49 | clipPrefixText "foo" 50 | [ HtmlText [] "foo" 51 | , HtmlStartTag [] P "" 52 | , HtmlText [P] "foo" 53 | , HtmlEndTag [] P 54 | ] 55 | `shouldBe` Just 56 | [ HtmlStartTag [] P "" 57 | , HtmlText [P] "foo" 58 | , HtmlEndTag [] P 59 | ] 60 | it "ignores HtmlComment entities but preserves them" $ do 61 | clipPrefixText "foo" 62 | [ HtmlComment [] "comment" 63 | , HtmlText [] "foobar" 64 | , HtmlStartTag [] P "" 65 | , HtmlText [P] "foo" 66 | , HtmlEndTag [] P 67 | ] 68 | `shouldBe` Just 69 | [ HtmlComment [] "comment" 70 | , HtmlText [] "bar" 71 | , HtmlStartTag [] P "" 72 | , HtmlText [P] "foo" 73 | , HtmlEndTag [] P 74 | ] 75 | clipPrefixText "foo" 76 | [ HtmlComment [] "comment" 77 | , HtmlText [] "foo" 78 | , HtmlStartTag [] P "" 79 | , HtmlText [P] "foo" 80 | , HtmlEndTag [] P 81 | ] 82 | `shouldBe` Just 83 | [ HtmlComment [] "comment" 84 | , HtmlStartTag [] P "" 85 | , HtmlText [P] "foo" 86 | , HtmlEndTag [] P 87 | ] 88 | 89 | describe "clipSuffixText" $ do 90 | it "returns Nothing if entities are empty and a suffix is not empty" $ 91 | clipSuffixText "foo" [] `shouldBe` Nothing 92 | it "returns Nothing if the last entity is not an HtmlText" $ 93 | forM_ (["", "foo"] :: [Text]) $ \ suffix -> do 94 | clipSuffixText suffix 95 | [ HtmlStartTag [] P "" 96 | , HtmlText [P] "foo" 97 | , HtmlEndTag [] P 98 | ] `shouldBe` Nothing 99 | clipSuffixText suffix 100 | [ HtmlStartTag [] P "" 101 | , HtmlText [P] "foo" 102 | , HtmlEndTag [] P 103 | , HtmlComment [] "foo" 104 | ] `shouldBe` Nothing 105 | clipSuffixText suffix [HtmlEndTag [] P] `shouldBe` Nothing 106 | clipSuffixText suffix [HtmlCdata [] "foo"] `shouldBe` Nothing 107 | it "returns Just [] if entities are empty and a suffix is empty too" $ 108 | clipSuffixText "" [] `shouldBe` Just [] 109 | it "returns entities with the suffix text dropped" $ do 110 | clipSuffixText "bar" 111 | [ HtmlStartTag [] P "" 112 | , HtmlText [P] "foo" 113 | , HtmlEndTag [] P 114 | , HtmlText [] "foobar" 115 | ] 116 | `shouldBe` Just 117 | [ HtmlStartTag [] P "" 118 | , HtmlText [P] "foo" 119 | , HtmlEndTag [] P 120 | , HtmlText [] "foo" 121 | ] 122 | clipSuffixText "foo" 123 | [ HtmlStartTag [] P "" 124 | , HtmlText [P] "foo" 125 | , HtmlEndTag [] P 126 | , HtmlText [] "foo" 127 | ] 128 | `shouldBe` Just 129 | [ HtmlStartTag [] P "" 130 | , HtmlText [P] "foo" 131 | , HtmlEndTag [] P 132 | ] 133 | it "ignores HtmlComment entities but preserves them" $ do 134 | clipSuffixText "bar" 135 | [ HtmlStartTag [] P "" 136 | , HtmlText [P] "foo" 137 | , HtmlEndTag [] P 138 | , HtmlText [] "foobar" 139 | , HtmlComment
[] "comment" 140 | ] 141 | `shouldBe` Just 142 | [ HtmlStartTag [] P "" 143 | , HtmlText [P] "foo" 144 | , HtmlEndTag [] P 145 | , HtmlText [] "foo" 146 | , HtmlComment [] "comment" 147 | ] 148 | clipSuffixText "foo" 149 | [ HtmlStartTag [] P "" 150 | , HtmlText [P] "foo" 151 | , HtmlEndTag [] P 152 | , HtmlText [] "foo" 153 | , HtmlComment [] "comment" 154 | ] 155 | `shouldBe` Just 156 | [ HtmlStartTag [] P "" 157 | , HtmlText [P] "foo" 158 | , HtmlEndTag [] P 159 | , HtmlComment [] "comment" 160 | ] 161 | 162 | specify "clipText" $ do 163 | clipText "foo" "baz" 164 | [ HtmlText [] "foo" 165 | , HtmlStartTag [] P "" 166 | , HtmlText [P] "bar" 167 | , HtmlEndTag [] P 168 | , HtmlText [] "baz" 169 | ] `shouldBe` Just 170 | [ HtmlStartTag [] P "" 171 | , HtmlText [P] "bar" 172 | , HtmlEndTag [] P 173 | ] 174 | clipText "foo" "quux" 175 | [ HtmlText [] "foobar" 176 | , HtmlStartTag [] P "" 177 | , HtmlText [P] "baz" 178 | , HtmlEndTag [] P 179 | , HtmlText [] "quxquux" 180 | ] `shouldBe` Just 181 | [ HtmlText [] "bar" 182 | , HtmlStartTag [] P "" 183 | , HtmlText [P] "baz" 184 | , HtmlEndTag [] P 185 | , HtmlText [] "qux" 186 | ] 187 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Seonbi: SmartyPants for Korean language 2 | ======================================= 3 | 4 | [![][releases-badge]][releases] [![][hackage-badge]][hackage] [![][dockerhub-badge]][dockerhub] [![][ci-status-badge]][ci] 5 | 6 | [![](https://dahlia.github.io/seonbi/showcase.svg)][demo web app] 7 | 8 | (TL;DR: See the [demo web app].) 9 | 10 | Seonbi (선비) is an HTML preprocessor that makes typographic adjustments 11 | to an HTML so that the result uses accurate punctuations according to 12 | the modern Korean orthography. 13 | (It's similar to what [SmartyPants] does for text written in English.) 14 | 15 | It also transforms `ko-Kore` text (國漢文混用; [Korean mixed script]) into 16 | `ko-Hang` text (한글전용; Hangul-only script). 17 | 18 | Seonbi provides a Haskell library, a CLI, and an HTTP API; any of them can 19 | perform the following transformations: 20 | 21 | - All hanja words (e.g., `漢字`) into corresponding hangul-only words 22 | (e.g., `한자`) 23 | - Straight quotes and apostrophes (`"` & `'`) into curly quotes HTML 24 | entities (`“`, `”`, `‘`, & `’`) 25 | - Three consecutive periods (`...` or `。。。`) into an ellipsis entity (`…`) 26 | - Classical (Chinese-style) stops (`。`, `、`, `?`, & `!`) into modern 27 | (English-style) stops (`.`, `,`, `?`, & `!`) 28 | - Pairs of less-than and greater-than inequality symbols (`<` & `>`) into 29 | pairs of proper angle quotes (`〈` & `〉`) 30 | - Pairs of two consecutive inequality symbols (`<<` & `>>`) into 31 | pairs of proper double angle quotes (`《` & `》`) 32 | - A hyphen (`-`) or hangul vowel *eu* (`ㅡ`) surrounded by spaces, or 33 | two/three consecutive hyphens (`--` or `---`) into a proper em dash (`—`) 34 | - A less-than inequality symbol followed by a hyphen or an equality 35 | symbol (`<-`, `<=`) into arrows to the left (`←`, `⇐`) 36 | - A hyphen or an equality symbol followed by a greater-than inequality 37 | symbol (`->`, `=>`) into arrows to the right (`→`, `⇒`) 38 | - A hyphen or an equality symbol wrapped by inequality symbols (`<->`, `<=>`) 39 | into bi-directional arrows (`↔`, `⇔`) 40 | 41 | Each transformations can be partially turned on and off, and some 42 | transformations have many options. 
43 | 44 | All transformations work with both plain text and rich text trees. 45 | In a similar way to SmartyPants, it does not modify characters within 46 | several sensitive HTML elements like
    `/``/`
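As a hedged illustration of the preservation rule above (hypothetical input, not from the test suite, and assuming `<code>` is among the preserved elements, as it is for SmartyPants-style tools):

```html
<!-- Input -->
<p>다음과 같이 씁니다: <code>print("...")</code> 그러면 끝...</p>

<!-- Possible output: the prose outside <code> is transformed,
     while the quotes and periods inside <code> stay untouched. -->
<p>다음과 같이 씁니다: <code>print("...")</code> 그러면 끝…</p>
```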