-- , HtmlText [] " baz!"
32 | -- ]
33 | -- :}
34 | -- [HtmlText {tagStack = fromList [], rawText = "foo &lt;bar&gt; baz!"}]
35 | normalizeText :: [HtmlEntity] -> [HtmlEntity]
36 | normalizeText fragments =
37 | [ case map normalizeCdata frags of
38 | [f] ->
39 | f
40 | frags'@(HtmlText { tagStack = s }:_) ->
41 | HtmlText
42 | { tagStack = s
43 | , rawText = Data.Text.concat $ map rawText frags'
44 | }
45 | frags' ->
46 | throw $ AssertionFailed
47 | ("Unexpected error occured; grouping does not work well: " ++
48 | show frags')
49 | | frags <- groupBy isSibling fragments
50 | ]
51 | where
52 | isSibling :: HtmlEntity -> HtmlEntity -> Bool
53 | isSibling HtmlText { tagStack = a } HtmlText { tagStack = b } = a == b
54 | isSibling HtmlText { tagStack = a } HtmlCdata { tagStack = b } = a == b
55 | isSibling HtmlCdata { tagStack = a } HtmlText { tagStack = b } = a == b
56 | isSibling HtmlCdata { tagStack = a } HtmlCdata { tagStack = b } = a == b
57 | isSibling _ _ = False
58 |
59 | -- | Transform a given 'HtmlCdata' node into an equivalent 'HtmlText' node.
60 | --
61 | -- >>> import Text.Seonbi.Html.Tag
62 | -- >>> normalizeCdata HtmlCdata { tagStack = [P], text = "<p id=\"foo\">" }
63 | -- HtmlText {tagStack = fromList [P], rawText = "&lt;p id=&quot;foo&quot;&gt;"}
64 | normalizeCdata :: HtmlEntity -> HtmlEntity
65 | normalizeCdata HtmlCdata { tagStack = s, text = t } =
66 | HtmlText { tagStack = s, rawText = escapeHtmlEntities t }
67 | normalizeCdata entity = entity
68 |
69 | -- | Escape special (control) characters into corresponding character entities
70 | -- in the given HTML text.
71 | --
72 | -- >>> escapeHtmlEntities "<foo & \"bar\">"
73 | -- "&lt;foo &amp; &quot;bar&quot;&gt;"
74 | escapeHtmlEntities :: Text -> Text
75 | escapeHtmlEntities =
76 | Data.Text.concatMap $ \ case
77 | '<' -> "&lt;"
78 | '>' -> "&gt;"
79 | '&' -> "&amp;"
80 | '"' -> "&quot;"
81 | c -> Data.Text.singleton c
82 |
--------------------------------------------------------------------------------
/test/Text/Seonbi/Html/TextNormalizerSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedLists #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 | module Text.Seonbi.Html.TextNormalizerSpec (spec) where
4 |
5 | import Control.Monad
6 |
7 | import Test.Hspec
8 |
9 | import Text.Seonbi.Html.Entity
10 | import Text.Seonbi.Html.Tag
11 | import Text.Seonbi.Html.TagStack
12 | import Text.Seonbi.Html.TextNormalizer
13 |
14 | spec :: Spec
15 | spec = do
16 | specify "normalizeText" $
17 | normalizeText
18 | [ HtmlText { tagStack = [], rawText = "foo " }
19 | , HtmlText { tagStack = [], rawText = "& bar" }
20 | , HtmlCdata { tagStack = [], text = " & baz " }
21 | , HtmlStartTag { tagStack = [], tag = P, rawAttributes = "" }
22 | , HtmlText { tagStack = [P], rawText = "qux " }
23 | , HtmlCdata { tagStack = [P], text = "& \"quux\"" }
24 | , HtmlEndTag { tagStack = [], tag = P }
25 | , HtmlCdata { tagStack = [], text = " <end>" }
26 | ] `shouldBe`
27 | [ HtmlText { tagStack = [], rawText = "foo & bar &amp; baz " }
28 | , HtmlStartTag { tagStack = [], tag = P, rawAttributes = "" }
29 | , HtmlText
30 | { tagStack = [P]
31 | , rawText = "qux & "quux""
32 | }
33 | , HtmlEndTag { tagStack = [], tag = P }
34 | , HtmlText { tagStack = [], rawText = " &lt;end&gt;" }
35 | ]
36 |
37 | describe "normalizeCdata" $ do
38 | let s1 = [] :: HtmlTagStack
39 | let s2 = [Div, P] :: HtmlTagStack
40 | specify "HtmlStartTag" $ do
41 | let entity1 = HtmlStartTag
42 | { tagStack = s1
43 | , tag = P
44 | , rawAttributes = ""
45 | }
46 | normalizeCdata entity1 `shouldBe` entity1
47 | let entity2 = HtmlStartTag
48 | { tagStack = s2
49 | , tag = P
50 | , rawAttributes = " class=\"entity2\""
51 | }
52 | normalizeCdata entity2 `shouldBe` entity2
53 | let stacks = [s1, s2] :: [HtmlTagStack]
54 | forM_ stacks $ \ s -> do
55 | specify ("HtmlEndTag: " ++ show s) $ do
56 | let e = HtmlEndTag { tagStack = s, tag = P }
57 | normalizeCdata e `shouldBe` e
58 | specify ("HtmlText: " ++ show s) $ do
59 | let e = HtmlText { tagStack = s, rawText = "foo & bar" }
60 | normalizeCdata e `shouldBe` e
61 | specify ("HtmlComment: " ++ show s) $ do
62 | let e = HtmlComment { tagStack = s, comment = "foo" }
63 | normalizeCdata e `shouldBe` e
64 | specify ("HtmlCdata: " ++ show s) $ do
65 | let e = HtmlCdata { tagStack = s, text = "<p>foo & bar</p>" }
66 | normalizeCdata e `shouldBe`
67 | HtmlText
68 | { tagStack = s
69 | , rawText = "<p>foo & bar</p>"
70 | }
71 |
72 | specify "escapeHtmlEntities" $ do
73 | escapeHtmlEntities "" `shouldBe`
74 | "<p id="foo">"
75 | escapeHtmlEntities "AT&T" `shouldBe`
76 | "AT&T"
77 |
--------------------------------------------------------------------------------
/scripts/deno/test.ts:
--------------------------------------------------------------------------------
1 | import {
2 | Configuration,
3 | DEFAULT_CONFIGURATION,
4 | Options,
5 | Seonbi,
6 | transform,
7 | } from "./mod.ts";
8 | import { assertEquals } from "https://deno.land/std@0.106.0/testing/asserts.ts";
9 |
10 | const hanjaInParens: Options = {
11 | contentType: "text/html",
12 | quote: "CurvedQuotes",
13 | cite: null,
14 | arrow: null,
15 | ellipsis: false,
16 | emDash: false,
17 | stop: null,
18 | hanja: {
19 | rendering: "HanjaInParentheses",
20 | reading: {
21 | initialSoundLaw: true,
22 | useDictionaries: ["kr-stdict"],
23 | dictionary: {},
24 | },
25 | },
26 | };
27 |
28 | const customDict: Options = {
29 | ...hanjaInParens,
30 | hanja: {
31 | rendering: "HanjaInParentheses",
32 | reading: {
33 | initialSoundLaw: true,
34 | useDictionaries: [],
35 | dictionary: { "言語": "말", "文字": "글" },
36 | },
37 | },
38 | };
39 |
40 | let config: Configuration = {
41 | ...DEFAULT_CONFIGURATION,
42 | process: { distType: "nightly" },
43 | };
44 |
45 | try {
46 | const binPath = Deno.env.get("SEONBI_API");
47 | if (binPath != null && "process" in config) config.process = { binPath };
48 | } catch (e) {
49 | if (!(e instanceof Deno.errors.PermissionDenied)) throw e;
50 | }
51 |
52 | try {
53 | const port = Deno.env.get("SEONBI_API_PORT");
54 | if (port != null && port.match(/^[0-9]+$/) && "process" in config) {
55 | config.port = parseInt(port);
56 | }
57 | } catch (e) {
58 | if (!(e instanceof Deno.errors.PermissionDenied)) throw e;
59 | }
60 |
61 | try {
62 | const apiUrl = Deno.env.get("SEONBI_API_URL");
63 | if (apiUrl != null) config = { apiUrl };
64 | } catch (e) {
65 | if (!(e instanceof Deno.errors.PermissionDenied)) throw e;
66 | }
67 |
68 | Deno.test("transform()", async () => {
69 | const koKr = await transform("<p>言語와 文字</p>", config);
70 | assertEquals(koKr, "<p>언어와 문자</p>");
71 | });
72 |
73 | Deno.test("Seonbi#start()", async () => {
74 | const seonbi = new Seonbi(config);
75 | await seonbi.start();
76 | try {
77 | for (let i = 0; i < 5; i++) {
78 | try {
79 | const response = await fetch(seonbi.apiUrl);
80 | assertEquals(
81 | { message: "Unsupported method: GET", success: false },
82 | await response.json(),
83 | );
84 | break;
85 | } catch (e) {
86 | if (
87 | !(e instanceof TypeError) ||
88 | e.message.indexOf("os error 61") < 0 &&
89 | e.message.indexOf("os error 111") < 0
90 | ) {
91 | throw e;
92 | }
93 |
94 | await new Promise((r) => setTimeout(r, 1000));
95 | }
96 | }
97 | } finally {
98 | await seonbi.stop();
99 | }
100 | });
101 |
102 | function withSeonbi(fn: (s: Seonbi) => Promise<void>): () => Promise<void> {
103 | return async () => {
104 | const seonbi = new Seonbi(config);
105 | await seonbi.start();
106 | try {
107 | await fn(seonbi);
108 | } finally {
109 | await seonbi.stop();
110 | }
111 | };
112 | }
113 |
114 | function testWithSeonbi(label: string, fn: (s: Seonbi) => Promise<void>): void {
115 | Deno.test(label, withSeonbi(fn));
116 | }
117 |
118 | testWithSeonbi("Seonbi#transform()", async (seonbi: Seonbi) => {
119 | assertEquals(
120 | await seonbi.transform("<p>言語와 文字</p>"),
121 | "<p>언어와 문자</p>",
122 | );
123 | assertEquals(
124 | await seonbi.transform("<p>言語와 文字</p>", hanjaInParens),
125 | "<p>언어(言語)와 문자(文字)</p>",
126 | );
127 | assertEquals(
128 | await seonbi.transform("<p>言語와 文字</p>", customDict),
129 | "<p>말(言語)와 글(文字)</p>",
130 | );
131 | });
132 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/Unihan/KHangul.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE CPP #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 | {-# LANGUAGE LambdaCase #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE TemplateHaskell #-}
6 | {-# LANGUAGE TypeSynonymInstances #-}
7 | module Text.Seonbi.Unihan.KHangul
8 | ( CharacterSet (..)
9 | , HanjaReadings
10 | , HanjaReadingCitation (..)
11 | , KHangulData
12 | , Purpose (..)
13 | , kHangulData
14 | , kHangulData'
15 | ) where
16 |
17 | import Data.Either
18 |
19 | import Data.Aeson
20 | import Data.Attoparsec.Text
21 | import Data.ByteString.Lazy (fromStrict)
22 | import Data.FileEmbed
23 | import Data.Map.Strict
24 | import Data.Set hiding (empty)
25 | import System.FilePath (takeDirectory, (</>))
26 |
27 | -- $setup
28 | -- >>> import qualified Text.Show.Unicode
29 | -- >>> :set -interactive-print=Text.Show.Unicode.uprint
30 |
31 | -- | Maps all Hanja characters to their possible readings.
32 | type KHangulData = Map Char HanjaReadings
33 |
34 | -- | All readings of a Hanja character.
35 | type HanjaReadings = Map Char HanjaReadingCitation
36 |
37 | -- | Represents what standard a reading of a character belongs to, and the
38 | -- purpose of the reading.
39 | data HanjaReadingCitation =
40 | HanjaReadingCitation CharacterSet (Set Purpose) deriving (Eq, Ord, Show)
41 |
42 | -- | Represents character set standards for Korean writing system.
43 | data CharacterSet
44 | -- | KS X 1001 (정보 교환용 부호계).
45 | = KS_X_1001
46 | -- | KS X 1002 (정보 교환용 부호 확장 세트).
47 | | KS_X_1002
48 | -- | Represents that a Hanja character is not included in any Korean
49 | -- character set standards.
50 | | NonStandard
51 | deriving (Eq, Ord, Show)
52 |
53 | -- | Represents purposes of Hanja characters.
54 | data Purpose
55 | -- | Basic Hanja for educational use (漢文敎育用基礎漢字), a subset of
56 | -- Hanja defined in 1972 by a South Korean standard for educational use.
57 | = Education
58 | -- | Hanja for personal names (人名用漢字).
59 | | PersonalName
60 | deriving (Eq, Ord, Show)
61 |
62 | citationParser :: Parser HanjaReadingCitation
63 | citationParser = do
64 | charset' <- option NonStandard charset
65 | purposes <- many' purpose
66 | return $ HanjaReadingCitation charset' $ Data.Set.fromList purposes
67 | where
68 | charset :: Parser CharacterSet
69 | charset = do
70 | d <- digit
71 | case d of
72 | '0' -> return KS_X_1001
73 | '1' -> return KS_X_1002
74 | c -> fail ("Invalid kHangul character set code: " ++ show c)
75 | purpose :: Parser Purpose
76 | purpose = do
77 | l <- letter
78 | case l of
79 | 'E' -> return Education
80 | 'N' -> return PersonalName
81 | c -> fail ("Invalid kHangul purpose code: " ++ show c)
82 |
83 | instance FromJSON HanjaReadingCitation where
84 | parseJSON =
85 | withText "kHangul value (e.g., 0E, 1N, 0EN)" $ \ t ->
86 | case parseOnly (citationParser <* endOfInput) t of
87 | Right cite -> return cite
88 | Left msg -> fail msg
89 |
90 | kHangulData' :: Either String KHangulData
91 | kHangulData' = eitherDecode $
92 | fromStrict $(embedFile $ takeDirectory __FILE__ </> "kHangul.json")
93 |
94 | -- | Data that map Hanja characters to their corresponding kHangul entries
95 | -- (i.e., Hanja readings and citations).
96 | --
97 | -- >>> import Data.Map.Strict as M
98 | -- >>> let Just entries = M.lookup '天' kHangulData
99 | -- >>> entries
100 | -- fromList [('천',HanjaReadingCitation KS_X_1001 (fromList [Education]))]
101 | kHangulData :: KHangulData
102 | kHangulData = fromRight empty kHangulData'
103 |
104 | {- HLINT ignore "Unused LANGUAGE pragma" -}
105 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/Html/Printer.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE LambdaCase #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 | module Text.Seonbi.Html.Printer
4 | ( printHtml
5 | , printText
6 | , printXhtml
7 | ) where
8 |
9 | import Data.Char
10 | import Data.List
11 |
12 | import qualified Data.Text
13 | import Data.Text.Lazy
14 | import Data.Text.Lazy.Builder
15 | import HTMLEntities.Decoder
16 |
17 | import Text.Seonbi.Html.Entity
18 | import Text.Seonbi.Html.Tag
19 |
20 | -- $setup
21 | -- >>> :set -XOverloadedStrings
22 | -- >>> import Text.Seonbi.Html.Scanner
23 | -- >>> :set -interactive-print=Text.Show.Unicode.uprint
24 |
25 | -- | Print the list of 'HtmlEntity' into a lazy 'Text'.
26 | --
27 | -- >>> let Done "" tokens = scanHtml "<p>Hello,<br>\nworld<em>!</em></p>"
28 | -- >>> printHtml tokens
29 | -- "<p>Hello,<br>\nworld<em>!</em></p>"
30 | printHtml :: [HtmlEntity] -> Text
31 | printHtml = printHtml' False
32 |
33 | -- | Similar to 'printHtml' except it renders void (self-closing) tags
34 | -- like @<br/>@ instead of @<br>@.
35 | --
36 | -- >>> let Done "" tokens = scanHtml "<p>Hello,<br>\nworld<em>!</em></p>"
37 | -- >>> printXhtml tokens
38 | -- "<p>Hello,<br/>\nworld<em>!</em></p>"
39 | --
40 | -- Note that normal tags are not rendered as self-closed; only void tags
41 | -- according to HTML specification are:
42 | --
43 | -- >>> let Done "" tokens' = scanHtml "<div></div>"
44 | -- >>> printXhtml tokens'
45 | -- "<div></div>"
46 | printXhtml :: [HtmlEntity] -> Text
47 | printXhtml = printHtml' True
48 |
49 | printHtml' :: Bool -> [HtmlEntity] -> Text
50 | printHtml' xhtml =
51 | Data.Text.Lazy.concat . Prelude.concatMap render . Data.List.groupBy isVoid
52 | where
53 | isVoid :: HtmlEntity -> HtmlEntity -> Bool
54 | isVoid (HtmlStartTag stck tg _) (HtmlEndTag stck' tg') =
55 | htmlTagKind tg == Void && stck == stck' && tg == tg'
56 | isVoid _ _ = False
57 | render :: [HtmlEntity] -> [Text]
58 | render [a@HtmlStartTag { tag = t, rawAttributes = at }, b@HtmlEndTag {}] =
59 | if isVoid a b
60 | then
61 | [ "<"
62 | , fromStrict (htmlTagName t)
63 | , renderAttrs at
64 | , if xhtml then "/>" else ">"
65 | ]
66 | else e a ++ e b
67 | render entities = Prelude.concatMap e entities
68 | e :: HtmlEntity -> [Text]
69 | e HtmlStartTag { tag = t, rawAttributes = a } =
70 | ["<", fromStrict (htmlTagName t), renderAttrs a, ">"]
71 | e HtmlEndTag { tag = t } = ["</", fromStrict (htmlTagName t), ">"]
72 | e HtmlText { rawText = t } = [fromStrict t]
73 | e HtmlCdata { text = t } = ["<![CDATA[", fromStrict t, "]]>"]
74 | e HtmlComment { comment = c } = ["<!--", fromStrict c, "-->"]
75 | renderAttrs :: Data.Text.Text -> Text
76 | renderAttrs "" = ""
77 | renderAttrs attrs
78 | | isSpace (Data.Text.head attrs) = fromStrict attrs
79 | | otherwise = ' ' `cons` fromStrict attrs
80 |
81 | -- | Print only the text contents (including CDATA sections) without tags
82 | -- into a lazy 'Text'.
83 | --
84 | -- >>> let Done "" tokens = scanHtml "<p>Hello,<br>\nworld<em>!</em></p>"
85 | -- >>> printText tokens
86 | -- "Hello,\nworld!"
87 | --
88 | -- Entities are decoded:
89 | --
90 | -- >>> let Done "" tokens = scanHtml "<p>&lt;&gt;&quot;&amp;</p>"
91 | -- >>> printText tokens
92 | -- "<>\"&"
93 | printText :: [HtmlEntity] -> Text
94 | printText [] = Data.Text.Lazy.empty
95 | printText (x:xs) =
96 | render x <> printText xs
97 | where
98 | render :: HtmlEntity -> Text
99 | render = \ case
100 | HtmlText { rawText = t } -> toLazyText $ htmlEncodedText t
101 | HtmlCdata { text = t } -> fromStrict t
102 | _ -> Data.Text.Lazy.empty
103 |
--------------------------------------------------------------------------------
/test/Text/Seonbi/Html/LangSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedLists #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 | module Text.Seonbi.Html.LangSpec (spec) where
4 |
5 | import Test.Hspec
6 |
7 | import Text.Seonbi.Html
8 | import Text.Seonbi.Html.Lang
9 |
10 | source :: [HtmlEntity]
11 | source =
12 | [ HtmlStartTag
13 | { tagStack = []
14 | , tag = P
15 | , rawAttributes = "id=\"foo\" lang=\"en\""
16 | }
17 | , HtmlText { tagStack = [P], rawText = "English" }
18 | , HtmlEndTag { tagStack = [], tag = P }
19 | , HtmlStartTag { tagStack = [], tag = Div, rawAttributes = "" }
20 | , HtmlStartTag
21 | { tagStack = [Div]
22 | , tag = P
23 | , rawAttributes = "class=bar lang=ja"
24 | }
25 | , HtmlStartTag { tagStack = [Div, P], tag = B, rawAttributes = "" }
26 | , HtmlText { tagStack = [Div, P, B], rawText = "日本語" }
27 | , HtmlEndTag { tagStack = [Div, P], tag = B }
28 | , HtmlStartTag
29 | { tagStack = [Div, P]
30 | , tag = Span
31 | , rawAttributes = "lang='yue-Hant'"
32 | }
33 | , HtmlText { tagStack = [Div, P, Span], rawText = "與" }
34 | , HtmlStartTag { tagStack = [Div, P, Span], tag = B, rawAttributes = "" }
35 | , HtmlText { tagStack = [Div, P, Span, B], rawText = "與粵語" }
36 | , HtmlEndTag { tagStack = [Div, P, Span], tag = B }
37 | , HtmlEndTag { tagStack = [Div, P], tag = Span }
38 | , HtmlEndTag { tagStack = [Div], tag = P }
39 | , HtmlEndTag { tagStack = [], tag = Div }
40 | ]
41 |
42 | annotated :: [LangHtmlEntity]
43 | annotated =
44 | [ LangHtmlEntity
45 | (Just "en")
46 | HtmlStartTag
47 | { tagStack = []
48 | , tag = P
49 | , rawAttributes = "id=\"foo\" lang=\"en\""
50 | }
51 | , LangHtmlEntity
52 | (Just "en")
53 | HtmlText { tagStack = [P], rawText = "English" }
54 | , LangHtmlEntity (Just "en") HtmlEndTag { tagStack = [], tag = P }
55 | , LangHtmlEntity
56 | Nothing
57 | HtmlStartTag { tagStack = [], tag = Div, rawAttributes = "" }
58 | , LangHtmlEntity
59 | (Just "ja")
60 | HtmlStartTag
61 | { tagStack = [Div]
62 | , tag = P
63 | , rawAttributes = "class=bar lang=ja"
64 | }
65 | , LangHtmlEntity
66 | (Just "ja")
67 | HtmlStartTag { tagStack = [Div, P], tag = B, rawAttributes = "" }
68 | , LangHtmlEntity
69 | (Just "ja")
70 | HtmlText { tagStack = [Div, P, B], rawText = "日本語" }
71 | , LangHtmlEntity (Just "ja") HtmlEndTag { tagStack = [Div, P], tag = B }
72 | , LangHtmlEntity
73 | (Just "yue-hant")
74 | HtmlStartTag
75 | { tagStack = [Div, P]
76 | , tag = Span
77 | , rawAttributes = "lang='yue-Hant'"
78 | }
79 | , LangHtmlEntity
80 | (Just "yue-hant")
81 | HtmlText { tagStack = [Div, P, Span], rawText = "與" }
82 | , LangHtmlEntity
83 | (Just "yue-hant")
84 | HtmlStartTag { tagStack = [Div, P, Span], tag = B, rawAttributes = "" }
85 | , LangHtmlEntity
86 | (Just "yue-hant")
87 | HtmlText { tagStack = [Div, P, Span, B], rawText = "與粵語" }
88 | , LangHtmlEntity
89 | (Just "yue-hant")
90 | HtmlEndTag { tagStack = [Div, P, Span], tag = B }
91 | , LangHtmlEntity
92 | (Just "yue-hant")
93 | HtmlEndTag { tagStack = [Div, P], tag = Span }
94 | , LangHtmlEntity (Just "ja") HtmlEndTag { tagStack = [Div] , tag = P }
95 | , LangHtmlEntity Nothing HtmlEndTag { tagStack = [], tag = Div }
96 | ]
97 |
98 | spec :: Spec
99 | spec = do
100 | specify "extractLang" $ do
101 | extractLang "" `shouldBe` Nothing
102 | extractLang "lang=en" `shouldBe` Just "en"
103 | extractLang "lang=en-US" `shouldBe` Just "en-us"
104 | extractLang "lang='ko-KR'" `shouldBe` Just "ko-kr"
105 | extractLang "lang=\"zh-Hant\"" `shouldBe` Just "zh-hant"
106 | extractLang "lang=\"yue-Hans-HK\"" `shouldBe` Just "yue-hans-hk"
107 | extractLang "id=\"foo\" lang=\"en\"" `shouldBe` Just "en"
108 | extractLang "id=\"foo\" lang=zh-CN class=bar" `shouldBe` Just "zh-cn"
109 | specify "annotateWithLang" $ do
110 | annotateWithLang [] `shouldBe` []
111 | annotateWithLang source `shouldBe` annotated
112 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/Html/Clipper.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE LambdaCase #-}
2 | module Text.Seonbi.Html.Clipper
3 | ( clipPrefixText
4 | , clipSuffixText
5 | , clipText
6 | ) where
7 |
8 | import Control.Monad
9 | import Data.List (dropWhileEnd)
10 |
11 | import Data.Text
12 |
13 | import Text.Seonbi.Html
14 |
15 | -- | Clip the given prefix text and suffix text from the HTML fragments.
16 | -- It is simply the composition of 'clipPrefixText' and 'clipSuffixText'.
17 | -- It returns 'Nothing' if either the prefix or the suffix does not match.
18 | clipText :: Text -> Text -> [HtmlEntity] -> Maybe [HtmlEntity]
19 | clipText prefix suffix =
20 | clipSuffixText suffix <=< clipPrefixText prefix
21 |
22 | -- | Clip the given prefix text from the HTML fragments. If its first
23 | -- text element does not have the same prefix, or the first element is not
24 | -- an 'HtmlText' node, or the list of HTML fragments has nothing at all,
25 | -- it returns 'Nothing'.
26 | --
27 | -- >>> :set -XOverloadedLists
28 | -- >>> :set -XOverloadedStrings
29 | -- >>> clipPrefixText "foo" [HtmlText [] "bar", HtmlStartTag [] P ""]
30 | -- Nothing
31 | -- >>> clipPrefixText "foo" [HtmlStartTag [] P "", HtmlText [] "foo"]
32 | -- Nothing
33 | -- >>> clipPrefixText "foo" []
34 | -- Nothing
35 | --
36 | -- If the first element is an 'HtmlText' node, and its 'rawText' contains
37 | -- the common prefix text, it returns a 'Just' value holding a list of
38 | -- HTML fragments with the common prefix removed.
39 | --
40 | -- >>> clipPrefixText "foo" [HtmlText [] "foobar", HtmlStartTag [] P ""]
41 | -- Just [HtmlText {... "bar"},HtmlStartTag {...}]
42 | -- >>> clipPrefixText "foo" [HtmlText [] "foo", HtmlStartTag [] P ""]
43 | -- Just [HtmlStartTag {..., tag = P, ...}]
44 | --
45 | -- A given text is treated as a raw text, which means even if some HTML
46 | -- entities refer to the same characters, it may fail to match unless
47 | -- they share exactly the same representation, e.g.:
48 | --
49 | -- >>> clipPrefixText "&" [HtmlText [] "&amp;"]
50 | -- Nothing
51 | --
52 | -- In the same manner, it doesn't find a prefix from 'HtmlCdata', e.g.:
53 | --
54 | -- >>> clipPrefixText "foo" [HtmlCdata [] "foo", HtmlStartTag [] P ""]
55 | -- Nothing
56 | --
57 | -- In order to remove a prefix from both 'HtmlText' and 'HtmlCdata',
58 | -- apply 'normalizeText' first so that all 'HtmlCdata' entities are transformed
59 | -- to equivalent 'HtmlText' entities:
60 | --
61 | -- >>> import Text.Seonbi.Html.TextNormalizer (normalizeText)
62 | -- >>> let normalized = normalizeText [HtmlCdata [] "foo", HtmlStartTag [] P ""]
63 | -- >>> clipPrefixText "foo" normalized
64 | -- Just [HtmlStartTag {..., tag = P, ...}]
65 | --
66 | -- Plus, it works even if HTML fragments contain some 'HtmlComment' entities,
67 | -- but these are not touched at all, e.g.:
68 | --
69 | -- >>> clipPrefixText "bar" [HtmlComment [] "foo", HtmlText [] "barbaz"]
70 | -- Just [HtmlComment {... "foo"},HtmlText {... "baz"}]
71 | clipPrefixText :: Text -> [HtmlEntity] -> Maybe [HtmlEntity]
72 | clipPrefixText prefix []
73 | | Data.Text.null prefix = Just []
74 | | otherwise = Nothing
75 | clipPrefixText prefix (x@HtmlComment {} : xs) =
76 | (x :) <$> clipPrefixText prefix xs
77 | clipPrefixText prefix (x@HtmlText { rawText = rawText' } : xs)
78 | | prefix == rawText' = Just xs
79 | | prefix `isPrefixOf` rawText' = Just $
80 | x { rawText = Data.Text.drop (Data.Text.length prefix) rawText' } : xs
81 | | otherwise = Nothing
82 | clipPrefixText _ _ = Nothing
83 |
84 | -- | Clip the given suffix text from the HTML fragments, in the same manner
85 | -- to 'clipPrefixText'.
86 | clipSuffixText :: Text -> [HtmlEntity] -> Maybe [HtmlEntity]
87 | clipSuffixText suffix []
88 | | Data.Text.null suffix = Just []
89 | | otherwise = Nothing
90 | clipSuffixText suffix entities =
91 | case Prelude.last entities' of
92 | e@HtmlText { rawText = rawText' }
93 | | suffix == rawText' -> Just (init' ++ comments)
94 | | suffix `isSuffixOf` rawText' ->
95 | let
96 | sLen = Data.Text.length suffix
97 | rtLen = Data.Text.length rawText'
98 | clipped = Data.Text.take (rtLen - sLen) rawText'
99 | in
100 | Just (init' ++ e { rawText = clipped } : comments)
101 | | otherwise -> Nothing
102 | _ -> Nothing
103 | where
104 | entities' :: [HtmlEntity]
105 | entities' = (`Data.List.dropWhileEnd` entities) $ \ case
106 | HtmlComment {} -> True
107 | _ -> False
108 | init' :: [HtmlEntity]
109 | init' = Prelude.init entities'
110 | comments :: [HtmlEntity]
111 | comments = Prelude.drop (Prelude.length entities') entities
112 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/Trie.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC -fno-warn-orphans #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 | {-# LANGUAGE RankNTypes #-}
4 | {-# LANGUAGE TypeFamilies #-}
5 | -- | A trie from 'Text' keys to values.
6 | module Text.Seonbi.Trie
7 | ( Trie
8 | , elems
9 | , empty
10 | , fromList
11 | , insert
12 | , keys
13 | , lookup
14 | , member
15 | , mergeBy
16 | , null
17 | , singleton
18 | , size
19 | , toList
20 | , unionL
21 | , unionR
22 | ) where
23 |
24 | import Prelude hiding (lookup, null)
25 |
26 | import Control.Monad (ap)
27 | import qualified GHC.Exts
28 |
29 | import Data.ByteString (ByteString)
30 | import Data.Text hiding (empty, null, singleton)
31 | import Data.Text.Encoding (encodeUtf8, decodeUtf8)
32 | import qualified Data.Trie as BTrie
33 |
34 | -- | A trie from 'Text' keys to 'a' values.
35 | newtype Trie a
36 | = Trie (BTrie.Trie a)
37 | deriving (Eq, Show)
38 |
39 | encodeKey :: Text -> ByteString
40 | encodeKey = encodeUtf8
41 |
42 | decodeKey :: ByteString -> Text
43 | decodeKey = decodeUtf8
44 |
45 | -- | The empty trie.
46 | empty :: Trie a
47 | empty = Trie BTrie.empty
48 |
49 | -- | Checks if the trie is empty.
50 | null :: Trie a -> Bool
51 | null (Trie btrie) = BTrie.null btrie
52 |
53 | -- | Constructs a singleton trie.
54 | singleton :: Text -> a -> Trie a
55 | singleton key value = Trie $ BTrie.singleton (encodeKey key) value
56 |
57 | -- | Gets the number of elements in the trie.
58 | size :: Trie a -> Int
59 | size (Trie btrie) = BTrie.size btrie
60 |
61 | fromList' :: [(Text, a)] -> Trie a
62 | fromList' list = Trie $ BTrie.fromList [(encodeKey k, v) | (k, v) <- list]
63 |
64 | toList' :: Trie a -> [(Text, a)]
65 | toList' (Trie btrie) = [(decodeKey k, v) | (k, v) <- BTrie.toList btrie]
66 |
67 | -- | Converts a list of associated pairs into a trie. For duplicate keys,
68 | -- values earlier in the list shadow later ones.
69 | fromList :: [(Text, a)] -> Trie a
70 | fromList = fromList'
71 |
72 | -- | Converts a trie into a list of associated pairs. Keys will be ordered.
73 | toList :: Trie a -> [(Text, a)]
74 | toList = toList'
75 |
76 | -- | Lists all keys in the trie. Keys will be ordered.
77 | keys :: Trie a -> [Text]
78 | keys (Trie btrie) = Prelude.map decodeKey $ BTrie.keys btrie
79 |
80 | -- | Lists all values in the trie. Values are ordered by their associated keys.
81 | elems :: Trie a -> [a]
82 | elems (Trie btrie) = BTrie.elems btrie
83 |
84 | -- | Gets the value associated with a key if it exists.
85 | lookup :: Text -> Trie a -> Maybe a
86 | lookup key (Trie btrie) = BTrie.lookup (encodeKey key) btrie
87 |
88 | -- | Checks if a key has a value in the trie.
89 | member :: Text -> Trie a -> Bool
90 | member key (Trie btrie) = BTrie.member (encodeKey key) btrie
91 |
92 | -- | Inserts a new key into the trie.
93 | insert
94 | :: Text
95 | -- ^ A new key to insert. If there is already the same key in the trie,
96 | -- the existing value is overwritten by the new value.
97 | -> a
98 | -- ^ A value associated to the key.
99 | -> Trie a
100 | -- ^ An existing trie.
101 | -> Trie a
102 | -- ^ The new trie with the inserted key.
103 | insert key value (Trie btrie) = Trie $ BTrie.insert (encodeKey key) value btrie
104 |
105 | -- | Combines two tries, using a function to resolve collisions. This can only
106 | -- define the space of functions between union and symmetric difference but,
107 | -- with those two, all set operations can be defined (albeit inefficiently).
108 | mergeBy :: (a -> a -> Maybe a) -> Trie a -> Trie a -> Trie a
109 | mergeBy f (Trie a) (Trie b) = Trie $ BTrie.mergeBy f a b
110 |
111 | -- | Combines two tries, resolving conflicts by choosing the value from the
112 | -- left (former) trie.
113 | unionL :: Trie a -> Trie a -> Trie a
114 | unionL (Trie left) (Trie right) = Trie $ BTrie.unionL left right
115 |
116 | -- | Combines two tries, resolving conflicts by choosing the value from the
117 | -- right (latter) trie.
118 | unionR :: Trie a -> Trie a -> Trie a
119 | unionR (Trie left) (Trie right) = Trie $ BTrie.unionR left right
120 |
121 | instance Functor Trie where
122 | fmap f (Trie btrie) = Trie $ fmap f btrie
123 |
124 | instance Foldable Trie where
125 | foldMap f (Trie btrie) = foldMap f btrie
126 |
127 | instance Traversable Trie where
128 | traverse f (Trie btrie) = Trie <$> traverse f btrie
129 |
130 | instance Applicative Trie where
131 | pure = singleton ""
132 | (<*>) = ap
133 |
134 | instance Monad Trie where
135 | Trie btrie >>= f = Trie $ btrie >>= (\ v -> case f v of { Trie b -> b })
136 |
137 | instance (Semigroup a) => Semigroup (Trie a) where
138 | (Trie a) <> (Trie b) = Trie (a <> b)
139 |
140 | instance (Monoid a) => Monoid (Trie a) where
141 | mempty = Trie mempty
142 |
143 | instance GHC.Exts.IsList (Trie a) where
144 | type Item (Trie a) = (Text, a)
145 | fromList = fromList'
146 | toList = toList'
147 |
--------------------------------------------------------------------------------
/setup/action.yaml:
--------------------------------------------------------------------------------
1 | name: Setup Seonbi
2 | description: Set up a specific version of Seonbi and add it to the PATH.
3 | author: Hong Minhee
4 | branding:
5 | icon: package
6 | color: gray-dark
7 | inputs:
8 | seonbi-version:
9 | description: >-
10 | Version of Seonbi to install. Note that asterisks can be used to
11 | choose the latest version, e.g., 1.2.*, 1.*, *.
12 | default: "*"
13 | add-to-path:
14 | description: >-
15 | Whether to add the installed seonbi and seonbi-api to the PATH. Turned
16 | on by default.
17 | default: true
18 | outputs:
19 | seonbi-version:
20 | description: Exact version number of the installed Seonbi.
21 | value: ${{ steps.prepare.outputs.seonbi-version }}
22 | seonbi-path:
23 | description: Absolute path of the installed executable seonbi.
24 | value: ${{ steps.prepare.outputs.seonbi-path }}
25 | seonbi-api-path:
26 | description: Absolute path of the installed executable seonbi-api.
27 | value: ${{ steps.prepare.outputs.seonbi-api-path }}
28 | runs:
29 | using: composite
30 | steps:
31 | - id: prepare
32 | shell: python
33 | run: |
34 | from __future__ import print_function
35 | import fnmatch
36 | import json
37 | import os
38 | import os.path
39 | try: from urllib import request as urllib2
40 | except ImportError: import urllib2
41 | import tempfile
42 |
43 | suffixes = {
44 | ('Linux', 'X64'): 'linux-x86_64.tar.bz2',
45 | ('Linux', 'ARM64'): 'linux-arm64.tar.bz2',
46 | ('macOS', 'X64'): 'macos-x86_64.tar.bz2',
47 | ('macOS', 'ARM64'): 'macos-arm64.tar.bz2',
48 | ('Windows', 'X64'): 'win64.zip',
49 | }
50 | os_ = os.environ['RUNNER_OS']
51 | arch = os.environ['RUNNER_ARCH']
52 | try:
53 | suffix = suffixes[os_, arch]
54 | except KeyError:
55 | print(
56 | "::error title=Unsupported OS and architecture::Seonbi doesn't",
57 | 'support {0}/{1}'.format(os_, arch)
58 | )
59 | raise SystemExit(1)
60 |
61 | # TODO: paging
62 | req = urllib2.Request(
63 | 'https://api.github.com/repos/dahlia/seonbi/releases?per_page=100',
64 | headers={'Authorization': 'Bearer ' + os.environ['GH_TOKEN']}
65 | )
66 | res = urllib2.urlopen(req)
67 | tags = json.load(res)
68 | tags.sort(
69 | key=lambda tag: tuple(map(int, tag['tag_name'].split('.'))),
70 | reverse=True
71 | )
72 | res.close()
73 | version_pattern = os.environ['SEONBI_VERSION'].strip()
74 | for tag in tags:
75 | if not fnmatch.fnmatch(tag['tag_name'], version_pattern):
76 | continue
77 | for asset in tag['assets']:
78 | if asset['name'] == 'seonbi-{0}.{1}'.format(tag['tag_name'], suffix):
79 | print('::set-output name=seonbi-version::' + tag['tag_name'])
80 | print(
81 | '::set-output name=download-url::' + asset['browser_download_url']
82 | )
83 | break
84 | else:
85 | continue
86 | break
87 | else:
88 | print(
89 | '::error title=Unsupported platform::Seonbi', version_pattern,
90 | 'does not support', os_, '&', arch + '.'
91 | )
92 |
93 | dir_path = tempfile.mkdtemp('seonbi', dir=os.environ.get('RUNNER_TEMP'))
94 | seonbi_path = os.path.join(
95 | dir_path,
96 | 'seonbi.exe' if os_ == 'Windows' else 'seonbi'
97 | )
98 | seonbi_api_path = os.path.join(
99 | dir_path,
100 | 'seonbi-api.exe' if os_ == 'Windows' else 'seonbi-api'
101 | )
102 | print('::set-output name=dir-path::' + dir_path)
103 | print('::set-output name=seonbi-path::' + seonbi_path)
104 | print('::set-output name=seonbi-api-path::' + seonbi_api_path)
105 | env:
106 | GH_TOKEN: ${{ github.token }}
107 | SEONBI_VERSION: ${{ inputs.seonbi-version }}
108 | # Linux & macOS
109 | - if: runner.os != 'Windows'
110 | shell: bash
111 | run: |
112 | set -e
113 | wget "$DOWNLOAD_URL"
114 | tar xvfj "$(basename "$DOWNLOAD_URL")"
115 | chmod +x seonbi seonbi-api
116 | if [[ "$ADD_TO_PATH" = "true" ]]; then
117 | pwd >> "$GITHUB_PATH"
118 | fi
119 | env:
120 | DOWNLOAD_URL: ${{ steps.prepare.outputs.download-url }}
121 | DIR_PATH: ${{ steps.prepare.outputs.dir-path }}
122 | ADD_TO_PATH: ${{ inputs.add-to-path }}
123 | working-directory: ${{ steps.prepare.outputs.dir-path }}
124 | # Windows
125 | - if: runner.os == 'Windows'
126 | shell: pwsh
127 | run: |
128 | Invoke-WebRequest `
129 | $env:DOWNLOAD_URL `
130 | -OutFile $env:DOWNLOAD_URL.Split("/")[-1]
131 | 7z x $env:DOWNLOAD_URL.Split("/")[-1]
132 | if (ConvertFrom-Json $env:ADD_TO_PATH) {
133 | Add-Content `
134 | -Path $env:GITHUB_PATH `
135 | -Value "$(Get-Location)"
136 | }
137 | env:
138 | DOWNLOAD_URL: ${{ steps.prepare.outputs.download-url }}
139 | DIR_PATH: ${{ steps.prepare.outputs.dir-path }}
140 | ADD_TO_PATH: ${{ inputs.add-to-path }}
141 | working-directory: ${{ steps.prepare.outputs.dir-path }}
142 |
--------------------------------------------------------------------------------
/scripts/showcase-svg/template.svg:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
9 |
121 |
122 | 선비Seonbi transforms:
123 |
124 |
125 |
悠久한 歷史와 傳統에 빛나는 우리 大韓國民은 3·1運動으로
126 | 建立된 大韓民國臨時政府의 法統과 不義에 抗拒한 4·19民主理念을 계승하고,
127 | 祖國의 民主改革과 平和的 統一의 使命에 입각하여 正義·人道와 同胞愛로써
128 | 民族의 團結을 공고히 하고, 모든 社會的 弊習과 不義를 타파하며,
129 | 自律과 調和를 바탕으로 自由民主的 基本秩序를 더욱 확고히 하여
130 | 政治·經濟·社會·文化의 모든 領域에 있어서 各人의 機會를 균등히 하고,
131 | 能力을 最高度로 발휘하게 하며, 自由와 權利에 따르는 責任과 義務를
132 | 완수하게 하여, 안으로는 國民生活의 균등한 향상을 기하고 밖으로는
133 | 항구적인 世界平和와 人類共榮에 이바지함으로써 우리들과 우리들의 子孫의
134 | 安全과 自由와 幸福을 영원히 확보할 것을 다짐하면서 1948年 7月 12日에
135 | 制定되고 8次에 걸쳐 改正된 憲法을 이제 國會의 議決을 거쳐 國民投票에
136 | 의하여 改正한다.
137 |
138 |
South Korean orthography
139 |
140 |
PLACEHOLDER: ko-kr
141 |
142 |
North Korean orthography
143 |
144 |
PLACEHOLDER: ko-kp
145 |
146 |
Mixed script with
147 | <ruby>
148 |
149 |
PLACEHOLDER: ko-Kore
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/package.yaml:
--------------------------------------------------------------------------------
1 | name: seonbi
2 | version: 0.6.0
3 | synopsis: SmartyPants for Korean language
4 | category: Text
5 | author: Hong Minhee
6 | maintainer: Hong Minhee
7 | copyright: "\xa9 2018\u20132023 Hong Minhee"
8 | license: LGPL-2.1
9 | homepage: https://github.com/dahlia/seonbi
10 | bug-reports: https://github.com/dahlia/seonbi/issues
11 | git: git://github.com/dahlia/seonbi.git
12 | description:
13 | Please see the README.md on GitHub at <https://github.com/dahlia/seonbi#readme>.
14 | extra-source-files:
15 | - src/Text/Seonbi/Unihan/*.json
16 | - CHANGES.md
17 | - README.md
18 | data-dir: data
19 | data-files:
20 | - '*.tsv'
21 | build-type: Custom
22 | custom-setup:
23 | dependencies:
24 | - base
25 | - bytestring
26 | - Cabal
27 | - directory >= 1 && < 2
28 | - filepath
29 | - http-client >= 0.5 && < 0.8
30 | - temporary >= 1.2 && < 1.4
31 | - text
32 | - zip >= 1.1 && < 3.0
33 | dependencies:
34 | - aeson >= 1.3.1 && < 3
35 | - base >= 4.12 && < 5
36 | - bytestring
37 | - containers
38 | - html-entities >= 1 && < 2
39 | - text
40 | flags:
41 | static:
42 | description: Static link
43 | manual: true
44 | default: false
45 | iconv:
46 | description: Use iconv; however it is ignored on Windows
47 | manual: true
48 | default: false
49 | embed-dictionary:
50 | description: Embed dictionary rather than load from file
51 | manual: true
52 | default: false
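 | # Flags can be toggled at build time; e.g., with Stack:
 | #   stack build --flag seonbi:embed-dictionary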
53 | when:
54 | - condition: os(darwin)
55 | else:
56 | ghc-options:
57 | - -Wall
58 | - -fprint-explicit-kinds
59 | then:
60 | ghc-options:
61 | - -Wall
62 | - -fprint-explicit-kinds
63 | - -optP-Wno-nonportable-include-path
64 | # The above option works around https://github.com/haskell/cabal/issues/4739
65 | library:
66 | source-dirs: src
67 | dependencies:
68 | - attoparsec >= 0.12 && < 1
69 | - bytestring-trie >= 0.2.5 && < 0.3
70 | - cassava >= 0.5 && < 0.6
71 | - cmark >= 0.6 && < 1
72 | - data-default >= 0.2 && < 1
73 | - filepath >= 1 && < 2
74 | - file-embed >= 0.0.10 && < 0.0.16
75 | - http-media >= 0.8 && < 1
76 | when:
77 | - condition: flag(static) || flag(embed-dictionary)
78 | then:
79 | cpp-options:
80 | - -DEMBED_DICTIONARY
81 | else:
82 | cpp-options:
83 | - -DNO_EMBED_DICTIONARY
84 | executables:
85 | seonbi:
86 | main: seonbi.hs
87 | source-dirs: app
88 | when:
89 | - condition: flag(iconv) && !os(windows)
90 | else:
91 | dependencies: &executable-seonbi-dependencies
92 | cases: ">= 0.1.3.2 && < 0.1.5"
93 | code-page: ">= 0.2 && < 0.3"
94 | html-charset: ">= 0.1 && < 0.2"
95 | optparse-applicative: ">= 0.14 && < 0.18"
96 | seonbi: ">= 0"
97 | then:
98 | dependencies:
99 | <<: *executable-seonbi-dependencies
100 | iconv: ">= 0.4 && < 0.5"
101 | cpp-options:
102 | - -DICONV
103 | - &executable-ghc-options
104 | condition: flag(static)
105 | then:
106 | when:
107 | - condition: os(darwin) || os(windows)
108 | then:
109 | ghc-options:
110 | - -Wall
111 | - -fwarn-incomplete-uni-patterns
112 | - -threaded
113 | - -rtsopts
114 | - -with-rtsopts=-N
115 | # Static link
116 | - -static
117 | - -optc-Os
118 | else:
119 | ghc-options:
120 | - -Wall
121 | - -fwarn-incomplete-uni-patterns
122 | - -threaded
123 | - -rtsopts
124 | - -with-rtsopts=-N
125 | # Static link
126 | - -static
127 | - -optl-static
128 | - -optl-pthread
129 | - -optc-Os
130 | - -fPIC
131 | ld-options:
132 | - -static
133 | else:
134 | ghc-options:
135 | - -Wall
136 | - -fwarn-incomplete-uni-patterns
137 | - -threaded
138 | - -rtsopts
139 | - -with-rtsopts=-N
140 | seonbi-api:
141 | main: seonbi-api.hs
142 | source-dirs: app
143 | dependencies:
144 | - http-types >= 0.12 && < 0.13
145 | - optparse-applicative >= 0.14 && < 0.18
146 | - seonbi
147 | - wai >= 3.2 && < 3.4
148 | - warp >= 3.2 && < 3.4
149 | when:
150 | - *executable-ghc-options
151 | tests:
152 | doctest:
153 | main: doctest.hs
154 | source-dirs: test
155 | other-modules: []
156 | ghc-options:
157 | - -threaded
158 | dependencies:
159 | - doctest
160 | - doctest-discover
161 | - QuickCheck
162 | - seonbi
163 | - unicode-show
164 | spec:
165 | main: hspec.hs
166 | source-dirs: test
167 | ghc-options:
168 | - -threaded
169 | - -rtsopts
170 | - -with-rtsopts=-N
171 | dependencies:
172 | - code-page >= 0.2 && < 0.3
173 | - Diff >= 0.3.4 && < 0.5
174 | - directory >= 1 && < 2
175 | - filepath >= 1 && < 2
176 | - hspec >= 2.4.8 && < 3
177 | - hspec-discover >= 2.4.8 && < 3
178 | - interpolatedstring-perl6 >= 1.0.1 && < 2
179 | - random >= 1.1 && < 1.3
180 | - seonbi
181 | - text
182 | hlint:
183 | main: hlint.hs
184 | source-dirs: test
185 | other-modules: []
186 | ghc-options:
187 | - -threaded
188 | dependencies:
189 | - hlint >= 2.1.7 && < 3.6
190 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/Html/TagStack.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE TypeFamilies #-}
2 | module Text.Seonbi.Html.TagStack
3 | ( HtmlTagStack
4 | , Text.Seonbi.Html.TagStack.any
5 | , descendsFrom
6 | , Text.Seonbi.Html.TagStack.elem
7 | , depth
8 | , empty
9 | , fromList
10 | , last
11 | , pop
12 | , push
13 | , rebase
14 | , toList
15 | ) where
16 |
17 | import Prelude hiding (last)
18 |
19 | import Data.List hiding (last)
20 | import GHC.Exts (IsList (..))
21 |
22 | import Text.Seonbi.Html.Tag
23 |
24 | -- | Represents a hierarchy of a currently parsing position in an 'HtmlTag'
25 | -- tree.
26 | --
27 | -- For example, while 'scanHtml' is reading the text @foo@ in "@\<a>\<b>foo\</b> bar@",
28 | -- the current position is represented as @'HtmlTagStack' ['B', 'A']@.
29 | --
30 | -- Note that the tags are stored in reverse order, from the deepest to
31 | -- the shallowest, to make inserting a more deeper tag efficient.
32 | newtype HtmlTagStack = HtmlTagStack [HtmlTag] deriving (Eq, Ord)
33 |
34 | instance IsList HtmlTagStack where
35 | type Item HtmlTagStack = HtmlTag
36 | fromList = HtmlTagStack . reverse
37 | toList (HtmlTagStack tags) = reverse tags
38 |
39 | instance Show HtmlTagStack where
40 | show tags = "fromList " ++ show (toList tags)
41 |
42 | -- | An empty stack.
43 | empty :: HtmlTagStack
44 | empty = HtmlTagStack []
45 |
46 | -- | Count the depth of a stack.
47 | --
48 | -- >>> :set -XOverloadedLists
49 | -- >>> depth empty
50 | -- 0
51 | -- >>> depth [Div, Article, P, Em]
52 | -- 4
53 | depth :: HtmlTagStack -> Int
54 | depth (HtmlTagStack stack) = Data.List.length stack
55 |
56 | -- | Get the deepest tag from a 'HtmlTagStack'.
57 | --
58 | -- >>> :set -XOverloadedLists
59 | -- >>> let stack = [Div, Article, P, Em] :: HtmlTagStack
60 | -- >>> last stack
61 | -- Just Em
62 | -- >>> last []
63 | -- Nothing
64 | last :: HtmlTagStack -> Maybe HtmlTag
65 | last (HtmlTagStack []) = Nothing
66 | last (HtmlTagStack (tag:_)) = Just tag
67 |
68 | -- | Build a new stack from a stack by replacing its bottom with a new base.
69 | --
70 | -- >>> :set -XOverloadedLists
71 | -- >>> rebase [Article, BlockQuote] [Div] [Article, BlockQuote, P, Em]
72 | -- fromList [Div,P,Em]
73 | --
74 | -- If there are no such bottom elements, it replaces nothing.
75 | --
76 | -- >>> rebase [Div, Article, BlockQuote] [Div] [Article, BlockQuote, P, Em]
77 | -- fromList [Article,BlockQuote,P,Em]
78 | rebase :: HtmlTagStack -> HtmlTagStack -> HtmlTagStack -> HtmlTagStack
79 | rebase (HtmlTagStack base) (HtmlTagStack newBase) stack@(HtmlTagStack l)
80 | | base `isSuffixOf` l = HtmlTagStack $
81 | take (depth stack - length base) l ++ newBase
82 | | otherwise = stack
83 |
84 | -- | Push one deeper @tag@ to a 'HtmlTagStack'.
85 | --
86 | -- >>> push A empty
87 | -- fromList [A]
88 | -- >>> push B (push A empty)
89 | -- fromList [A,B]
90 | push :: HtmlTag -> HtmlTagStack -> HtmlTagStack
91 | push tag (HtmlTagStack tags) =
92 | HtmlTagStack (tag : tags)
93 |
94 | -- | Pop the deepest @tag@ from a 'HtmlTagStack'.
95 | --
96 | -- >>> :set -XOverloadedLists
97 | -- >>> pop Em [A, B, Em]
98 | -- fromList [A,B]
99 | --
100 | -- It may pop a @tag@ in the middle if a @tag@ looking for is not the deepest:
101 | --
102 | -- >>> pop B [A, B, Em]
103 | -- fromList [A,Em]
104 | --
105 | -- It does not affect to the input if there is no such @tag@ in the input:
106 | --
107 | -- >>> pop P [A, B, Em]
108 | -- fromList [A,B,Em]
109 | -- >>> pop A empty
110 | -- fromList []
111 | pop :: HtmlTag -> HtmlTagStack -> HtmlTagStack
112 | pop tag (HtmlTagStack tags'@(t : ags)) =
113 | if t == tag
114 | then HtmlTagStack ags
115 | else
116 | let
117 | (head', rest) = span (/= tag) tags'
118 | tail' = case uncons rest of
119 | Just (_, tail'') -> tail''
120 | Nothing -> []
121 | in
122 | HtmlTagStack (head' ++ tail')
123 | pop _ (HtmlTagStack []) = empty
124 |
125 | -- | Check if a node ('HtmlEntity') that a 'HtmlTagStack' (the first argument)
126 | -- refers is contained by a node that another 'HtmlTagStack' (the second
127 | -- argument), or they are sibling at least.
128 | --
129 | -- >>> :set -XOverloadedLists
130 | -- >>> descendsFrom [Div, P, A, Em] [Div, P, A]
131 | -- True
132 | -- >>> descendsFrom [Div, P, A] [Div, P, A]
133 | -- True
134 | -- >>> descendsFrom [Div, P, Em] [Div, P, A]
135 | -- False
136 | -- >>> descendsFrom [Div, P] [Div, P, A]
137 | -- False
138 | descendsFrom :: HtmlTagStack -> HtmlTagStack -> Bool
139 | HtmlTagStack a `descendsFrom` HtmlTagStack b =
140 | b `isSuffixOf` a
141 |
142 | -- | Determine whether any element of the tag stack satisfies the predicate.
143 | --
144 | -- >>> :set -XOverloadedLists
145 | -- >>> Text.Seonbi.Html.TagStack.any ((== Void) . htmlTagKind) [Div, P, Script]
146 | -- False
147 | -- >>> Text.Seonbi.Html.TagStack.any ((== Void) . htmlTagKind) [BR, P, Script]
148 | -- True
149 | any :: (HtmlTag -> Bool) -> HtmlTagStack -> Bool
150 | any fn (HtmlTagStack stack) =
151 | Prelude.any fn stack
152 |
153 | -- | Determine whether the element occurs in the tag stack.
154 | --
155 | -- >>> :set -XOverloadedLists
156 | -- >>> A `Text.Seonbi.Html.TagStack.elem` [A, B, Code]
157 | -- True
158 | -- >>> Em `Text.Seonbi.Html.TagStack.elem` [A, B, Code]
159 | -- False
160 | elem :: HtmlTag -> HtmlTagStack -> Bool
161 | elem tag (HtmlTagStack stack) = tag `Prelude.elem` stack
162 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/PairedTransformer.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ScopedTypeVariables #-}
2 | module Text.Seonbi.PairedTransformer
3 | ( PairedTransformer (..)
4 | , transformPairs
5 | ) where
6 |
7 | import Data.Text hiding (break, reverse)
8 |
9 | import Text.Seonbi.Html
10 |
11 | -- | Settings for 'transformPairs'.
12 | data PairedTransformer match = PairedTransformer
13 | { ignoresTagStack :: HtmlTagStack -> Bool
14 | , matchStart :: [match] -> Text -> Maybe (match, Text, Text, Text)
15 | , matchEnd :: Text -> Maybe (match, Text, Text, Text)
16 | , areMatchesPaired :: match -> match -> Bool
17 | , transformPair :: match -> match -> [HtmlEntity] -> [HtmlEntity]
18 | }
19 |
20 | -- | Some transformations should be done only if a start and an end are paired
21 | -- like parentheses, and such pairs can usually even be nested. Moreover, even
22 | -- if there are a start and an end, they should not be paired unless they are
23 | -- siblings in the HTML tree.
24 | --
25 | -- This kind of scanning easily turns highly stateful and imperative, and hence
26 | -- hard to debug. This function provides the common logic behind such paired
27 | -- transformations, so that an implementation only needs to fill in the
28 | -- 'PairedTransformer' callbacks that the state machine invokes.
29 | transformPairs :: forall m . PairedTransformer m -> [HtmlEntity] -> [HtmlEntity]
30 | transformPairs (PairedTransformer ignores start end arePaired transform) =
31 | iter [] . normalizeText
32 | where
33 | iter :: [Unclosed m] -> [HtmlEntity] -> [HtmlEntity]
34 | iter [] [] = []
35 | iter stack [] = unstack stack
36 | iter stack (x@HtmlText { tagStack = ts, rawText = txt } : xs) =
37 | case (startMatch, endMatch) of
38 | (Just captured, Nothing) ->
39 | roll stack captured ts xs
40 | (Nothing, Just captured@(m, _, _, _))
41 | | Prelude.any ((`arePaired` m) . match) stack ->
42 | unroll stack captured ts xs
43 | (Just captured@(_, pre, _, _), Just captured'@(m', pre', _, _)) ->
44 | if Data.Text.length pre >= Data.Text.length pre' &&
45 | Prelude.any ((`arePaired` m') . match) stack
46 | then unroll stack captured' ts xs
47 | else roll stack captured ts xs
48 | (Nothing, _) ->
49 | case stack of
50 | [] -> x : iter stack xs
51 | s : ss -> iter (s { buffer = x : buffer s } : ss) xs
52 | where
53 | startMatch :: Maybe (m, Text, Text, Text)
54 | startMatch = start (reverse $ fmap match stack) txt
55 | endMatch :: Maybe (m, Text, Text, Text)
56 | endMatch = end txt
57 | iter (s@Unclosed {} : ss) (x : xs) =
58 | iter (s { buffer = x : buffer s } : ss) xs
59 | iter [] (x : xs) = x : iter [] xs
60 | roll :: [Unclosed m]
61 | -> (m, Text, Text, Text)
62 | -> HtmlTagStack
63 | -> [HtmlEntity]
64 | -> [HtmlEntity]
65 | roll [] (startMatch, pre, t, post) tagStack_ entities =
66 | prependText tagStack_ pre $ iter
67 | [Unclosed startMatch [HtmlText tagStack_ t]]
68 | (normalizeText (prependText tagStack_ post entities))
69 | roll (s : ss) (startMatch, pre, t, post) tagStack_ entities = iter
70 | ( Unclosed startMatch [HtmlText tagStack_ t]
71 | : s { buffer = prependText tagStack_ pre $ buffer s }
72 | : ss
73 | )
74 | (normalizeText (prependText tagStack_ post entities))
75 | unroll :: [Unclosed m]
76 | -> (m, Text, Text, Text)
77 | -> HtmlTagStack
78 | -> [HtmlEntity]
79 | -> [HtmlEntity]
80 | unroll stack (endMatch, pre, t, post) tagStack_ es =
81 | case remainStack of
82 | [] -> unrolled ++ iter [] remainEntities
83 | s : ss -> iter
84 | (s { buffer = reverse unrolled ++ buffer s } : ss)
85 | remainEntities
86 | where
87 | prependText' :: Text -> [HtmlEntity] -> [HtmlEntity]
88 | prependText' = prependText tagStack_
89 | unrolled :: [HtmlEntity]
90 | remainStack :: [Unclosed m]
91 | (unrolled, remainStack) = case findPair endMatch stack of
92 | (_, []) ->
93 | ([HtmlText tagStack_ (pre `append` t)], [])
94 | (stack', s@Unclosed { match = startMatch } : ss) ->
95 | let
96 | buf = prependText' pre (unstack' stack' ++ buffer s)
97 | buf' = prependText' t buf
98 | buf'' = reverse buf'
99 | transformed = if Prelude.any (ignores . tagStack) buf''
100 | then buf''
101 | else transform startMatch endMatch buf''
102 | in
103 | (transformed, ss)
104 | remainEntities :: [HtmlEntity]
105 | remainEntities = prependText' post es
106 | findPair :: m -> [Unclosed m] -> ([Unclosed m], [Unclosed m])
107 | findPair m = break (arePaired m . match)
108 | unstack :: [Unclosed m] -> [HtmlEntity]
109 | unstack = reverse . unstack'
110 | unstack' :: [Unclosed m] -> [HtmlEntity]
111 | unstack' [] = []
112 | unstack' (Unclosed { buffer = b } : ss) = b ++ unstack' ss
113 | prependText :: HtmlTagStack -> Text -> [HtmlEntity] -> [HtmlEntity]
114 | prependText tagStack_ txt
115 | | Data.Text.null txt = id
116 | | otherwise = (HtmlText tagStack_ txt :)
117 |
118 | data Unclosed match = Unclosed
119 | { match :: match
120 | , buffer :: [HtmlEntity] -- in reverse order
121 | }
122 |
--------------------------------------------------------------------------------
/demo/src/Markdown/HtmlString.elm:
--------------------------------------------------------------------------------
1 | module Markdown.HtmlString exposing (render)
2 |
3 | import List
4 | import Markdown.Block exposing (..)
5 | import Markdown.Inline exposing (..)
6 | import Maybe exposing (andThen, withDefault)
7 | import String
8 |
9 |
10 | escape : String -> String
11 | escape =
12 | String.replace "&" "&amp;"
13 | >> String.replace "<" "&lt;"
14 | >> String.replace ">" "&gt;"
15 | >> String.replace "\"" "&quot;"
16 |
17 |
18 | render : List (Block b i) -> String
19 | render blocks =
20 | List.map renderBlock blocks |> String.concat
21 |
22 |
23 | renderBlock : Block b i -> String
24 | renderBlock block =
25 | case block of
26 | BlankLine text ->
27 | escape text
28 |
29 | ThematicBreak ->
30 | "\n \n"
31 |
32 | Heading _ level inlines ->
33 | "\n"
36 | ++ renderInlines inlines
37 | ++ " \n"
40 |
41 | CodeBlock _ code ->
42 | "" ++ escape code ++ " \n"
43 |
44 | Paragraph _ text ->
45 | " " ++ renderInlines text ++ "
\n"
46 |
47 | BlockQuote blocks ->
48 | "\n" ++ render blocks ++ " \n"
49 |
50 | List list items ->
51 | let
52 | ( open, close ) =
53 | case list.type_ of
54 | Unordered ->
55 | ( "" )
56 |
57 | Ordered start ->
58 | ( ""
59 | , " "
60 | )
61 |
62 | renderItem =
63 | \blocks ->
64 | "" ++ render blocks ++ " \n"
65 | in
66 | open
67 | ++ "\n"
68 | ++ String.concat (List.map renderItem items)
69 | ++ close
70 | ++ "\n"
71 |
72 | PlainInlines inlines ->
73 | renderInlines inlines
74 |
75 | Markdown.Block.Custom _ blocks ->
76 | render blocks
77 |
78 |
79 | renderInlines : List (Inline i) -> String
80 | renderInlines inlines =
81 | List.map renderInline inlines
82 | |> String.concat
83 |
84 |
85 | renderInline : Inline i -> String
86 | renderInline inline =
87 | case inline of
88 | Text text ->
89 | escape text
90 |
91 | HardLineBreak ->
92 | " \n"
93 |
94 | CodeInline text ->
95 | "" ++ escape text ++ ""
96 |
97 | Link href title label ->
98 | " andThen (\t -> Just <| " title=\"" ++ t ++ "\"")
103 | |> withDefault ""
104 | )
105 | ++ ">"
106 | ++ (List.map renderInline label |> String.concat)
107 | ++ " "
108 |
109 | Image src title alt ->
110 | " andThen (\t -> Just <| " title=\"" ++ t ++ "\"")
115 | |> withDefault ""
116 | )
117 | ++ " alt=\""
118 | ++ (List.map simplifyInline alt |> String.concat)
119 | ++ "\">"
120 |
121 | HtmlInline tag attrs inlines ->
122 | renderHtmlInline tag attrs inlines
123 |
124 | Emphasis 1 inlines ->
125 | "" ++ renderInlines inlines ++ " "
126 |
127 | Emphasis _ inlines ->
128 | "" ++ renderInlines inlines ++ " "
129 |
130 | Markdown.Inline.Custom _ inlines ->
131 | renderInlines inlines
132 |
133 |
134 | simplifyInlines : List (Inline i) -> String
135 | simplifyInlines inlines =
136 | List.map simplifyInline inlines |> String.concat
137 |
138 |
139 | simplifyInline : Inline i -> String
140 | simplifyInline inline =
141 | case inline of
142 | Text text ->
143 | escape text
144 |
145 | HardLineBreak ->
146 | "\n"
147 |
148 | CodeInline text ->
149 | escape text
150 |
151 | Link _ _ label ->
152 | simplifyInlines label
153 |
154 | Image _ _ alt ->
155 | simplifyInlines alt
156 |
157 | HtmlInline _ _ inlines ->
158 | simplifyInlines inlines
159 |
160 | Emphasis _ inlines ->
161 | simplifyInlines inlines
162 |
163 | Markdown.Inline.Custom _ inlines ->
164 | simplifyInlines inlines
165 |
166 |
167 | renderHtmlInline :
168 | String
169 | -> List ( String, Maybe String )
170 | -> List (Inline i)
171 | -> String
172 | renderHtmlInline tag attrs inlines =
173 | let
174 | attrsString =
175 | String.concat <| List.map renderAttr attrs
176 |
177 | renderAttr =
178 | \( attr, value ) ->
179 | case value of
180 | Just v ->
181 | " " ++ attr ++ "=\"" ++ escape v ++ "\""
182 |
183 | Nothing ->
184 | " " ++ attr
185 | in
186 | "<"
187 | ++ tag
188 | ++ attrsString
189 | ++ ">"
190 | ++ renderInlines inlines
191 | ++ ""
192 | ++ tag
193 | ++ ">"
194 |
--------------------------------------------------------------------------------
/src/Text/Seonbi/Html/Lang.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | module Text.Seonbi.Html.Lang
3 | ( LangHtmlEntity (..)
4 | , LanguageTag
5 | , annotateWithLang
6 | , extractLang
7 | , isKorean
8 | , isNeverKorean
9 | ) where
10 |
11 | import Control.Applicative
12 | import Data.Char (isSpace)
13 | import Data.Maybe
14 |
15 | import Data.Attoparsec.Text
16 | import Data.Text
17 |
18 | import Text.Seonbi.Html.Entity
19 | import Text.Seonbi.Html.Tag (HtmlTag)
20 |
21 | -- | Represents a language tag. Although it is defined as an alias for 'Text',
22 | -- it can be structured in the future. Do not use its contents directly.
23 | type LanguageTag = Text
24 |
25 | -- | Extracts the language tag from the given raw HTML attributes if it has
26 | -- @lang@ attribute.
27 | --
28 | -- >>> extractLang ""
29 | -- Nothing
30 | -- >>> extractLang "lang=en"
31 | -- Just "en"
32 | -- >>> extractLang "lang=\"ko-KR\""
33 | -- Just "ko-kr"
34 | -- >>> extractLang " lang='ko-Hang'"
35 | -- Just "ko-hang"
36 | extractLang
37 | :: HtmlRawAttrs
38 | -- ^ The raw HTML attributes to extract the language tag from.
39 | -> Maybe LanguageTag
40 | -- ^ A language tag extracted from the given raw HTML attributes.
41 | -- If the given raw HTML attributes do not have a @lang@ attribute or
42 | -- its value is invalid, 'Nothing' is returned.
43 | extractLang attrs =
44 | case parseOnly parser' attrs of
45 | Right (Just lang') ->
46 | let lt = toLower . strip . normalizeEntities $ lang'
47 | in if Data.Text.null lt then Nothing else Just lt
48 | _ -> Nothing
49 | where
50 | parser' :: Parser (Maybe Text)
51 | parser' = do
52 | skipSpace
53 | attrs' <- langAttr `sepBy` space
54 | skipSpace
55 | return $ listToMaybe $ catMaybes attrs'
56 | langAttr :: Parser (Maybe Text)
57 | langAttr = do
58 | (isLang, cont) <- attrIsLang
59 | value <- if cont then attrValue else return ""
60 | return (if isLang then Just value else Nothing)
61 | attrIsLang :: Parser (Bool, Bool)
62 | attrIsLang = choice
63 | [ asciiCI "lang=" >> return (True, True)
64 | , do { _ <- takeWhile1 (/= '=')
65 | ; eq <- optional (char '=')
66 | ; return (False, isJust eq)
67 | }
68 | ]
69 | attrValue :: Parser Text
70 | attrValue = choice
71 | [ do { skip (== '"'); v <- takeTill (== '"'); skip (== '"'); return v }
72 | , do { skip (== '\'')
73 | ; v <- takeTill (== '\'')
74 | ; skip (== '\''); return v
75 | }
76 | , takeWhile1 (not . isSpace)
77 | ]
78 | normalizeEntities :: Text -> Text
79 | normalizeEntities
80 | = Data.Text.replace "&hyphen;" "-"
81 | . Data.Text.replace "&dash;" "-"
82 | . Data.Text.replace "&#8208;" "-"
83 | . Data.Text.replace "&#x2010;" "-"
84 | . Data.Text.replace "‐" "-"
85 |
86 | -- | An 'HtmlEntity' annotated with the 'lang' tag extracted from it or its
87 | -- ancestors.
88 | data LangHtmlEntity = LangHtmlEntity
89 | { -- | The @lang@ tag extracted from the HTML 'entity' or its ancestors.
90 | lang :: Maybe LanguageTag
91 | -- | The annotated HTML 'entity'.
92 | , entity :: HtmlEntity
93 | } deriving (Show, Eq)
94 |
95 | -- | Annotates the given HTML entities with the language tag extracted from
96 | -- their @lang@ attributes. If a parent entity has a @lang@ attribute, all of
97 | -- its descendants are annotated with the same language tag.
98 | annotateWithLang :: [HtmlEntity] -> [LangHtmlEntity]
99 | annotateWithLang =
100 | annotate []
101 | where
102 | annotate :: [(HtmlTag, Maybe LanguageTag)]
103 | -> [HtmlEntity]
104 | -> [LangHtmlEntity]
105 | annotate _ [] = []
106 | annotate stack (x@HtmlStartTag { tag = tag', rawAttributes = attrs } : xs) =
107 | LangHtmlEntity thisLang x : annotate nextStack xs
108 | where
109 | parentLang :: Maybe LanguageTag
110 | parentLang = case stack of
111 | (_, l):_ -> l
112 | _ -> Nothing
113 | thisLang :: Maybe LanguageTag
114 | thisLang = extractLang attrs <|> parentLang
115 | nextStack :: [(HtmlTag, Maybe LanguageTag)]
116 | nextStack = (tag', thisLang) : stack
117 | annotate stack (x@HtmlEndTag { tag = tag' } : xs) =
118 | LangHtmlEntity thisLang x : annotate nextStack xs
119 | where
120 | (nextStack, thisLang) = case stack of
121 | [] -> ([], Nothing)
122 | s@((t, lang'):ys) ->
123 | (if t == tag' then ys else s, lang')
124 | annotate stack (x : xs) =
125 | LangHtmlEntity parentLang x : annotate stack xs
126 | where
127 | parentLang :: Maybe LanguageTag
128 | parentLang = case stack of
129 | (_, l):_ -> l
130 | _ -> Nothing
131 |
132 | -- | Determines whether the given language tag refers to any kind of Korean.
133 | --
134 | -- >>> isKorean "ko"
135 | -- True
136 | -- >>> isKorean "ko-KR"
137 | -- True
138 | -- >>> isKorean "kor-Hang"
139 | -- True
140 | -- >>> isKorean "en"
141 | -- False
142 | -- >>> isKorean "en-KR"
143 | -- False
144 | isKorean :: LanguageTag -> Bool
145 | isKorean lang' =
146 | l == "ko" || l == "kor" ||
147 | "ko-" `isPrefixOf` l ||
148 | "kor-" `isPrefixOf` l
149 | where
150 | l :: Text
151 | l = toLower lang'
152 |
153 | -- | Determines whether the given language tag undoubtedly does not refer
154 | -- to any kind of Korean.
155 | --
156 | -- >>> isNeverKorean $ Just "ko"
157 | -- False
158 | -- >>> isNeverKorean $ Just "ko-KR"
159 | -- False
160 | -- >>> isNeverKorean Nothing
161 | -- False
162 | -- >>> isNeverKorean $ Just "en"
163 | -- True
164 | isNeverKorean :: Maybe LanguageTag -> Bool
165 | isNeverKorean Nothing = False
166 | isNeverKorean (Just lang') = not (isKorean lang')
167 |
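The pieces exported above compose naturally: annotate first, then filter on the annotation. Below is a minimal sketch, not part of Lang.hs; the helper name `notDefinitelyForeign` is hypothetical and only shows how `annotateWithLang` and `isNeverKorean` might be combined.

```haskell
-- Hypothetical helper, for illustration only: keep the entities that are
-- not definitely non-Korean, so that later passes (e.g. hanja rendering)
-- can skip content explicitly tagged as another language.
import Text.Seonbi.Html.Entity (HtmlEntity)
import Text.Seonbi.Html.Lang (LangHtmlEntity (..), annotateWithLang, isNeverKorean)

notDefinitelyForeign :: [HtmlEntity] -> [HtmlEntity]
notDefinitelyForeign entities =
    [ entity annotated
    | annotated <- annotateWithLang entities
    , not (isNeverKorean (lang annotated))
    ]
```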
--------------------------------------------------------------------------------
/src/Text/Seonbi/Html/Scanner.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | {-# LANGUAGE TupleSections #-}
3 | module Text.Seonbi.Html.Scanner
4 | ( Result (..)
5 | , scanHtml
6 | ) where
7 |
8 | import Data.Char
9 | import Prelude hiding (takeWhile)
10 |
11 | import Data.Attoparsec.Text.Lazy
12 | import Data.Map.Strict
13 | import qualified Data.Text
14 | import qualified Data.Text.Lazy
15 |
16 | import Text.Seonbi.Html.Entity
17 | import Text.Seonbi.Html.Tag
18 | import Text.Seonbi.Html.TagStack
19 |
20 | htmlFragments :: Parser [HtmlEntity]
21 | htmlFragments = do
22 | result <- option [] $ fragments Text.Seonbi.Html.TagStack.empty
23 | txt <- htmlText Text.Seonbi.Html.TagStack.empty
24 | endOfInput
25 | return $ case txt of
26 | HtmlText { rawText = "" } -> result
27 | _ -> result ++ [txt]
28 |
29 | fragments :: HtmlTagStack -> Parser [HtmlEntity]
30 | fragments tagStack' = do
31 | txt <- htmlText tagStack'
32 | (entities, nextStack) <- htmlEntity tagStack'
33 | nextChunk <- option [] $ fragments nextStack
34 | let chunks = entities ++ nextChunk
35 | return $ case txt of
36 | HtmlText { rawText = "" } -> chunks
37 | txt' -> txt' : chunks
38 |
39 | htmlText :: HtmlTagStack -> Parser HtmlEntity
40 | htmlText tagStack' = do
41 | texts <- many' textFragment
42 | return $ mkText $ Data.Text.concat texts
43 | where
44 | mkText :: Data.Text.Text -> HtmlEntity
45 | mkText txt = HtmlText { tagStack = tagStack', rawText = txt }
46 |
47 | textFragment :: Parser Data.Text.Text
48 | textFragment = choice
49 | [ takeWhile1 (/= '<')
50 | , do
51 | a <- char '<'
52 | b <- satisfy $ \ c ->
53 | not (c == '!' || c == '/' || isAsciiUpper c || isAsciiLower c)
54 | return $ Data.Text.pack [a, b]
55 | ]
56 |
57 | htmlEntity :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack)
58 | htmlEntity tagStack' = choice
59 | [ htmlComment tagStack'
60 | , cdata tagStack'
61 | , startTag tagStack'
62 | , endTag tagStack'
63 | -- fallback:
64 | , (, tagStack') . (: []) . HtmlText tagStack' . Data.Text.singleton
65 | <$> anyChar
66 | ]
67 |
68 | -- https://www.w3.org/TR/html5/syntax.html#comments
69 | htmlComment :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack)
70 | htmlComment tagStack' = do
71 |     _ <- string "<!--"
84 | return
85 | ( [ HtmlComment
86 | { tagStack = tagStack'
87 | , comment = Data.Text.concat contents
88 | }
89 | ]
90 | , tagStack'
91 | )
92 |
93 | -- https://www.w3.org/TR/html5/syntax.html#cdata-sections
94 | cdata :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack)
95 | cdata tagStack' = do
96 |     _ <- string "<![CDATA["
106 | return $ Data.Text.snoc a b
107 | ]
108 | _ <- string "]]>"
109 | return
110 | ( [HtmlCdata { tagStack = tagStack', text = Data.Text.concat contents }]
111 | , tagStack'
112 | )
113 |
114 | -- https://www.w3.org/TR/html5/syntax.html#start-tags
115 | startTag :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack)
116 | startTag tagStack' = do
117 | _ <- char '<'
118 | tag' <- htmlTag
119 | attributes <- many' $ choice
120 | [ do
121 | s <- char '"'
122 | c <- takeWhile (/= '"')
123 | e <- char '"'
124 | return (Data.Text.cons s $ Data.Text.snoc c e)
125 | , do
126 | s <- char '\''
127 | c <- takeWhile (/= '\'')
128 | e <- char '\''
129 | return (Data.Text.cons s $ Data.Text.snoc c e)
130 | , takeWhile1 $ \ c -> c /= '"' && c /= '\'' && c /= '/' && c /= '>'
131 | ]
132 | selfClosing <- option ' ' $ char '/'
133 | _ <- char '>'
134 | let (trailingEntities, nextTagStack) =
135 | if selfClosing == '/' || htmlTagKind tag' == Void
136 | then ([HtmlEndTag { tagStack = tagStack', tag = tag' }], tagStack')
137 | else ([], push tag' tagStack')
138 | return
139 | ( HtmlStartTag
140 | { tagStack = tagStack'
141 | , tag = tag'
142 | , rawAttributes = Data.Text.concat attributes
143 | } : trailingEntities
144 | , nextTagStack
145 | )
146 |
147 | -- https://www.w3.org/TR/html5/syntax.html#end-tags
148 | endTag :: HtmlTagStack -> Parser ([HtmlEntity], HtmlTagStack)
149 | endTag tagStack' = do
150 |     _ <- string "</"
151 | tag' <- htmlTag
152 | _ <- char '>'
153 | return $ case htmlTagKind tag' of
154 | Void -> ([], tagStack')
155 | _ ->
156 | let
157 | nextTagStack = pop tag' tagStack'
158 | in
159 | ( [HtmlEndTag { tagStack = nextTagStack, tag = tag' }]
160 | , nextTagStack
161 | )
162 |
163 | htmlTag :: Parser HtmlTag
164 | htmlTag = do
165 | name <- tagName
166 | case Data.Map.Strict.lookup (Data.Text.toLower name) htmlTagNames of
167 | Just t -> return t
168 | _ -> fail ("failed to parse; invalid tag: " ++ Data.Text.unpack name)
169 |
170 | tagName :: Parser Data.Text.Text
171 | tagName = do
172 | first <- satisfy $ \ c -> isAsciiUpper c || isAsciiLower c
173 | rest <- takeWhile $ \ c -> isAsciiUpper c || isAsciiLower c || isDigit c
174 | return $ Data.Text.cons first rest
175 |
176 | scanHtml :: Data.Text.Lazy.Text -> Result [HtmlEntity]
177 | scanHtml = parse htmlFragments
178 |
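A quick way to see what the scanner produces is to feed it a small document and pattern match on the re-exported attoparsec `Result`. This is a throwaway sketch, not part of Scanner.hs; the input string is arbitrary.

```haskell
{-# LANGUAGE OverloadedStrings #-}
-- Throwaway driver for illustration only.
import Text.Seonbi.Html.Scanner (Result (..), scanHtml)

main :: IO ()
main =
    case scanHtml "<p lang=\"ko\">안녕 <em>세계</em></p>" of
        -- Done carries any unconsumed input plus the scanned entities.
        Done _ entities -> mapM_ print entities
        -- Fail carries the remaining input, parser contexts, and a message.
        Fail _ _ message -> putStrLn ("parse error: " ++ message)
```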
--------------------------------------------------------------------------------
/test/Text/Seonbi/Html/ClipperSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedLists #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 | module Text.Seonbi.Html.ClipperSpec (spec) where
4 |
5 | import Control.Monad
6 |
7 | import Data.Text
8 | import Test.Hspec
9 |
10 | import Text.Seonbi.Html.Clipper
11 | import Text.Seonbi.Html.Entity
12 | import Text.Seonbi.Html.Tag
13 |
14 | spec :: Spec
15 | spec = do
16 | describe "clipPrefixText" $ do
17 | it "returns Nothing if entities are empty and a prefix is not empty" $
18 | clipPrefixText "foo" [] `shouldBe` Nothing
19 | it "returns Nothing if the first entity is not an HtmlText" $
20 | forM_ (["", "foo"] :: [Text]) $ \ prefix -> do
21 | clipPrefixText prefix
22 | [ HtmlStartTag [] P ""
23 | , HtmlText [P] "foo"
24 | , HtmlEndTag [] P
25 | ] `shouldBe` Nothing
26 | clipPrefixText prefix
27 | [ HtmlComment [] "foo"
28 | , HtmlStartTag [] P ""
29 | , HtmlText [P] "foo"
30 | , HtmlEndTag [] P
31 | ] `shouldBe` Nothing
32 | clipPrefixText prefix [HtmlEndTag [] P] `shouldBe` Nothing
33 | clipPrefixText prefix [HtmlCdata [] "foo"] `shouldBe` Nothing
34 | it "returns Just [] if entities are empty and a prefix is empty too" $
35 | clipPrefixText "" [] `shouldBe` Just []
36 | it "returns entities with the prefix text dropped" $ do
37 | clipPrefixText "foo"
38 | [ HtmlText [] "foobar"
39 | , HtmlStartTag [] P ""
40 | , HtmlText [P] "foo"
41 | , HtmlEndTag [] P
42 | ]
43 | `shouldBe` Just
44 | [ HtmlText [] "bar"
45 | , HtmlStartTag [] P ""
46 | , HtmlText [P] "foo"
47 | , HtmlEndTag [] P
48 | ]
49 | clipPrefixText "foo"
50 | [ HtmlText [] "foo"
51 | , HtmlStartTag [] P ""
52 | , HtmlText [P] "foo"
53 | , HtmlEndTag [] P
54 | ]
55 | `shouldBe` Just
56 | [ HtmlStartTag [] P ""
57 | , HtmlText [P] "foo"
58 | , HtmlEndTag [] P
59 | ]
60 |     it "ignores HtmlComment entities but preserves them" $ do
61 | clipPrefixText "foo"
62 | [ HtmlComment [] "comment"
63 | , HtmlText [] "foobar"
64 | , HtmlStartTag [] P ""
65 | , HtmlText [P] "foo"
66 | , HtmlEndTag [] P
67 | ]
68 | `shouldBe` Just
69 | [ HtmlComment [] "comment"
70 | , HtmlText [] "bar"
71 | , HtmlStartTag [] P ""
72 | , HtmlText [P] "foo"
73 | , HtmlEndTag [] P
74 | ]
75 | clipPrefixText "foo"
76 | [ HtmlComment [] "comment"
77 | , HtmlText [] "foo"
78 | , HtmlStartTag [] P ""
79 | , HtmlText [P] "foo"
80 | , HtmlEndTag [] P
81 | ]
82 | `shouldBe` Just
83 | [ HtmlComment [] "comment"
84 | , HtmlStartTag [] P ""
85 | , HtmlText [P] "foo"
86 | , HtmlEndTag [] P
87 | ]
88 |
89 | describe "clipSuffixText" $ do
90 | it "returns Nothing if entities are empty and a suffix is not empty" $
91 | clipSuffixText "foo" [] `shouldBe` Nothing
92 | it "returns Nothing if the last entity is not an HtmlText" $
93 | forM_ (["", "foo"] :: [Text]) $ \ suffix -> do
94 | clipSuffixText suffix
95 | [ HtmlStartTag [] P ""
96 | , HtmlText [P] "foo"
97 | , HtmlEndTag [] P
98 | ] `shouldBe` Nothing
99 | clipSuffixText suffix
100 | [ HtmlStartTag [] P ""
101 | , HtmlText [P] "foo"
102 | , HtmlEndTag [] P
103 | , HtmlComment [] "foo"
104 | ] `shouldBe` Nothing
105 | clipSuffixText suffix [HtmlEndTag [] P] `shouldBe` Nothing
106 | clipSuffixText suffix [HtmlCdata [] "foo"] `shouldBe` Nothing
107 | it "returns Just [] if entities are empty and a suffix is empty too" $
108 | clipSuffixText "" [] `shouldBe` Just []
109 | it "returns entities with the suffix text dropped" $ do
110 | clipSuffixText "bar"
111 | [ HtmlStartTag [] P ""
112 | , HtmlText [P] "foo"
113 | , HtmlEndTag [] P
114 | , HtmlText [] "foobar"
115 | ]
116 | `shouldBe` Just
117 | [ HtmlStartTag [] P ""
118 | , HtmlText [P] "foo"
119 | , HtmlEndTag [] P
120 | , HtmlText [] "foo"
121 | ]
122 | clipSuffixText "foo"
123 | [ HtmlStartTag [] P ""
124 | , HtmlText [P] "foo"
125 | , HtmlEndTag [] P
126 | , HtmlText [] "foo"
127 | ]
128 | `shouldBe` Just
129 | [ HtmlStartTag [] P ""
130 | , HtmlText [P] "foo"
131 | , HtmlEndTag [] P
132 | ]
133 |     it "ignores HtmlComment entities but preserves them" $ do
134 | clipSuffixText "bar"
135 | [ HtmlStartTag [] P ""
136 | , HtmlText [P] "foo"
137 | , HtmlEndTag [] P
138 | , HtmlText [] "foobar"
139 | , HtmlComment [] "comment"
140 | ]
141 | `shouldBe` Just
142 | [ HtmlStartTag [] P ""
143 | , HtmlText [P] "foo"
144 | , HtmlEndTag [] P
145 | , HtmlText [] "foo"
146 | , HtmlComment [] "comment"
147 | ]
148 | clipSuffixText "foo"
149 | [ HtmlStartTag [] P ""
150 | , HtmlText [P] "foo"
151 | , HtmlEndTag [] P
152 | , HtmlText [] "foo"
153 | , HtmlComment [] "comment"
154 | ]
155 | `shouldBe` Just
156 | [ HtmlStartTag [] P ""
157 | , HtmlText [P] "foo"
158 | , HtmlEndTag [] P
159 | , HtmlComment [] "comment"
160 | ]
161 |
162 | specify "clipText" $ do
163 | clipText "foo" "baz"
164 | [ HtmlText [] "foo"
165 | , HtmlStartTag [] P ""
166 | , HtmlText [P] "bar"
167 | , HtmlEndTag [] P
168 | , HtmlText [] "baz"
169 | ] `shouldBe` Just
170 | [ HtmlStartTag [] P ""
171 | , HtmlText [P] "bar"
172 | , HtmlEndTag [] P
173 | ]
174 | clipText "foo" "quux"
175 | [ HtmlText [] "foobar"
176 | , HtmlStartTag [] P ""
177 | , HtmlText [P] "baz"
178 | , HtmlEndTag [] P
179 | , HtmlText [] "quxquux"
180 | ] `shouldBe` Just
181 | [ HtmlText [] "bar"
182 | , HtmlStartTag [] P ""
183 | , HtmlText [P] "baz"
184 | , HtmlEndTag [] P
185 | , HtmlText [] "qux"
186 | ]
187 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Seonbi: SmartyPants for Korean language
2 | =======================================
3 |
4 | [![][releases-badge]][releases] [![][hackage-badge]][hackage] [![][dockerhub-badge]][dockerhub] [![][ci-status-badge]][ci]
5 |
6 | [Seonbi demo][demo web app]
7 |
8 | (TL;DR: See the [demo web app].)
9 |
10 | Seonbi (선비) is an HTML preprocessor that makes typographic adjustments
11 | to an HTML document so that the result uses accurate punctuation according
12 | to modern Korean orthography.
13 | (It's similar to what [SmartyPants] does for text written in English.)
14 |
15 | It also transforms `ko-Kore` text (國漢文混用; [Korean mixed script]) into
16 | `ko-Hang` text (한글전용; Hangul-only script).
17 |
18 | Seonbi provides a Haskell library, a CLI, and an HTTP API; any of them can
19 | perform the following transformations:
20 |
21 | - All hanja words (e.g., `漢字`) into corresponding hangul-only words
22 | (e.g., `한자`)
23 | - Straight quotes and apostrophes (`"` & `'`) into curly quotes HTML
24 | entities (`“`, `”`, `‘`, & `’`)
25 | - Three consecutive periods (`...` or `。。。`) into an ellipsis entity (`…`)
26 | - Classical (Chinese-style) stops (`。`, `、`, `?`, & `!`) into modern
27 | (English-style) stops (`.`, `,`, `?`, & `!`)
28 | - Pairs of less-than and greater-than inequality symbols (`<` & `>`) into
29 | pairs of proper angle quotes (`〈` & `〉`)
30 | - Pairs of two consecutive inequality symbols (`<<` & `>>`) into
31 | pairs of proper double angle quotes (`《` & `》`)
32 | - A hyphen (`-`) or hangul vowel *eu* (`ㅡ`) surrounded by spaces, or
33 | two/three consecutive hyphens (`--` or `---`) into a proper em dash (`—`)
34 | - A less-than inequality symbol followed by a hyphen or an equality
35 | symbol (`<-`, `<=`) into arrows to the left (`←`, `⇐`)
36 | - A hyphen or an equality symbol followed by a greater-than inequality
37 | symbol (`->`, `=>`) into arrows to the right (`→`, `⇒`)
38 | - A hyphen or an equality symbol wrapped by inequality symbols (`<->`, `<=>`)
39 | into bi-directional arrows (`↔`, `⇔`)
40 |
41 | Each transformation can be turned on and off individually, and some
42 | transformations have further options.
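For instance, a fragment like the one below would be rewritten roughly as follows. This example is purely illustrative; whether the output uses raw characters or HTML character references depends on the options you pick.

```html
<!-- before -->
<p>"우리말"에는 '漢字語'가 많다... -> 이렇게 바뀐다</p>
<!-- after -->
<p>“우리말”에는 ‘한자어’가 많다… → 이렇게 바뀐다</p>
```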
43 |
44 | All transformations work on both plain text and rich text trees.
45 | In a similar way to SmartyPants, it does not modify characters within
46 | several sensitive HTML elements like ``/``/`