├── data
    ├── .gitignore
    └── Unihan
    │   └── Unihan_NumericValues.txt
├── Setup.lhs
├── .gitignore
├── CJK
    ├── Data
    │   ├── QuocNgu.hs
    │   ├── KoreanYale.hs
    │   ├── Hangul.hs
    │   ├── Jyutping.hs
    │   ├── Types.hs
    │   ├── Internal.hs
    │   ├── Unihan
    │   │   ├── NumericValues.hs
    │   │   ├── Variants.hs
    │   │   ├── RadicalStrokeCounts.hs
    │   │   ├── DictionaryLikeData.hs
    │   │   └── Readings.hs
    │   ├── Pinyin.hs
    │   └── CEDICT.hs
    └── Utilities.hs
├── tests
    └── Tests.hs
├── release
├── cjk.cabal
└── LICENSE


/data/.gitignore:
--------------------------------------------------------------------------------
1 | # Packed files
2 | cedict_1_0_ts_utf-8_mdbg.txt.gz
3 | Unihan.zip
4 | 


--------------------------------------------------------------------------------
/Setup.lhs:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env runhaskell
2 | 
3 | > import Distribution.Simple
4 | > main = defaultMain


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Build artifacts
2 | dist/
3 | *.hi
4 | *.o
5 | 
6 | # OS junk
7 | Thumbs.db
8 | .DS_Store
9 | 


--------------------------------------------------------------------------------
/CJK/Data/QuocNgu.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.QuocNgu where
2 | 
3 | import qualified Data.Text as Text
4 | 
5 | 
6 | -- TODO: flesh out this definition
7 | type Phone = Text.Text
8 | 


--------------------------------------------------------------------------------
/CJK/Data/KoreanYale.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.KoreanYale where
2 | 
3 | import qualified Data.Text as Text
4 | 
5 | 
6 | -- TODO: flesh out this definition
7 | type Phone = Text.Text
8 | 


--------------------------------------------------------------------------------
/CJK/Data/Hangul.hs:
--------------------------------------------------------------------------------
 1 | module CJK.Data.Hangul where
 2 | 
 3 | import qualified Data.Text as Text
 4 | import qualified Data.Text.ICU.Normalize as Text
 5 | 
 6 | 
 7 | -- TODO: flesh out this definition
 8 | type Phone = Char
 9 | 
10 | fromJamos :: Text.Text -> Phone
11 | fromJamos s = single (Text.unpack (Text.normalize Text.NFC s)) -- NFC-normalize the input; exactly one resulting character is expected
12 |   where single [syllable] = syllable
13 |         single other      = error ("Certainly non-Korean phone " ++ other)
14 | 


--------------------------------------------------------------------------------
/CJK/Data/Jyutping.hs:
--------------------------------------------------------------------------------
 1 | module CJK.Data.Jyutping where
 2 | 
 3 | import qualified Data.Text as Text
 4 | 
 5 | 
 6 | data Tone = HighLevel
 7 |           | MidRising
 8 |           | MidLevel
 9 |           | LowFalling
10 |           | LowRising
11 |           | LowLevel
12 |           deriving (Eq, Ord)
13 | 
14 | instance Show Tone where
15 |     show = show . toneNumber
16 | 
17 | toneNumber :: Tone -> Int
18 | toneNumber HighLevel  = 1
19 | toneNumber MidRising  = 2
20 | toneNumber MidLevel   = 3
21 | toneNumber LowFalling = 4
22 | toneNumber LowRising  = 5
23 | toneNumber LowLevel   = 6
24 | 
25 | 
26 | data Phone = Phone {
27 |     sound :: Text.Text,
28 |     tone  :: Tone
29 |   }
30 | 
31 | instance Show Phone where
32 |     show jyut = Text.unpack (sound jyut) ++ show (tone jyut)
33 | 


--------------------------------------------------------------------------------
/CJK/Data/Types.hs:
--------------------------------------------------------------------------------
 1 | module CJK.Data.Types where
 2 | 
 3 | import qualified Data.Text as Text
 4 | 
 5 | 
 6 | newtype KangXiRadical = KangXi {
 7 |     kangXiRadicalNumber :: Int -- ^ Radical number in the range 1 to 214 inclusive
 8 |   } deriving (Show) -- Useful for debugging in GHCi
 9 | 
10 | type StrokeCount = Int
11 | 
12 | data RadicalStrokeCount a = RSC {
13 |     radical           :: a,          -- ^ The radical which is considered to form the main part of the character
14 |     additionalStrokes :: StrokeCount -- ^ The “additional strokes” value is the residual stroke-count, the count of all strokes remaining after eliminating all strokes associated with the radical.
15 |   } deriving (Show) -- Useful for debugging in GHCi
16 | 
17 | -- | Location of the associated information in the《漢語大字典》 Hànyǔ Dà Zìdiǎn
18 | type HDZEntry = Text.Text
19 | 
20 | -- | How to input the character in Cangjie
21 | type CangjieInputCode = Text.Text
22 | 


--------------------------------------------------------------------------------
/tests/Tests.hs:
--------------------------------------------------------------------------------
 1 | module Main where
 2 | 
 3 | import Control.Monad
 4 | import Data.Maybe
 5 | import System.Exit
 6 | 
 7 | import qualified CJK.Data.Unihan.DictionaryLikeData  as DictionaryLikeData
 8 | import qualified CJK.Data.Unihan.NumericValues       as NumericValues
 9 | import qualified CJK.Data.Unihan.RadicalStrokeCounts as RadicalStrokeCounts
10 | import qualified CJK.Data.Unihan.Readings            as Readings
11 | import qualified CJK.Data.Unihan.Variants            as Variants
12 | import qualified CJK.Data.CEDICT as CEDICT
13 | 
14 | 
15 | -- Smoke test: just check a single data point from each module. As long as the data parses
16 | -- successfully there is a 90% chance everything is working. Emptiness is tested with 'null' rather than by counting elements.
17 | main :: IO ()
18 | main = do
19 |     checkNot "DictionaryLikeData"  $ isNothing $ DictionaryLikeData.cangjie '好'
20 |     checkNot "NumericValues"       $ isNothing $ NumericValues.numericValue '十'
21 |     checkNot "RadicalStrokeCounts" $ null (RadicalStrokeCounts.unicode '好')
22 |     checkNot "Readings"            $ null (Readings.mandarinBestEffort '好')
23 |     checkNot "Variants"            $ null (Variants.traditionalVariants '电')
24 |     checkNot "CEDICT"              $ null CEDICT.entries
25 | 
26 | checkNot :: String -> Bool -> IO ()
27 | checkNot msg p = putStr (msg ++ ": ") >> report
28 |   where -- A True predicate means the check failed: report it and exit with code 1
29 |         report | p         = putStrLn "failure" >> exitWith (ExitFailure 1)
30 |                | otherwise = putStrLn "success"


--------------------------------------------------------------------------------
/CJK/Data/Internal.hs:
--------------------------------------------------------------------------------
 1 | module CJK.Data.Internal where
 2 | 
 3 | import qualified CJK.Data.Jyutping as Jyutping
 4 | import qualified CJK.Data.Pinyin   as Pinyin
 5 | import CJK.Data.Types
 6 | 
 7 | import Control.Applicative
 8 | 
 9 | import Data.Char
10 | import Data.Attoparsec.Text
11 | 
12 | 
13 | jyutpingP :: Parser Jyutping.Phone
14 | jyutpingP = liftA2 Jyutping.Phone (takeWhile1 (\c -> isAsciiUpper c || isAsciiLower c || c == '[' || c == ']')) jyutpingToneP -- Some kCheungBauer says [ng]ai1
15 | 
16 | jyutpingToneP :: Parser Jyutping.Tone -- Parses a single tone digit, '1' to '6'
17 | jyutpingToneP = Jyutping.HighLevel  <$ char '1'
18 |             <|> Jyutping.MidRising  <$ char '2'
19 |             <|> Jyutping.MidLevel   <$ char '3'
20 |             <|> Jyutping.LowFalling <$ char '4'
21 |             <|> Jyutping.LowRising  <$ char '5'
22 |             <|> Jyutping.LowLevel   <$ char '6'
23 | 
24 | 
25 | tonedPinyinP :: Parser Pinyin.Phone
26 | tonedPinyinP = liftA2 Pinyin.Phone (takeWhile1 (\c -> isAsciiUpper c || isAsciiLower c || c == 'ü' || c == 'Ü')) pinyinToneP
27 | 
28 | pinyinToneP :: Parser Pinyin.Tone -- Parses a single tone digit, '1' to '5' ('5' being the neutral tone)
29 | pinyinToneP = Pinyin.Flat          <$ char '1'
30 |           <|> Pinyin.Rising        <$ char '2'
31 |           <|> Pinyin.FallingRising <$ char '3'
32 |           <|> Pinyin.Falling       <$ char '4'
33 |           <|> Pinyin.Neutral       <$ char '5'
34 | 
35 | hdzEntryP :: Parser HDZEntry
36 | hdzEntryP = takeWhile1 (\c -> isDigit c || c == '.')
37 | 


--------------------------------------------------------------------------------
/release:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | 
 4 | echo "Have you updated the version number? Type 'yes' if you have!"
 5 | read version_response
 6 | 
 7 | if [ "$version_response" != "yes" ]; then
 8 |     echo "Go and update the version number"
 9 |     exit 1
10 | fi
11 | 
12 | sdist_output=`runghc Setup.lhs sdist`
13 | 
14 | if [ "$?" != "0" ]; then
15 |     echo "Cabal sdist failed, aborting"
16 |     exit 1
17 | fi
18 | 
19 | # Want to find a line like:
20 | # Source tarball created: dist/ansi-terminal-0.1.tar.gz
21 | 
22 | # Test this with:
23 | # runghc Setup.lhs sdist | grep ...
24 | filename=`echo $sdist_output | sed 's/.*Source tarball created: \([^ ]*\).*/\1/'`
25 | echo "Filename: $filename"
26 | 
27 | if [ "$filename" = "$sdist_output" ]; then
28 |     echo "Could not find filename, aborting"
29 |     exit 1
30 | fi
31 | 
32 | # Test this with:
33 | # echo dist/ansi-terminal-0.1.tar.gz | sed ...
34 | version=`echo $filename | sed 's/^[^0-9]*\([0-9\.]*\).tar.gz$/\1/'`
35 | echo "Version: $version"
36 | 
37 | if [ "$version" = "$filename" ]; then
38 |     echo "Could not find version, aborting"
39 |     exit 1
40 | fi
41 | 
42 | echo "This is your last chance to abort! I'm going to upload in 10 seconds"
43 | sleep 10
44 | 
45 | git tag "v$version"
46 | 
47 | if [ "$?" != "0" ]; then
48 |     echo "Git tag failed, aborting"
49 |     exit 1
50 | fi
51 | 
52 | # You need to have stored your Hackage username and password as directed by cabal upload
53 | # I use -v2 because otherwise the error messages can be cryptic :-)
54 | cabal upload -v2 "$filename"
55 | 
56 | if [ "$?" != "0" ]; then
57 |     echo "Hackage upload failed, aborting"
58 |     exit 1
59 | fi
60 | 
61 | # Success!
62 | exit 0
63 | 


--------------------------------------------------------------------------------
/CJK/Utilities.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE CPP #-}
 2 | module CJK.Utilities where
 3 | 
 4 | import Control.Applicative
 5 | 
 6 | import Data.Char
 7 | import qualified Data.ByteString.Lazy as BS
 8 | 
 9 | import qualified Data.Text as Text
10 | import qualified Data.Text.Lazy as TextL
11 | import qualified Data.Text.Lazy.Encoding as TextL
12 | 
13 | import qualified Data.Attoparsec.Text as DAT
14 | import qualified Data.Attoparsec.Text.Lazy as DATL
15 | 
16 | import Paths_cjk
17 | 
18 | 
19 | readUTF8DataFile :: FilePath -> IO TextL.Text
20 | readUTF8DataFile fp =
21 |     -- Ask Paths_cjk where the data file lives, then read it via readUTF8File
22 |     getDataFileName fp >>= readUTF8File
23 | 
24 | readUTF8File :: FilePath -> IO TextL.Text
25 | readUTF8File = fmap TextL.decodeUtf8 . BS.readFile
26 | 
27 | parseLazy :: DAT.Parser a -> TextL.Text -> a
28 | #ifndef MIN_VERSION_attoparsec
29 | #define MIN_VERSION_attoparsec(A,B,C) 0
30 | #endif
31 | #if MIN_VERSION_attoparsec(0,10,4)
32 | -- Lazy parse is bugged in attoparsec-0.10.3 and below (see ticket #10/#21), so only newer versions parse the lazy Text directly; otherwise the input is made strict and run through parseOnly. Either way a parse failure calls 'error'.
33 | parseLazy p s = case DATL.eitherResult (DATL.parse p s) of
34 |     Left err -> error $ "parseLazy: " ++ err
35 |     Right x  -> x
36 | #else
37 | parseLazy p s = case DAT.parseOnly p (TextL.toStrict s) of
38 |     Left err -> error $ "parseLazy: " ++ err
39 |     Right x  -> x
40 | #endif
41 | 
42 | 
43 | charP :: DAT.Parser Char -- Parses a code point written as "U+" followed by hexadecimal digits
44 | charP = chr <$> (DAT.string (Text.pack "U+") *> DAT.hexadecimal)
45 | 
46 | canParse :: DAT.Parser a -> DAT.Parser Bool -- Yields True when p succeeds (discarding its result), False when it fails
47 | canParse p = (True <$ p) <|> pure False
48 | 
49 | lineTerminator :: DAT.Parser ()
50 | lineTerminator = DAT.endOfLine <|> DAT.endOfInput
51 | 
52 | skipTrueSpace :: DAT.Parser ()
53 | skipTrueSpace = DAT.skipWhile isTrueSpace
54 | 
55 | isTrueSpace :: Char -> Bool
56 | isTrueSpace c = isSpace c && c /= '\r' && c /= '\n'
57 | 
58 | 
59 | liftA4 :: Applicative t => (a -> b -> c -> d -> e) -> t a -> t b -> t c -> t d -> t e
60 | liftA4 f ma mb mc md = pure f <*> ma <*> mb <*> mc <*> md
61 | 


--------------------------------------------------------------------------------
/cjk.cabal:
--------------------------------------------------------------------------------
 1 | name:                cjk
 2 | version:             0.1.0.1
 3 | synopsis:            Data about Chinese, Japanese and Korean characters and languages
 4 | description:         A Haskell interface to the most important information from the Unicode Unihan character
 5 |                      database and CC-CEDICT free Chinese-English dictionary.
 6 |                      .
 7 |                      Contributions of data from more sources are very welcome!
 8 | homepage:            http://github.com/batterseapower/cjk
 9 | license:             BSD3
10 | license-file:        LICENSE
11 | author:              Max Bolingbroke <batterseapower@hotmail.com>
12 | maintainer:          Max Bolingbroke <batterseapower@hotmail.com>
13 | category:            Text
14 | build-type:          Simple
15 | cabal-version:       >=1.8
16 | data-files:          data/cedict_1_0_ts_utf-8_mdbg.txt
17 |                      data/Unihan/*.txt
18 | 
19 | library
20 |   exposed-modules:   CJK.Data.Unihan.DictionaryLikeData
21 |                      CJK.Data.Unihan.NumericValues
22 |                      CJK.Data.Unihan.RadicalStrokeCounts
23 |                      CJK.Data.Unihan.Readings
24 |                      CJK.Data.Unihan.Variants
25 |                      CJK.Data.CEDICT
26 |                      CJK.Data.Hangul
27 |                      CJK.Data.Jyutping
28 |                      CJK.Data.KoreanYale
29 |                      CJK.Data.Pinyin
30 |                      CJK.Data.QuocNgu
31 |                      CJK.Data.Types
32 |   other-modules:     CJK.Data.Internal
33 |                      CJK.Utilities
34 |                      -- I don't think I should have to put this here, but if I don't then any executables
35 |                      -- linking against the cjk library will fail to find symbols exported by this module
36 |                      Paths_cjk
37 |   build-depends:       base >=4.5 && < 5
38 |                      , containers >=0.4.2
39 |                      , bytestring >=0.9
40 |                      , text >=0.11
41 |                      , text-icu >=0.6.3.5
42 |                      , attoparsec >=0.10.3
43 | 
44 | test-suite tests
45 |     type:            exitcode-stdio-1.0
46 |     hs-source-dirs:  tests
47 |     main-is:         Tests.hs
48 |     build-depends:     base
49 |                      , cjk
50 | 
51 | source-repository head
52 |     type:            git
53 |     location:        https://github.com/batterseapower/cjk.git
54 | 


--------------------------------------------------------------------------------
/CJK/Data/Unihan/NumericValues.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module CJK.Data.Unihan.NumericValues (
 3 |     NumericUse(..), numericValue
 4 |   ) where
 5 | 
 6 | import CJK.Utilities
 7 | 
 8 | import Control.Applicative
 9 | 
10 | import qualified Data.Text as Text
11 | import qualified Data.Text.Lazy as TextL
12 | import Data.Attoparsec.Text
13 | 
14 | import Data.Char
15 | import Data.Maybe
16 | import qualified Data.Map as M
17 | import Data.List
18 | 
19 | import System.IO.Unsafe
20 | 
21 | 
22 | data NumericUse = AccountingUse -- ^ Used in the writing of accounting numerals (to prevent fraud)
23 |                 | OtherUse      -- ^ Used in certain unusual, specialized contexts
24 |                 | PrimaryUse    -- ^ Used in the writing of numbers in the standard fashion
25 |                 deriving (Eq, Show)
26 | 
27 | -- | The value of the character and the contexts in which it is used
28 | numericValue :: Char -> Maybe (NumericUse, Integer)
29 | numericValue c = M.lookup c numericValues
30 | 
31 | 
32 | type NumericValuesMap = M.Map Char (NumericUse, Integer)
33 | 
34 | emptyNumericValuesMap :: NumericValuesMap
35 | emptyNumericValuesMap = M.empty
36 | 
37 | unionNumericValuesMap :: NumericValuesMap -> NumericValuesMap -> NumericValuesMap
38 | unionNumericValuesMap = M.unionWith (error "unionNumericValuesMap: impossible") -- There is at most one line for each (character, field name) combination
39 | 
40 | {-# NOINLINE contents #-}
41 | contents :: TextL.Text
42 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_NumericValues.txt")
43 | 
44 | numericValues :: NumericValuesMap
45 | numericValues = parseLazy fileP contents
46 | 
47 | 
48 | fileP :: Parser NumericValuesMap
49 | fileP = fmap (foldl' unionNumericValuesMap emptyNumericValuesMap) (lineP `manyTill` endOfInput)
50 | 
51 | lineP :: Parser NumericValuesMap -- One line of the file: a data record, a '#' comment, or a blank line
52 | lineP = do { c <- charP <* skipSpace; dataP <- numericValueP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator }
53 |     <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyNumericValuesMap
54 |     <|> skipTrueSpace *> lineTerminator *> pure emptyNumericValuesMap -- skipTrueSpace always succeeds, so repeating it with manyTill would loop forever on a malformed line instead of failing
55 |     <?> "line"
56 | 
57 | numericValueP :: Char -> Parser (Parser NumericValuesMap) -- Recognises the field name and yields the parser for that field's value
58 | numericValueP c = field "kAccountingNumeric" AccountingUse
59 |               <|> field "kOtherNumeric"      OtherUse
60 |               <|> field "kPrimaryNumeric"    PrimaryUse
61 |   where field name use = string name *> pure (fmap (\x -> M.singleton c (use, x)) decimal)
62 | 


--------------------------------------------------------------------------------
/data/Unihan/Unihan_NumericValues.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Unihan_NumericValues.txt
 3 | # Date: 2012-08-17 17:13:30 GMT [JHJ]
 4 | # Unicode version: 6.2.0
 5 | #
 6 | # Unicode Character Database
 7 | # Copyright (c) 1991-2012 Unicode, Inc.
 8 | # For terms of use, see http://www.unicode.org/terms_of_use.html
 9 | # For documentation, see http://www.unicode.org/reports/tr44/
10 | #
11 | # This file contains data on the following fields from the Unihan database:
12 | #	kAccountingNumeric
13 | #	kOtherNumeric
14 | #	kPrimaryNumeric
15 | #
16 | # For details on the file format, see http://www.unicode.org/reports/tr38/
17 | #
18 | U+3405	kOtherNumeric	5
19 | U+3483	kOtherNumeric	2
20 | U+382A	kOtherNumeric	5
21 | U+3B4D	kOtherNumeric	7
22 | U+4E00	kPrimaryNumeric	1
23 | U+4E03	kPrimaryNumeric	7
24 | U+4E07	kPrimaryNumeric	10000
25 | U+4E09	kPrimaryNumeric	3
26 | U+4E5D	kPrimaryNumeric	9
27 | U+4E8C	kPrimaryNumeric	2
28 | U+4E94	kPrimaryNumeric	5
29 | U+4E96	kOtherNumeric	4
30 | U+4EBF	kPrimaryNumeric	100000000
31 | U+4EC0	kOtherNumeric	10
32 | U+4EDF	kAccountingNumeric	1000
33 | U+4EE8	kOtherNumeric	3
34 | U+4F0D	kAccountingNumeric	5
35 | U+4F70	kAccountingNumeric	100
36 | U+5104	kPrimaryNumeric	100000000
37 | U+5146	kPrimaryNumeric	1000000000000
38 | U+5169	kOtherNumeric	2
39 | U+516B	kPrimaryNumeric	8
40 | U+516D	kPrimaryNumeric	6
41 | U+5341	kPrimaryNumeric	10
42 | U+5343	kPrimaryNumeric	1000
43 | U+5344	kOtherNumeric	20
44 | U+5345	kOtherNumeric	30
45 | U+534C	kOtherNumeric	40
46 | U+53C1	kAccountingNumeric	3
47 | U+53C2	kAccountingNumeric	3
48 | U+53C3	kAccountingNumeric	3
49 | U+53C4	kOtherNumeric	3
50 | U+56DB	kPrimaryNumeric	4
51 | U+58F1	kAccountingNumeric	1
52 | U+58F9	kAccountingNumeric	1
53 | U+5E7A	kOtherNumeric	1
54 | U+5EFE	kOtherNumeric	9
55 | U+5EFF	kOtherNumeric	20
56 | U+5F0C	kAccountingNumeric	1
57 | U+5F0D	kAccountingNumeric	2
58 | U+5F0E	kAccountingNumeric	3
59 | U+5F10	kAccountingNumeric	2
60 | U+62FE	kAccountingNumeric	10
61 | U+634C	kAccountingNumeric	8
62 | U+67D2	kAccountingNumeric	7
63 | U+6F06	kAccountingNumeric	7
64 | U+7396	kAccountingNumeric	9
65 | U+767E	kPrimaryNumeric	100
66 | U+8086	kAccountingNumeric	4
67 | U+842C	kAccountingNumeric	10000
68 | U+8CAE	kAccountingNumeric	2
69 | U+8CB3	kAccountingNumeric	2
70 | U+8D30	kAccountingNumeric	2
71 | U+9621	kAccountingNumeric	1000
72 | U+9646	kAccountingNumeric	6
73 | U+964C	kAccountingNumeric	100
74 | U+9678	kAccountingNumeric	6
75 | U+96F6	kPrimaryNumeric	0
76 | U+20001	kOtherNumeric	7
77 | U+20064	kOtherNumeric	4
78 | U+200E2	kOtherNumeric	4
79 | U+20121	kOtherNumeric	5
80 | U+2092A	kOtherNumeric	1
81 | U+20983	kOtherNumeric	30
82 | U+2098C	kOtherNumeric	40
83 | U+2099C	kOtherNumeric	40
84 | U+20AEA	kOtherNumeric	6
85 | U+20AFD	kOtherNumeric	3
86 | U+20B19	kOtherNumeric	3
87 | U+22390	kOtherNumeric	2
88 | U+22998	kOtherNumeric	3
89 | U+23B1B	kOtherNumeric	3
90 | U+2626D	kOtherNumeric	4
91 | 
92 | # EOF
93 | 


--------------------------------------------------------------------------------
/CJK/Data/Pinyin.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, PatternGuards #-}
  2 | module CJK.Data.Pinyin where
  3 | 
  4 | import qualified Data.Text as Text
  5 | import qualified Data.Text.ICU.Normalize as Text
  6 | import Data.Maybe
  7 | 
  8 | 
  9 | data Tone = Flat
 10 |           | Rising
 11 |           | FallingRising
 12 |           | Falling
 13 |           | Neutral
 14 |           deriving (Eq, Ord)
 15 | 
 16 | instance Show Tone where
 17 |     show = show . toneNumber
 18 | 
 19 | toneNumber :: Tone -> Int
 20 | toneNumber Flat          = 1
 21 | toneNumber Rising        = 2
 22 | toneNumber FallingRising = 3
 23 | toneNumber Falling       = 4
 24 | toneNumber Neutral       = 5
 25 | 
 26 | -- | Returns the Unicode combining character used to produce the accent for this tone. Returns Nothing if no accent is required.
 27 | toneCombiningMark :: Tone -> Maybe Char
 28 | toneCombiningMark Flat          = Just '\x304'
 29 | toneCombiningMark Rising        = Just '\x301'
 30 | toneCombiningMark FallingRising = Just '\x30C'
 31 | toneCombiningMark Falling       = Just '\x300'
 32 | toneCombiningMark Neutral       = Nothing
 33 | 
 34 | -- | Returns the tone associated with this Unicode combining character, if any.
 35 | combiningMarkTone :: Char -> Maybe Tone
 36 | combiningMarkTone '\x304' = Just Flat
 37 | combiningMarkTone '\x301' = Just Rising
 38 | combiningMarkTone '\x30C' = Just FallingRising
 39 | combiningMarkTone '\x300' = Just Falling
 40 | combiningMarkTone _       = Nothing
 41 | 
 42 | 
 43 | data Phone = Phone {
 44 |     sound :: Text.Text,
 45 |     tone  :: Tone
 46 |   }
 47 | 
 48 | instance Show Phone where
 49 |     show yin = Text.unpack (sound yin) ++ show (tone yin)
 50 | 
 51 | fromAccented :: Text.Text -> Phone
 52 | fromAccented s = go [] Nothing $ Text.unpack $ Text.normalize Text.NFD s
 53 |   where go !tser !mb_tone cs = case cs of
 54 |             []           -> Phone { sound = Text.pack (reverse tser), tone = fromMaybe Neutral mb_tone }
 55 |             (c:cs)       -> case combiningMarkTone c of
 56 |                               Just tone -> go tser     (jst tone) cs
 57 |                               Nothing   -> go (c:tser) mb_tone    cs
 58 |           where jst tone' = case mb_tone of
 59 |                               Just tone | tone /= tone' -> error $ "Conflicting tones " ++ show tone ++ " and " ++ show tone' ++ " in " ++ Text.unpack s
 60 |                               _                         -> Just tone' -- Allow multiple tone marks of the same kind, even if it is technically incorrect
 61 | 
 62 | -- Places an accent mark on the Pinyin (counting {ü} as a vowel) according to these rules (from <http://talkbank.org/pinyin/Tone_marks.php>):
 63 | --
 64 | --  1. If there is only one vowel, it takes the diacritic.
 65 | --  2. If there is more than one vowel, then the vowels {a}, {e}, or {o} take the diacritic.
 66 | --  3. If the vowel cluster is {ao}, then {a} takes the diacritic.
 67 | --  4. If the vowel cluster is {iu} or {ui}, the last letter takes the diacritic.
 68 | toAccented :: Phone -> Text.Text
 69 | toAccented yin = Text.normalize Text.NFC $ Text.pack $ go $ Text.unpack $ Text.normalize Text.NFC $ sound yin -- Compose the sound first so that a decomposed ü (u + U+0308) is seen as one vowel
 70 |   where go cs = case span isVowel cs of
 71 |           ([],  [])   -> [] -- All pinyin contain a vowel, so this can only happen when the pinyin is in fact invalid
 72 |           ([],  c:cs) -> c:go cs
 73 |           (vws, cs)   -> go' vws ++ cs
 74 | 
 75 |         go' :: String -> String
 76 |         -- 1. If there is only one vowel, it takes the diacritic.
 77 |         go' [vw] = vw : mark
 78 |         -- 2. If there is more than one vowel, then the vowels {a}, {e}, or {o} take the diacritic.
 79 |         go' vws | (vws1, vw:vws2) <- span (\vw -> not (isA vw || isE vw || isO vw)) vws = vws1 ++ (vw : mark ++ vws2)
 80 |         -- 3. If the vowel cluster is {ao}, then {a} takes the diacritic (rule 2 already catches this cluster; kept to mirror the rule list).
 81 |         go' [vw1, vw2] | isA vw1 && isO vw2 = vw1 : mark ++ [vw2]
 82 |         -- 4. If the vowel cluster is {iu} or {ui}, the last letter takes the diacritic.
 83 |         go' [vw1, vw2] | (isI vw1 && isU vw2) || (isU vw1 && isI vw2) = [vw1, vw2] ++ mark
 84 |         -- Default to just after the first vowel
 85 |         go' (vw:vws) = vw : mark ++ vws
 86 | 
 87 |         isA, isE, isI, isO, isU, isVowel :: Char -> Bool
 88 | 
 89 |         isA 'a' = True
 90 |         isA 'A' = True
 91 |         isA _   = False
 92 | 
 93 |         isE 'e' = True
 94 |         isE 'E' = True
 95 |         isE _   = False
 96 | 
 97 |         isI 'i' = True
 98 |         isI 'I' = True
 99 |         isI _   = False
100 | 
101 |         isO 'o' = True
102 |         isO 'O' = True
103 |         isO _   = False
104 | 
105 |         isU 'u' = True
106 |         isU 'U' = True
107 |         isU _   = False
108 | 
109 |         isVowel c = isA c || isE c || isI c || isO c || isU c || c == 'ü' || c == 'Ü' -- Without ü here, sounds such as "lü" or "nü" would never receive their tone mark
110 | 
111 |         mark = maybeToList (toneCombiningMark (tone yin))
112 | 


--------------------------------------------------------------------------------
/CJK/Data/Unihan/Variants.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE OverloadedStrings #-}
  2 | module CJK.Data.Unihan.Variants (
  3 |     SemanticVariantType(..),
  4 |     VariantSource, VariantCitation, Variant,
  5 |     compatibilityVariants, zVariants,
  6 |     semanticVariants, specializedSemanticVariants,
  7 |     simplifiedVariants, traditionalVariants
  8 |   ) where
  9 | 
 10 | import CJK.Utilities
 11 | 
 12 | import Control.Applicative
 13 | 
 14 | import qualified Data.Text as Text
 15 | import qualified Data.Text.Lazy as TextL
 16 | import Data.Attoparsec.Text
 17 | 
 18 | import Data.Char
 19 | import Data.Maybe
 20 | import qualified Data.Map as M
 21 | import Data.List
 22 | 
 23 | import System.IO.Unsafe
 24 | 
 25 | 
 26 | data SemanticVariantType = T -- T for tòng, U+540C 同. The indicated source explicitly indicates the two are the same (e.g., by saying that the one character is “the same as” the other).
 27 |                          | B -- B for bù, U+4E0D 不. The source explicitly indicates that the two are used improperly one for the other.
 28 |                          | Z -- Z for zhèng, U+6B63 正. The source explicitly indicates that the given character is the preferred form
 29 |                          | F -- F for fán, U+7E41 繁. The source explicitly indicates that the given character is the traditional form.
 30 |                          | J -- J for jiǎn U+7C21 簡/U+7B80 简. The source explicitly indicates that the given character is the simplified form.
 31 |                          deriving (Eq, Ord, Show)
 32 | 
 33 | type VariantSource = Text.Text
 34 | type VariantCitation = [(VariantSource, [SemanticVariantType])]
 35 | 
 36 | type Variant = (Char, VariantCitation)
 37 | 
 38 | -- | The compatibility decomposition for this ideograph
 39 | compatibilityVariants :: Char -> [Char]
 40 | compatibilityVariants c = case variants of VMS mp _ _ _ _ _ -> M.findWithDefault [] c mp
 41 | 
 42 | -- | A semantic variant is an x- or y-variant with similar or identical meaning which can generally be used in place of the indicated character
 43 | semanticVariants :: Char -> [Variant]
 44 | semanticVariants c = case variants of VMS _ mp _ _ _ _ -> M.findWithDefault [] c mp
 45 | 
 46 | -- | Simplified Chinese variant(s) for this character
 47 | simplifiedVariants :: Char -> [Char]
 48 | simplifiedVariants c = case variants of VMS _ _ mp _ _ _ -> M.findWithDefault [] c mp
 49 | 
 50 | -- | A specialized semantic variant is an x- or y-variant with similar or identical meaning only in certain contexts (such as accountants’ numerals)
 51 | specializedSemanticVariants :: Char -> [Variant]
 52 | specializedSemanticVariants c = case variants of VMS _ _ _ mp _ _ -> M.findWithDefault [] c mp
 53 | 
 54 | -- | Traditional Chinese variant(s) for this character
 55 | traditionalVariants :: Char -> [Char]
 56 | traditionalVariants c = case variants of VMS _ _ _ _ mp _ -> M.findWithDefault [] c mp
 57 | 
 58 | -- | The z-variant(s) for this character
 59 | zVariants :: Char -> [Variant]
 60 | zVariants c = case variants of VMS _ _ _ _ _ mp -> M.findWithDefault [] c mp
 61 | 
 62 | 
 63 | type VariantMap      = M.Map Char [Char]
 64 | type CitedVariantMap = M.Map Char [Variant]
 65 | data VariantsMap = VMS !VariantMap !CitedVariantMap !VariantMap !CitedVariantMap !VariantMap !CitedVariantMap
 66 |                  deriving (Show) -- Useful for debugging in GHCi
 67 | 
 68 | emptyVariantsMap :: VariantsMap
 69 | emptyVariantsMap = VMS M.empty M.empty M.empty M.empty M.empty M.empty
 70 | 
 71 | unionVariantsMap :: VariantsMap -> VariantsMap -> VariantsMap
 72 | unionVariantsMap (VMS a1 a2 a3 a4 a5 a6) (VMS b1 b2 b3 b4 b5 b6)
 73 |   = VMS (M.unionWith (++) a1 b1) (M.unionWith (++) a2 b2) (M.unionWith (++) a3 b3) (M.unionWith (++) a4 b4) (M.unionWith (++) a5 b5) (M.unionWith (++) a6 b6)
 74 | 
 75 | 
 76 | {-# NOINLINE contents #-}
 77 | contents :: TextL.Text
 78 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_Variants.txt")
 79 | 
 80 | variants :: VariantsMap
 81 | variants = parseLazy fileP contents
 82 | 
 83 | 
 84 | fileP :: Parser VariantsMap
 85 | fileP = fmap (foldl' unionVariantsMap emptyVariantsMap) (lineP `manyTill` endOfInput)
 86 | 
 87 | lineP :: Parser VariantsMap -- One line of the file: a data record, a '#' comment, or a blank line
 88 | lineP = do { c <- charP <* skipSpace; dataP <- variantP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator }
 89 |     <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyVariantsMap
 90 |     <|> skipTrueSpace *> lineTerminator *> pure emptyVariantsMap -- skipTrueSpace always succeeds, so repeating it with manyTill would loop forever on a malformed line instead of failing
 91 |     <?> "line"
 92 | 
 93 | variantP :: Char -> Parser (Parser VariantsMap)
 94 | variantP c = string "kCompatibilityVariant"       *> pure (liftA (\x -> VMS (mk x) M.empty M.empty M.empty M.empty M.empty) charsP)
 95 |          <|> string "kSemanticVariant"            *> pure (liftA (\x -> VMS M.empty (mk x) M.empty M.empty M.empty M.empty) variantsP)
 96 |          <|> string "kSimplifiedVariant"          *> pure (liftA (\x -> VMS M.empty M.empty (mk x) M.empty M.empty M.empty) charsP)
 97 |          <|> string "kSpecializedSemanticVariant" *> pure (liftA (\x -> VMS M.empty M.empty M.empty (mk x) M.empty M.empty) variantsP)
 98 |          <|> string "kTraditionalVariant"         *> pure (liftA (\x -> VMS M.empty M.empty M.empty M.empty (mk x) M.empty) charsP)
 99 |          <|> string "kZVariant"                   *> pure (liftA (\x -> VMS M.empty M.empty M.empty M.empty M.empty (mk x)) variantsP)
100 |          <?> "variant"
101 |   where mk x = M.singleton c x
102 | 
103 | charsP :: Parser [Char]
104 | charsP = charP `sepBy1` skipTrueSpace
105 | 
106 | variantsP :: Parser [Variant]
107 | variantsP = liftA2 (,) charP variantCitationP `sepBy1` skipTrueSpace
108 | 
109 | semanticVariantTypeP :: Parser SemanticVariantType -- Parses one of the type letters T, B, Z, F or J
110 | semanticVariantTypeP = T <$ char 'T'
111 |                    <|> B <$ char 'B'
112 |                    <|> Z <$ char 'Z'
113 |                    <|> F <$ char 'F'
114 |                    <|> J <$ char 'J'
115 |                    <?> "semantic variant type"
116 | 
117 | variantCitationP :: Parser VariantCitation
118 | variantCitationP = char '<' *> (citedSourceP `sepBy1` char ',')
119 |                <|> pure [] -- Z-variants are commonly uncited
120 |                <?> "variant citation"
121 |   where citedSourceP = (,) <$> takeWhile1 isAlphaNum <*> typesP -- A source name, optionally followed by ':' and its type letters
122 |         typesP = option [] (char ':' *> many1 semanticVariantTypeP)
123 | 


--------------------------------------------------------------------------------
/CJK/Data/Unihan/RadicalStrokeCounts.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE OverloadedStrings #-}
  2 | module CJK.Data.Unihan.RadicalStrokeCounts (
  3 |     -- * Dictionary- and standard-consistent radicals
  4 |     IsSimplifiedKangXi, unicode, kangXi, kanWa,
  5 | 
  6 |     -- * Language-consistent radicals
  7 |     korean, japanese,
  8 |     
  9 |     -- * Font-consistent radicals
 10 |     AdobeJapan1_6(..), adobeJapan1_6
 11 |   ) where
 12 | 
 13 | import CJK.Data.Types
 14 | import CJK.Utilities
 15 | 
 16 | import Control.Applicative
 17 | 
 18 | import qualified Data.Text as Text
 19 | import qualified Data.Text.Lazy as TextL
 20 | import Data.Attoparsec.Text
 21 | 
 22 | import Data.Char
 23 | import Data.Maybe
 24 | import qualified Data.Map as M
 25 | import Data.List
 26 | 
 27 | import System.IO.Unsafe
 28 | 
 29 | 
 30 | -- | Whether the character is formed from the simplified version of the radical
 31 | type IsSimplifiedKangXi = Bool
 32 | 
 33 | data AdobeJapan1_6 = AJ1_6 {
 34 |     aJ1_6IsDirect           :: Bool, -- ^ True if the Unicode code point maps directly to the Adobe-Japan1-6 CID, or False if it is a variant form which is not directly encoded
 35 |     aJ1_6CID                :: Int,  -- ^ The ID of the character in the Adobe-Japan1-6 font
 36 |     aJ1_6RadicalStrokeCount :: RadicalStrokeCount (KangXiRadical, StrokeCount)
 37 |   } deriving (Show) -- Useful for debugging in GHCi
 38 | 
 39 | -- | Radical/stroke count in the Adobe-Japan1-6 font
 40 | --
 41 | -- This data is unusual in that it explicitly includes the stroke count for the form that the radical takes in the glyph.
 42 | adobeJapan1_6 :: Char -> [AdobeJapan1_6]
 43 | adobeJapan1_6 c = M.findWithDefault [] c (kRSAdobe_Japan1_6 strokeCounts)
 44 | 
 45 | -- | Radical/stroke counts usually used in Japanese
 46 | japanese :: Char -> [RadicalStrokeCount KangXiRadical]
 47 | japanese c = M.findWithDefault [] c (kRSJapanese strokeCounts)
 48 | 
 49 | -- | Radical/stroke counts consistent with the KangXi dictionary
 50 | kangXi :: Char -> [RadicalStrokeCount KangXiRadical]
 51 | kangXi c = M.findWithDefault [] c (kRSKangXi strokeCounts)
 52 | 
 53 | -- | Radical/stroke counts consistent with the Morohashi dictionary
 54 | kanWa :: Char -> [RadicalStrokeCount KangXiRadical]
 55 | kanWa c = M.findWithDefault [] c (kRSKanWa strokeCounts)
 56 | 
 57 | -- | Radical/stroke counts usually used in Korean
 58 | korean :: Char -> [RadicalStrokeCount KangXiRadical]
 59 | korean c = M.findWithDefault [] c (kRSKorean strokeCounts)
 60 | 
 61 | -- | Radical/stroke count consistent with ISO/IEC 10646
 62 | --
 63 | -- The first value in the returned list, if any, is equal to the normative radical-stroke value defined in ISO/IEC 10646.
 64 | unicode :: Char -> [RadicalStrokeCount (KangXiRadical, IsSimplifiedKangXi)]
 65 | unicode c = M.findWithDefault [] c (kRSUnicode strokeCounts)
 66 | 
 67 | 
 68 | data StrokeCountsMap = SMS  {
 69 |     kRSAdobe_Japan1_6 :: !(M.Map Char [AdobeJapan1_6]),
 70 |     kRSJapanese       :: !(M.Map Char [RadicalStrokeCount KangXiRadical]),
 71 |     kRSKangXi         :: !(M.Map Char [RadicalStrokeCount KangXiRadical]),
 72 |     kRSKanWa          :: !(M.Map Char [RadicalStrokeCount KangXiRadical]),
 73 |     kRSKorean         :: !(M.Map Char [RadicalStrokeCount KangXiRadical]),
 74 |     kRSUnicode        :: !(M.Map Char [RadicalStrokeCount (KangXiRadical, IsSimplifiedKangXi)])
 75 |   } deriving (Show) -- Useful for debugging in GHCi
 76 | 
 77 | emptyStrokeCountsMap :: StrokeCountsMap
 78 | emptyStrokeCountsMap = SMS M.empty M.empty M.empty M.empty M.empty M.empty
 79 | 
 80 | unionStrokeCountsMap :: StrokeCountsMap -> StrokeCountsMap -> StrokeCountsMap
 81 | unionStrokeCountsMap (SMS a1 a2 a3 a4 a5 a6) (SMS b1 b2 b3 b4 b5 b6)
 82 |   = SMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4 b4) (plus a5 b5) (plus a6 b6)
 83 |   where plus = M.unionWith (error "unionStrokeCountsMap: impossible") -- There is at most one line for each (character, field name) combination
 84 | 
 85 | {-# NOINLINE contents #-}
 86 | contents :: TextL.Text
 87 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_RadicalStrokeCounts.txt")
 88 | 
 89 | strokeCounts :: StrokeCountsMap
 90 | strokeCounts = parseLazy fileP contents
 91 | 
 92 | 
 93 | fileP :: Parser StrokeCountsMap
 94 | fileP = fmap (foldl' unionStrokeCountsMap emptyStrokeCountsMap) (lineP `manyTill` endOfInput)
 95 | 
 96 | lineP :: Parser StrokeCountsMap
 97 | lineP = do { c <- charP <* skipSpace; dataP <- strokeCountP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator }
 98 |     <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyStrokeCountsMap
 99 |     <|> manyTill skipTrueSpace lineTerminator *> pure emptyStrokeCountsMap
100 |     <?> "line"
101 | 
-- | Recognise a radical/stroke count field name and select the parser for its value.
--
-- The outer parser consumes only the field name. The parser it returns reads one or more
-- entries separated by 'skipTrueSpace' and yields an otherwise empty 'StrokeCountsMap'
-- in which that field maps @c@ to the entries.
strokeCountP :: Char -> Parser (Parser StrokeCountsMap)
strokeCountP c = field "kRSAdobe_Japan1_6" rsAdobe_Japan1_6P   (\m -> emptyStrokeCountsMap { kRSAdobe_Japan1_6 = m })
             <|> field "kRSJapanese"       radicalStrokeCountP (\m -> emptyStrokeCountsMap { kRSJapanese       = m })
             <|> field "kRSKangXi"         radicalStrokeCountP (\m -> emptyStrokeCountsMap { kRSKangXi         = m })
             <|> field "kRSKanWa"          radicalStrokeCountP (\m -> emptyStrokeCountsMap { kRSKanWa          = m })
             <|> field "kRSKorean"         radicalStrokeCountP (\m -> emptyStrokeCountsMap { kRSKorean         = m })
             <|> field "kRSUnicode"        rsUnicodeP          (\m -> emptyStrokeCountsMap { kRSUnicode        = m })
  where
    field :: Text.Text -> Parser a -> (M.Map Char [a] -> StrokeCountsMap) -> Parser (Parser StrokeCountsMap)
    field name entryP place = string name *> pure (liftA (place . M.singleton c) (entryP `sepBy1` skipTrueSpace))
110 | 
-- | One Adobe-Japan1-6 entry: @C@ or @V@, then @+@ and the CID, then @+@ and three
-- decimal numbers separated by dots (radical, the radical's stroke count, stroke count)
rsAdobe_Japan1_6P :: Parser AdobeJapan1_6
rsAdobe_Japan1_6P = do
    isDirect <- char 'C' *> pure True <|> char 'V' *> pure False
    cid <- char '+' *> decimal
    radical <- char '+' *> decimal
    radicalStrokes <- char '.' *> decimal
    strokes <- char '.' *> decimal
    return (AJ1_6 isDirect cid (RSC (KangXi radical, radicalStrokes) strokes))
116 | 
-- | A radical number, an optional apostrophe (recorded as the 'IsSimplifiedKangXi' flag),
-- then a dot and the stroke count
rsUnicodeP :: Parser (RadicalStrokeCount (KangXiRadical, IsSimplifiedKangXi))
rsUnicodeP = do
    radical <- decimal
    is_simp <- canParse (char '\'')
    strokes <- char '.' *> decimal
    return (RSC (KangXi radical, is_simp) strokes)
119 | 
-- | A radical number and a stroke count, separated by a dot
radicalStrokeCountP :: Parser (RadicalStrokeCount KangXiRadical)
radicalStrokeCountP = RSC <$> (KangXi <$> decimal) <*> (char '.' *> decimal)
122 | 


--------------------------------------------------------------------------------
/CJK/Data/CEDICT.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE OverloadedStrings, PatternGuards #-}
  2 | module CJK.Data.CEDICT (
  3 |     Reading, showReading, showSpacelessReading, showReadingAccented, showSpacelessReadingAccented,
  4 |     Word(..), showHeadWord,
  5 |     DefinitionToken(..), WordDefinition(..), Definition(..),
  6 |     entries
  7 |   ) where
  8 | 
  9 | import CJK.Utilities
 10 | import CJK.Data.Internal
 11 | import CJK.Data.Pinyin
 12 | 
 13 | import Control.Applicative
 14 | import Data.Maybe
 15 | import Data.List (intercalate)
 16 | 
 17 | import qualified Data.ByteString.Lazy as BS
 18 | 
 19 | import qualified Data.Text as Text
 20 | import qualified Data.Text.Lazy as TextL
 21 | 
 22 | import Data.Char
 23 | import Data.Monoid
 24 | import Data.Attoparsec.Combinator
 25 | import Data.Attoparsec.Text hiding (parse, eitherResult)
 26 | import Data.Attoparsec.Text.Lazy
 27 | 
 28 | import System.IO.Unsafe
 29 | 
 30 | import Prelude hiding (takeWhile)
 31 | 
 32 | 
 33 | type Reading = [Either Text.Text Phone] -- Each element is either a toned pinyin syllable (Right) or any other text kept verbatim (Left); see yinP
 34 | 
 35 | showReading :: Reading -> String
 36 | showReading yins = intercalate " " (map (either Text.unpack show) yins)
 37 | 
 38 | showSpacelessReading :: Reading -> String
 39 | showSpacelessReading yins = concat (map (either Text.unpack show) yins)
 40 | 
 41 | showReadingAccented :: Reading -> String
 42 | showReadingAccented yins = intercalate " " (map (either Text.unpack (Text.unpack . toAccented)) yins)
 43 | 
 44 | showSpacelessReadingAccented :: Reading -> String
 45 | showSpacelessReadingAccented yins = concat (map (either Text.unpack (Text.unpack . toAccented)) yins)
 46 | 
 47 | 
 48 | data Word = Word {
 49 |     traditional :: [Char],
 50 |     simplified  :: [Char],
 51 |     reading     :: Reading
 52 |   }
 53 | 
 54 | bracketed s = "[" ++ s ++ "]"
 55 | 
 56 | instance Show Word where
 57 |     show word | simplified word == traditional word = traditional word                           ++ bracketed (showReading (reading word))
 58 |               | otherwise                           = traditional word ++ "|" ++ simplified word ++ bracketed (showReading (reading word))
 59 | 
 60 | -- | Show a word as in the head of a dictionary entry
 61 | showHeadWord :: Word -> String
 62 | showHeadWord word = traditional word ++ " " ++ simplified word ++ " " ++ bracketed (showReading (reading word))
 63 | 
 64 | mkWord :: [Char] -> [Char] -> Reading -> Word
 65 | -- Build a Word, fixing known problems in the dictionary on the way. Calls error if the traditional and simplified forms differ in length, or if no fix makes the number of readings match the number of characters. Special cases whose guard fails fall through to the generic checks.
 66 | mkWord trad simp yins
 67 |   | ntrad /= nsimp = error $ "mkWord: differing numbers of traditional and simplified characters (" ++ show trad ++ " vs. " ++ show simp ++ ")"
 68 |   | otherwise      = case (trad, simp) of
 69 |     ("中國左翼作家聯盟",           "中国左翼作家联盟")           | nyins == 6 -> Word trad simp ([Right (Phone "Zhong" Flat), Right (Phone "guo" Rising)] ++ yins)
 70 |     ("甘孜藏族自治州甘孜藏族自治州", "甘孜藏族自治州甘孜藏族自治州") | nyins == 7 -> Word "甘孜藏族自治州" "甘孜藏族自治州" yins
 71 |     ("睿宗",                     "睿宗")                    | nyins == 3 -> Word trad simp (tail yins)
 72 |     ("泰米爾納德",                "泰米尔纳德")                | nyins == 6 -> Word trad simp (init yins)
 73 |     ("Zhou周文王",               "Zhou周文王")               | nyins == 3 -> Word "周文王" "周文王" yins
 74 |     ("美國５１區",                "美国５１区")                | nyins == 6 -> Word "美國五十一區" "美国五十一区" yins
 75 |     _ -- Check for missing 市 suffix which is present in yins in examples like 棗莊|枣庄
 76 |       | ntrad + 1 == nyins, Right (Phone "shi" Falling) <- last yins -> Word trad simp (init yins)
 77 |       -- Check for 市 suffix which is missing in yins in examples like 鹿泉市
 78 |       | ntrad == nyins + 1, '市'                        <- last trad -> Word trad simp (yins ++ [Right (Phone "shi" Falling)])
 79 |       -- Last-ditch check for an unhandled error
 80 |       | ntrad /= nyins -> error $ "mkWord: differing numbers of characters and readings (" ++ show trad ++ " vs. " ++ bracketed (showReading yins) ++ ")"
 81 |       | otherwise      -> Word trad simp yins
 82 |   where ntrad = length trad
 83 |         nsimp = length simp
 84 |         nyins = length yins
 85 | 
 86 | 
 87 | data DefinitionToken = PlainToken Text.Text
 88 |                      | WordToken Word
 89 | 
 90 | instance Show DefinitionToken where
 91 |     show (PlainToken text) = Text.unpack text
 92 |     show (WordToken word)  = show word
 93 | 
 94 | 
 95 | data WordDefinition = WordClassifiers [Word]
 96 |                     | WordDefinition [DefinitionToken]
 97 | 
 98 | instance Show WordDefinition where
 99 |     show (WordClassifiers wrds)  = "CL:" ++ intercalate "," (map show wrds)
100 |     show (WordDefinition tokens) = concatMap show tokens
101 | 
102 | 
-- | A dictionary entry: the headword and its definitions
data Definition = Definition
  { word        :: Word
  , definitions :: [WordDefinition]
  }
107 | 
-- Shows the headword (as by 'showHeadWord') followed by the definitions, each delimited by slashes
instance Show Definition where
    show definition = concat
      [ showHeadWord (word definition)
      , " /"
      , intercalate "/" (map show (definitions definition))
      , "/"
      ]
110 | 
111 | 
-- The text of the CEDICT data file, read via 'unsafePerformIO'.
-- NOINLINE stops GHC from inlining (and so possibly repeating) the read at use sites.
{-# NOINLINE contents #-}
contents :: TextL.Text
contents = unsafePerformIO $ readUTF8DataFile "data/cedict_1_0_ts_utf-8_mdbg.txt"
115 | 
-- | All entries of the dictionary file, obtained by running 'fileP' over its contents
entries :: [Definition]
entries = fileP `parseLazy` contents
118 | 
-- | Parse as many lines as possible, keeping only those that hold an entry
fileP :: Parser [Definition]
fileP = catMaybes <$> many lineP
121 | 
-- | Parse one line of the dictionary file.
--
-- A line starting with @#@ is a comment and yields 'Nothing'. Any other line must be an
-- entry: traditional form, simplified form, bracketed reading, then the definitions.
lineP :: Parser (Maybe Definition)
lineP = commentP <|> entryP
  where
    commentP = char '#' *> manyTill anyChar lineTerminator *> pure Nothing

    entryP = do
      trad <- nonSpaceP
      simp <- nonSpaceP
      yins <- readingP <* space
      defs <- definitionsP <* lineTerminator
      return (Just (Definition { word = mkWord trad simp yins, definitions = defs }))
125 | 
-- | A reading in square brackets: one or more elements, each pair separated by one whitespace character
readingP :: Parser Reading
readingP = do
    _ <- char '['
    yins <- sepBy1 yinP space
    _ <- char ']'
    return yins
128 | 
-- | One element of a reading: a toned pinyin syllable if possible, otherwise the
-- run of characters up to the next whitespace or @]@
yinP :: Parser (Either Text.Text Phone)
yinP = Right <$> tonedPinyinP
   <|> Left  <$> takeWhile1 isUntonedChar -- CEDICT explicitly writes tone 5, so any missing tones must be for non-Chinese
  where isUntonedChar c = not (isSpace c || c == ']')
132 | 
-- | A tone digit from @1@ to @5@
toneP :: Parser Tone
toneP = digitFor '1' Flat
    <|> digitFor '2' Rising
    <|> digitFor '3' FallingRising
    <|> digitFor '4' Falling
    <|> digitFor '5' Neutral
  where digitFor d tone = char d *> pure tone
139 | 
-- | A leading @/@ followed by one or more definitions, each terminated by @/@
definitionsP :: Parser [WordDefinition]
definitionsP = do
    _ <- char '/'
    many1 (definitionP <* char '/')
142 | 
-- | A single definition: a @CL:@ classifier list if it starts that way, otherwise a run of tokens
definitionP :: Parser WordDefinition
definitionP = classifiersP <|> tokensP
  where
    -- In entries like 個|个[ge4] or CL:個|个[ge4],隻|只[zhi1] the characters do not have to have a space before them, so special case it
    classifiersP = WordClassifiers <$> (string "CL:" *> sepBy1 wordP separatorP)
    separatorP = char ',' >> skipWhile isSpace
    tokensP = WordDefinition <$> many tokenP
146 | 
-- | One token of a definition. The alternatives are tried in this order: an embedded
-- word, a @Hoklo:@ span, a run of ordinary text, then a run of spacing and @(@
tokenP :: Parser DefinitionToken
tokenP = embeddedWordP <|> hokloP <|> textP <|> spacingP
  where
    embeddedWordP = WordToken <$> wordP

    -- There are two rogue entries containing Hoklo: 無甚物[bô-siáⁿ-mi̍h]
    hokloP = do
      hoklo <- string "Hoklo:"
      chars <- takeWhile (/= ']')
      end <- string "]"
      return (PlainToken (hoklo <> chars <> end))

    textP = PlainToken <$> takeWhile1 (\c -> not (isSpace c || c == '(' || c == '/'))

    spacingP = PlainToken <$> takeWhile1 (\c -> isTrueSpace c || c == '(')
152 | 
-- | A word embedded in a definition: the traditional form, optionally @|@ and the
-- simplified form, then the bracketed reading. Without a simplified form the
-- traditional form is used for both.
wordP :: Parser Word
wordP = do
    trad <- chineseP
    mb_simp <- optional (char '|' *> chineseP)
    yins <- readingP
    return (mkWord trad (fromMaybe trad mb_simp) yins)
  where
    chineseP :: Parser [Char]
    chineseP = many1 (satisfy isWordChar)

    -- Anything except whitespace and the delimiters @/@, @|@ and @[@
    isWordChar c = not (isSpace c || c == '/' || c == '|' || c == '[')
158 | 
-- | One or more non-whitespace characters, followed by a single whitespace character which is dropped
nonSpaceP :: Parser [Char]
nonSpaceP = do
    chars <- many1 (satisfy (not . isSpace))
    _ <- space
    return chars
161 | 


--------------------------------------------------------------------------------
/CJK/Data/Unihan/DictionaryLikeData.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE OverloadedStrings #-}
  2 | module CJK.Data.Unihan.DictionaryLikeData (
  3 |     cangjie,
  4 |     CheungBauer(..), cheungBauer,
  5 |     cihai,
  6 |     Fenn(..), fenn,
  7 |     fourCornerCode,
  8 |     frequency,
  9 |     gradeLevel,
 10 |     hdzRadBreak,
 11 |     hkGlyph,
 12 |     phonetic,
 13 |     totalStrokes
 14 |   ) where
 15 | 
 16 | import qualified CJK.Data.Jyutping as Jyutping
 17 | import CJK.Data.Internal
 18 | import CJK.Data.Types
 19 | import CJK.Utilities
 20 | 
 21 | import Control.Applicative
 22 | 
 23 | import qualified Data.Text as Text
 24 | import qualified Data.Text.Lazy as TextL
 25 | import Data.Attoparsec.Text
 26 | 
 27 | import Data.Char
 28 | import Data.Maybe
 29 | import qualified Data.Map as M
 30 | import Data.Monoid
 31 | import Data.List
 32 | 
 33 | import System.IO.Unsafe
 34 | 
 35 | 
 36 | data CheungBauer = CB {
 37 |     cbRadicalStrokeCount :: RadicalStrokeCount KangXiRadical,
 38 |     cbCangjie            :: Maybe CangjieInputCode,
 39 |     cbReading            :: [Jyutping.Phone] -- ^ Readings are in alphabetical order
 40 |   } deriving (Show) -- Useful for debugging in GHCi
 41 | 
 42 | data Fenn = Fenn {
 43 |     fennSoothill  :: Maybe Int, -- ^ Soothill number of the character's phonetic, if any
 44 |     fennFrequency :: Maybe Int  -- ^ Number from 1 to 11 indicating roughly which group of 500 most popular characters this character is included in (i.e. 1 is the first 500 characters, 2 the next 500 characters etc). Nothing if the character is rare.
 45 |   } deriving (Show) -- Useful for debugging in GHCi
 46 | 
 47 | -- | The cangjie input code for the character
 48 | cangjie :: Char -> Maybe CangjieInputCode
 49 | cangjie c = M.lookup c (kCangjie dictionaryLikes)
 50 | 
 51 | -- | Data regarding the character in Cheung Kwan-hin and Robert S. Bauer, _The Representation of Cantonese with Chinese Characters_, Journal of Chinese Linguistics, Monograph Series Number 18, 2002
 52 | cheungBauer :: Char -> [CheungBauer]
 53 | cheungBauer c = M.findWithDefault [] c (kCheungBauer dictionaryLikes)
 54 | 
 55 | -- | The position(s) of this character in the Cihai (辭海) dictionary, single volume edition, published in Hong Kong by the Zhonghua Bookstore, 1983 (reprint of the 1947 edition), ISBN 962-231-005-2.
 56 | --
 57 | -- The position is indicated by a decimal number. The digits to the left of the decimal are the page number. The first digit after the decimal is the row on the page, and the remaining two digits after the decimal are the position on the row.
 58 | cihai :: Char -> [Text.Text]
 59 | cihai c = M.findWithDefault [] c (kCihaiT dictionaryLikes)
 60 | 
 61 | -- | Data on the character from The Five Thousand Dictionary (aka Fenn’s Chinese-English Pocket Dictionary) by Courtenay H. Fenn, Cambridge, Mass.: Harvard University Press, 1979.
 62 | fenn :: Char -> [Fenn]
 63 | fenn c = M.findWithDefault [] c (kFenn dictionaryLikes)
 64 | 
 65 | -- | The four-corner code(s) for the character
 66 | --
 67 | -- The four-corner system assigns each character a four-digit code from 0 through 9. The digit is derived from the “shape” of the four corners of the character (upper-left, upper-right, lower-left, lower-right). An optional fifth digit
 68 | -- can be used to further distinguish characters; the fifth digit is derived from the shape in the character’s center or region immediately to the left of the fourth corner.
 69 | --
 70 | -- The four-corner system is now used only rarely. Full descriptions are available online, e.g., at <http://en.wikipedia.org/wiki/Four_corner_input>.
 71 | fourCornerCode :: Char -> [Text.Text]
 72 | fourCornerCode c = M.findWithDefault [] c (kFourCornerCode dictionaryLikes)
 73 | 
 74 | -- | A rough frequency measurement for the character based on analysis of traditional Chinese USENET postings; characters with a kFrequency of 1 are the most common, those with a kFrequency of 2 are less common, and so on, through a kFrequency of 5.
 75 | frequency :: Char -> Maybe Int
 76 | frequency c = M.lookup c (kFrequency dictionaryLikes)
 77 | 
 78 | -- | The primary grade in the Hong Kong school system by which a student is expected to know the character; this data is derived from 朗文初級中文詞典, Hong Kong: Longman, 2001
 79 | gradeLevel :: Char -> Maybe Int
 80 | gradeLevel c = M.lookup c (kGradeLevel dictionaryLikes)
 81 | 
 82 | -- | Does 《漢語大字典》 Hanyu Da Zidian have a radical break beginning at this character’s position? If so, returns the radical and the Hanyu Da Zidian position as in the kHanyu field.
 83 | hdzRadBreak :: Char -> Maybe (Char, HDZEntry)
 84 | hdzRadBreak c = M.lookup c (kHDZRadBreak dictionaryLikes)
 85 | 
 86 | -- | The index of the character in 常用字字形表 (二零零零年修訂本),香港: 香港教育學院, 2000, ISBN 962-949-040-4. This publication gives the “proper” shapes for 4759 characters as used in the Hong Kong school system
 87 | hkGlyph :: Char -> [Int]
 88 | hkGlyph c = M.findWithDefault [] c (kHKGlyph dictionaryLikes)
 89 | 
 90 | -- | The phonetic index for the character from _Ten Thousand Characters: An Analytic Dictionary_, by G. Hugh Casey, S.J. Hong Kong: Kelley and Walsh, 1980
 91 | phonetic :: Char -> [Text.Text]
 92 | phonetic c = M.findWithDefault [] c (kPhonetic dictionaryLikes)
 93 | 
 94 | -- | The total number of strokes in the character (including the radical), that is, the stroke count most commonly associated with the character in modern text using customary fonts.
 95 | --
 96 | -- The first value is preferred for zh-Hans (CN) and the second is preferred for zh-Hant (TW)
 97 | totalStrokes :: Char -> Maybe (StrokeCount, StrokeCount)
 98 | totalStrokes c = M.lookup c (kTotalStrokes dictionaryLikes)
 99 | 
100 | 
-- One map per dictionary-like field of the data file, each keyed by character
data DictionaryLikesMap = DMS
  { kCangjie        :: M.Map Char CangjieInputCode
  , kCheungBauer    :: M.Map Char [CheungBauer]
  , kCihaiT         :: M.Map Char [Text.Text]
  , kFenn           :: M.Map Char [Fenn]
  , kFourCornerCode :: M.Map Char [Text.Text]
  , kFrequency      :: M.Map Char Int
  , kGradeLevel     :: M.Map Char Int
  , kHDZRadBreak    :: M.Map Char (Char, HDZEntry)
  , kHKGlyph        :: M.Map Char [Int]
  , kPhonetic       :: M.Map Char [Text.Text]
  , kTotalStrokes   :: M.Map Char (StrokeCount, StrokeCount)
  }
  deriving (Show) -- Useful for debugging in GHCi
114 | 
-- A 'DictionaryLikesMap' in which every field is the empty map
emptyDictionaryLikesMap :: DictionaryLikesMap
emptyDictionaryLikesMap = DMS
  { kCangjie        = M.empty
  , kCheungBauer    = M.empty
  , kCihaiT         = M.empty
  , kFenn           = M.empty
  , kFourCornerCode = M.empty
  , kFrequency      = M.empty
  , kGradeLevel     = M.empty
  , kHDZRadBreak    = M.empty
  , kHKGlyph        = M.empty
  , kPhonetic       = M.empty
  , kTotalStrokes   = M.empty
  }
117 | 
-- Combine two 'DictionaryLikesMap's field by field.
--
-- There is at most one line for each (character, field name) combination, so the same
-- character is never expected on both sides of a field. If it does happen, the combined
-- value for that character is an error naming this function.
unionDictionaryLikesMap :: DictionaryLikesMap -> DictionaryLikesMap -> DictionaryLikesMap
unionDictionaryLikesMap  (DMS a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11) (DMS b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11)
  = DMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4  b4)  (plus a5  b5)  (plus a6  b6)
        (plus a7 b7) (plus a8 b8) (plus a9 b9) (plus a10 b10) (plus a11 b11)
  where plus = M.unionWith (error "unionDictionaryLikesMap: impossible") -- The message previously named unionReadingsMap, which is not this function
123 | 
-- The text of the dictionary-like data file, read via 'unsafePerformIO'.
-- NOINLINE stops GHC from inlining (and so possibly repeating) the read at use sites.
{-# NOINLINE contents #-}
contents :: TextL.Text
contents = unsafePerformIO $ readUTF8DataFile "data/Unihan/Unihan_DictionaryLikeData.txt"
127 | 
-- The data file parsed with 'fileP'; every lookup function in this module reads from it
dictionaryLikes :: DictionaryLikesMap
dictionaryLikes = fileP `parseLazy` contents
130 | 
131 | 
-- | Parse lines until the end of input and merge their results into a single map
fileP :: Parser DictionaryLikesMap
fileP = foldl' unionDictionaryLikesMap emptyDictionaryLikesMap <$> manyTill lineP endOfInput
134 | 
-- | Parse one line of the dictionary-like data file.
--
-- A line is one of:
--
--  * a data line: a character, a field name, then that field's value;
--
--  * a comment line starting with @#@;
--
--  * a blank line, optionally containing whitespace accepted by 'skipTrueSpace'.
--
-- Comment and blank lines contribute 'emptyDictionaryLikesMap'.
lineP :: Parser DictionaryLikesMap
lineP = dataLineP <|> commentLineP <|> blankLineP <?> "line"
  where
    -- The field name decides which parser reads the rest of the line
    dataLineP = do
      c <- charP <* skipSpace
      valueP <- dictionaryLikeP c <* skipSpace
      valueP <* skipTrueSpace <* lineTerminator

    commentLineP = char '#' *> manyTill anyChar lineTerminator *> pure emptyDictionaryLikesMap

    -- 'skipTrueSpace' is run exactly once. On data lines it sits directly in front of
    -- 'lineTerminator', so it evidently succeeds without consuming anything; repeating
    -- such a parser with 'manyTill' never terminates when the terminator does not follow,
    -- i.e. a malformed line would hang the parser instead of producing a "line" error.
    blankLineP = skipTrueSpace *> lineTerminator *> pure emptyDictionaryLikesMap
140 | 
-- | Recognise a dictionary-like field name and select the parser for its value.
--
-- The outer parser consumes only the field name. The parser it returns reads the value
-- and yields an otherwise empty 'DictionaryLikesMap' in which that field maps @c@ to it.
dictionaryLikeP :: Char -> Parser (Parser DictionaryLikesMap)
dictionaryLikeP c = field "kCangjie"        cangjieP             (\m -> emptyDictionaryLikesMap { kCangjie        = m })
                <|> field "kCheungBauer"    (spaced cheungBauerP) (\m -> emptyDictionaryLikesMap { kCheungBauer    = m })
                <|> field "kCihaiT"         (spaced dottedP)      (\m -> emptyDictionaryLikesMap { kCihaiT         = m })
                <|> field "kFenn"           (spaced fennP)        (\m -> emptyDictionaryLikesMap { kFenn           = m })
                <|> field "kFourCornerCode" (spaced dottedP)      (\m -> emptyDictionaryLikesMap { kFourCornerCode = m })
                <|> field "kFrequency"      decimal               (\m -> emptyDictionaryLikesMap { kFrequency      = m })
                <|> field "kGradeLevel"     decimal               (\m -> emptyDictionaryLikesMap { kGradeLevel     = m })
                <|> field "kHDZRadBreak"    hdzRadBreakP          (\m -> emptyDictionaryLikesMap { kHDZRadBreak    = m })
                <|> field "kHKGlyph"        (spaced decimal)      (\m -> emptyDictionaryLikesMap { kHKGlyph        = m })
                <|> field "kPhonetic"       (spaced phoneticP)    (\m -> emptyDictionaryLikesMap { kPhonetic       = m })
                <|> field "kTotalStrokes"   totalStrokesP         (\m -> emptyDictionaryLikesMap { kTotalStrokes   = m })
  where
    -- Match the field name, then hand back the value parser with its result stored under c
    field :: Text.Text -> Parser a -> (M.Map Char a -> DictionaryLikesMap) -> Parser (Parser DictionaryLikesMap)
    field name valueP place = string name *> pure (liftA (place . M.singleton c) valueP)

    -- One or more values separated by 'skipTrueSpace'
    spaced :: Parser a -> Parser [a]
    spaced valueP = valueP `sepBy1` skipTrueSpace

    -- A run of digits and dots
    dottedP = takeWhile1 (\ch -> isDigit ch || ch == '.')

    -- A run of digits, upper-case ASCII letters and asterisks
    phoneticP = takeWhile1 (\ch -> isDigit ch || isAsciiUpper ch || ch == '*')
154 | 
-- | A cangjie input code: one or more upper-case ASCII letters
cangjieP :: Parser CangjieInputCode
cangjieP = takeWhile1 isCodeLetter
  where isCodeLetter = isAsciiUpper
157 | 
-- | One Cheung-Bauer entry: radical @/@ stroke count, then @;@ and an optional cangjie
-- code, then @;@ and a comma-separated list of jyutping patterns whose expansions are
-- concatenated in order
cheungBauerP :: Parser CheungBauer
cheungBauerP = do
    rsc <- liftA2 RSC (fmap KangXi decimal) (char '/' *> decimal)
    mb_cangjie <- char ';' *> optional cangjieP
    phoness <- char ';' *> sepBy1 jyutpingPatternP (char ',')
    return (CB rsc mb_cangjie (concat phoness))
161 | 
-- | A jyutping pattern, expanded into every phone it stands for.
--
-- The result pairs each possible sound with each possible tone; all tones of the first
-- sound come first, then all tones of the second sound, and so on.
jyutpingPatternP :: Parser [Jyutping.Phone]
jyutpingPatternP = liftA2 expand soundP toneP
  where
    expand sounds tones = Jyutping.Phone <$> sounds <*> tones

    soundP = optionalPrefixP <|> plainP

    -- Some kCheungBauer says [ng]ai1: each following sound is produced first without
    -- and then with the bracketed prefix
    optionalPrefixP = do
      opt <- char '[' *> takeWhile1 (/= ']') <* char ']'
      nexts <- soundP
      return (concatMap (\next -> [next, opt <> next]) nexts)

    plainP = fmap (: []) (takeWhile1 (\ch -> isAsciiUpper ch || isAsciiLower ch))

    -- Some kCheungBauer says min6/2
    toneP = sepBy1 jyutpingToneP (char '/')
171 | 
-- | One Fenn entry: the Soothill group, an optional @a@, then a frequency letter
fennP :: Parser Fenn
fennP = do
    soothill <- groupP
    _ <- optional (char 'a') -- Can't find any info on what the optional 'a' means
    freq <- frequencyP
    return (Fenn soothill freq)
  where
    groupP = char '0' *> pure Nothing -- Characters which have a frequency letter but no Soothill phonetic group
         <|> fmap Just decimal

    -- The alternatives are tried in the order of the table
    frequencyP = foldr1 (<|>) [char letter *> return rank | (letter, rank) <- frequencyLetters]

    frequencyLetters =
      [ ('A', Just 1), ('B', Just 2), ('C', Just 3), ('D', Just 4)
      , ('E', Just 5), ('F', Just 6), ('G', Just 7), ('H', Just 8)
      , ('I', Just 9), ('J', Just 10), ('K', Just 11)
      , ('P', Nothing), ('*', Nothing) -- Conflate these two cases: who really cares?
      ]
189 | 
-- | A radical character, its code point in the form @[U+XXXX]@ (checked but discarded),
-- then @:@ and the Hanyu Da Zidian entry
hdzRadBreakP :: Parser (Char, HDZEntry)
hdzRadBreakP = do
    radical <- anyChar
    _ <- char '[' *> string "U+" *> takeWhile1 isHexDigit *> char ']'
    entry <- char ':' *> hdzEntryP
    return (radical, entry)
192 | 
-- | One stroke count, optionally followed by a second one; if the second is absent
-- the first is used for both components of the result
totalStrokesP :: Parser (Int, Int)
totalStrokesP = do
    simp <- decimal
    mb_trad <- skipTrueSpace *> optional decimal
    return (simp, fromMaybe simp mb_trad)
195 | 


--------------------------------------------------------------------------------
/CJK/Data/Unihan/Readings.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE OverloadedStrings #-}
  2 | module CJK.Data.Unihan.Readings (
  3 |     CharDefinition, definition,
  4 |     -- * Mandarin
  5 |     OccurrenceCount, IsHDZSubstitution,
  6 |     mandarinBestEffort, mandarin, hanyuPinlu, hanyuPinyin, xhc1983,
  7 |     -- * Cantonese
  8 |     cantonese,
  9 |     -- * Ancient Chinese
 10 |     CommonTangCharacter, tang,
 11 |     -- * Korean
 12 |     hangul, korean,
 13 |     -- * Japanese
 14 |     japaneseKun, japaneseOn,
 15 |     -- * Vietnamese
 16 |     vietnamese
 17 |   ) where
 18 | 
 19 | import qualified CJK.Data.Hangul     as Hangul
 20 | import qualified CJK.Data.Jyutping   as Jyutping
 21 | import qualified CJK.Data.KoreanYale as KoreanYale
 22 | import qualified CJK.Data.Pinyin     as Pinyin
 23 | import qualified CJK.Data.QuocNgu    as QuocNgu
 24 | import CJK.Data.Internal
 25 | import CJK.Data.Types
 26 | import CJK.Utilities
 27 | 
 28 | import Control.Applicative
 29 | 
 30 | import qualified Data.Text as Text
 31 | import qualified Data.Text.Lazy as TextL
 32 | import Data.Attoparsec.Text
 33 | 
 34 | import Data.Char
 35 | import Data.Maybe
 36 | import qualified Data.Map as M
 37 | import Data.List
 38 | 
 39 | import System.IO.Unsafe
 40 | 
 41 | 
 42 | type CharDefinition = Text.Text -- The text of one definition of a character, as returned by 'definition'
 43 | 
 44 | -- | The sum total of the frequencies of the pronunciations of the character as given in 《現代漢語頻率詞典》 <Xiandai Hanyu Pinlu Cidian>
 45 | type OccurrenceCount = Int
 46 | 
 47 | -- | Whether the word or morpheme represented in toto or in part by the given character with the given reading occurs more than
 48 | -- four times in the seven hundred poems covered by "T’ang Poetic Vocabulary" by Hugh M. Stimson, Far Eastern Publications, Yale Univ. 1976
 49 | type CommonTangCharacter = Bool
 50 | 
 51 | -- | Whether this reference had an encoded variant substituted for an unencoded character used by the Hànyǔ Dà Zìdiǎn
 52 | type IsHDZSubstitution = Bool
 53 | 
 54 | 
 55 | -- | Returns how to pronounce an ideograph in Mandarin, making the best effort to use all of the CEDICT data to get a good answer.
 56 | -- Readings are returned in approximate frequency order.
 57 | --
 58 | -- This algorithm is based on the Unihan FAQ <http://www.unicode.org/faq/han_cjk.html>, which states that the best way is to use the kHanyuPinlu, kXHC1983,
 59 | -- and kHanyuPinyin fields in that order. The kMandarin field may have some readings the other three do not but should be used with caution. The kHanyuPinlu
 60 | -- field lists the most common readings for ideographs in order of frequency of use and is the most useful for most purposes. The kXHC1983
 61 | -- field contains the most important readings for characters in modern use, and the kHanyuPinyin field contains an exhaustive set of readings
 62 | -- for a large set of characters, but includes obscure readings of historic interest only
 63 | mandarinBestEffort :: Char -> [Pinyin.Phone]
 64 | mandarinBestEffort c = nubBy eq $ map fst (hanyuPinlu c) ++
 65 |                                   concatMap snd (xhc1983 c) ++
 66 |                                   concatMap snd (hanyuPinyin c) ++
 67 |                                   (case mandarin c of Nothing -> []; Just (simp, trad) -> [simp, trad]) -- NB: technically an abuse since this data is differentiated by mainland/Taiwan
 68 |   where yin1 `eq` yin2 = Text.toLower (Pinyin.sound yin1) == Text.toLower (Pinyin.sound yin2) && Pinyin.tone yin1 == Pinyin.tone yin2
 69 | 
 70 | -- | Cantonese pronunciation(s) of this character, written in the jyutping romanization.
 71 | -- Readings are sorted alphabetically, not by frequency; the list is empty when the character has no kCantonese entry.
 72 | --
 73 | -- The Cantonese data are derived from the following sources:
 74 | --  * Casey, G. Hugh, S.J. Ten Thousand Characters: An Analytic Dictionary. Hong Kong: Kelley and Walsh, 1980 (kPhonetic).
 75 | --  * Cheung Kwan-hin and Robert S. Bauer, The Representation of Cantonese with Chinese Characters, Journal of Chinese Linguistics Monograph Series Number 18, 2002.
 76 | --  * Roy T. Cowles, A Pocket Dictionary of Cantonese, Hong Kong: University Press, 1999 (kCowles).
 77 | --  * Sidney Lau, A Practical Cantonese-English Dictionary, Hong Kong: Government Printer, 1977 (kLau).
 78 | --  * Bernard F. Meyer and Theodore F. Wempe, Student’s Cantonese-English Dictionary, Maryknoll, New York: Catholic Foreign Mission Society of America, 1947 (kMeyerWempe).
 79 | --  * 饒秉才, ed. 廣州音字典, Hong Kong: Joint Publishing (H.K.) Co., Ltd., 1989.
 80 | --  * 中華新字典, Hong Kong: 中華書局, 1987.
 81 | --  * 黃港生, ed. 商務新詞典, Hong Kong: The Commercial Press, 1991.
 82 | --  * 朗文初級中文詞典, Hong Kong: Longman, 2001.
 83 | cantonese :: Char -> [Jyutping.Phone]
 84 | cantonese c = fromMaybe [] (M.lookup c (kCantonese readings))
 85 | 
 86 | -- | English definition(s) of this character. Definitions are for modern written Chinese and are usually (but not always) the
 87 | -- same as the definition in other Chinese dialects or non-Chinese languages. In some cases, synonyms are indicated. Fuller variant
 88 | -- information can be found using the various variant fields.
 89 | --
 90 | -- Definitions specific to non-Chinese languages or Chinese dialects other than modern Mandarin are marked, e.g., (Cant.) or (J).
 91 | -- Minor definitions are separated by commas. The list is empty when the character has no kDefinition entry.
 92 | definition :: Char -> [CharDefinition]
 93 | definition c = fromMaybe [] (M.lookup c (kDefinition readings))
 94 | 
 95 | -- | Modern Korean pronunciation(s) of this character in Hangul (empty when there is no kHangul entry).
 96 | hangul :: Char -> [Hangul.Phone]
 97 | hangul c = fromMaybe [] (M.lookup c (kHangul readings))
 98 | 
 99 | -- | Pronunciations and frequencies of this character, based in part on those appearing in
100 | -- 《現代漢語頻率詞典》 <Xiandai Hanyu Pinlu Cidian> (XDHYPLCD) [Modern Standard Beijing Chinese Frequency Dictionary].
101 | --
102 | -- Where more than one pronunciation exists, these are sorted by descending frequency.
103 | -- The occurrence count indicates the sum total of the frequencies of the pronunciations of the character as given in HYPLCD.
104 | --
105 | -- Yields an empty list for a character without a kHanyuPinlu entry. You may want to use 'mandarinBestEffort' instead of this function.
106 | hanyuPinlu :: Char -> [(Pinyin.Phone, OccurrenceCount)]
107 | hanyuPinlu c = fromMaybe [] (M.lookup c (kHanyuPinlu readings))
108 | 
109 | -- | The 漢語拼音 Hànyǔ Pīnyīn reading(s) appearing in the edition of 《漢語大字典》 Hànyǔ Dà Zìdiǎn (HDZ).
110 | --
111 | -- Each element pairs a list of HDZ locations with the pīnyīn readings given there. Where multiple readings are associated
112 | -- with a given mapping, these are ordered as in HDZ (for the most part reflecting relative commonality).
113 | --
114 | -- Individual entries are in the same order as they are found in the Hanyu Da Zidian. This is true both for
115 | -- the locations and the individual readings. While this is generally in the order of utility for modern Chinese, such is not invariably the case.
116 | --
117 | -- The list is empty when there is no kHanyuPinyin entry. You may want to use 'mandarinBestEffort' instead of this function.
118 | hanyuPinyin :: Char -> [([HDZEntry], [Pinyin.Phone])]
119 | hanyuPinyin c = fromMaybe [] (M.lookup c (kHanyuPinyin readings))
120 | 
121 | -- | Japanese kun'yomi pronunciation(s) of this character, in an undefined romanization system (empty when there is no kJapaneseKun entry).
122 | -- You are advised to use kanjidic2 <http://www.csse.monash.edu.au/~jwb/kanjidic2/> in place of this data.
123 | japaneseKun :: Char -> [Text.Text]
124 | japaneseKun c = fromMaybe [] (M.lookup c (kJapaneseKun readings))
125 | 
126 | -- | Japanese on'yomi pronunciation(s) of this character, in an undefined romanization system (empty when there is no kJapaneseOn entry).
127 | -- You are advised to use kanjidic2 <http://www.csse.monash.edu.au/~jwb/kanjidic2/> in place of this data.
128 | japaneseOn :: Char -> [Text.Text]
129 | japaneseOn c = fromMaybe [] (M.lookup c (kJapaneseOn readings))
130 | 
131 | -- | Korean pronunciation(s) of this character in the Yale romanization system (empty when there is no kKorean entry).
132 | korean :: Char -> [KoreanYale.Phone]
133 | korean c = fromMaybe [] (M.lookup c (kKorean readings))
134 | 
135 | -- | The most customary pinyin reading for this character; that is, the reading most commonly used in modern text,
136 | -- with some preference given to readings most likely to be in sorted lists.
137 | --
138 | -- The first component of the pair is preferred for zh-Hans (CN) and the second for
139 | -- zh-Hant (TW). Commonly, they will be exactly the same. 'Nothing' means the character has no kMandarin entry.
140 | --
141 | -- You may want to use 'mandarinBestEffort' instead of this function.
142 | mandarin :: Char -> Maybe (Pinyin.Phone, Pinyin.Phone)
143 | mandarin c = c `M.lookup` kMandarin readings
144 | 
145 | -- | Tang dynasty pronunciation(s) of this character, in an undefined romanization (empty when there is no kTang entry).
146 | tang :: Char -> [(CommonTangCharacter, Text.Text)]
147 | tang c = fromMaybe [] (M.lookup c (kTang readings))
148 | 
149 | -- | Pronunciation(s) of this character in Quốc ngữ (empty when there is no kVietnamese entry).
150 | vietnamese :: Char -> [QuocNgu.Phone]
151 | vietnamese c = fromMaybe [] (M.lookup c (kVietnamese readings))
152 | 
153 | -- | One or more Hànyǔ Pīnyīn readings as given in the Xiàndài Hànyǔ Cídiǎn (empty when there is no kXHC1983 entry).
154 | --
155 | -- You may want to use 'mandarinBestEffort' instead of this function.
156 | xhc1983 :: Char -> [([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])]
157 | xhc1983 c = fromMaybe [] (M.lookup c (kXHC1983 readings))
158 | 
159 | 
160 | data ReadingsMap = RMS -- One map per Unihan reading field, each keyed by the character an entry belongs to
161 |   { kCantonese   :: !(M.Map Char [Jyutping.Phone])
162 |   , kDefinition  :: !(M.Map Char [CharDefinition])
163 |   , kHangul      :: !(M.Map Char [Hangul.Phone])
164 |   , kHanyuPinlu  :: !(M.Map Char [(Pinyin.Phone, OccurrenceCount)])
165 |   , kHanyuPinyin :: !(M.Map Char [([HDZEntry], [Pinyin.Phone])])
166 |   , kJapaneseKun :: !(M.Map Char [Text.Text]) -- Kun and On readings are in mixed
167 |   , kJapaneseOn  :: !(M.Map Char [Text.Text]) -- romanization systems! Worthless...
168 |   , kKorean      :: !(M.Map Char [KoreanYale.Phone])
169 |   , kMandarin    :: !(M.Map Char (Pinyin.Phone, Pinyin.Phone)) -- (zh-Hans reading, zh-Hant reading)
170 |   , kTang        :: !(M.Map Char [(CommonTangCharacter, Text.Text)]) -- Who knows how this is romanized?
171 |   , kVietnamese  :: !(M.Map Char [QuocNgu.Phone])
172 |   , kXHC1983     :: !(M.Map Char [([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])])
173 |   } deriving (Show) -- Useful for debugging in GHCi
174 | 
175 | emptyReadingsMap :: ReadingsMap -- The readings map with no entries in any field
176 | emptyReadingsMap = RMS { kCantonese = M.empty, kDefinition = M.empty, kHangul = M.empty, kHanyuPinlu = M.empty, kHanyuPinyin = M.empty, kJapaneseKun = M.empty, kJapaneseOn = M.empty, kKorean = M.empty, kMandarin = M.empty, kTang = M.empty, kVietnamese = M.empty, kXHC1983 = M.empty }
177 | 
178 | unionReadingsMap :: ReadingsMap -> ReadingsMap -> ReadingsMap -- Merges two readings maps field by field; a character present in the same field of both hits the 'error' below
179 | unionReadingsMap (RMS can1 def1 han1 plu1 pin1 kun1 on1 kor1 man1 tan1 vie1 xhc1) (RMS can2 def2 han2 plu2 pin2 kun2 on2 kor2 man2 tan2 vie2 xhc2)
180 |   = RMS (can1 `merge` can2) (def1 `merge` def2) (han1 `merge` han2) (plu1 `merge` plu2) (pin1 `merge` pin2) (kun1 `merge` kun2)
181 |         (on1  `merge` on2)  (kor1 `merge` kor2) (man1 `merge` man2) (tan1 `merge` tan2) (vie1 `merge` vie2) (xhc1 `merge` xhc2)
182 |   where merge = M.unionWith (error "unionReadingsMap: impossible") -- There is at most one line for each (character, field name) combination
183 | 
184 | {-# NOINLINE contents #-} -- Usual companion to 'unsafePerformIO': stops GHC inlining (and so possibly repeating) the file read
185 | contents :: TextL.Text -- Raw text of the Unihan readings data file
186 | contents = unsafePerformIO $ readUTF8DataFile "data/Unihan/Unihan_Readings.txt"
187 | 
188 | readings :: ReadingsMap -- All readings data, obtained by running 'fileP' over 'contents'
189 | readings = fileP `parseLazy` contents
190 | 
191 | 
192 | fileP :: Parser ReadingsMap -- Parses lines until end of input and merges the per-line maps into one
193 | fileP = fmap mergeAll (manyTill lineP endOfInput) where mergeAll = foldl' unionReadingsMap emptyReadingsMap
194 | 
195 | lineP :: Parser ReadingsMap -- One line of the file; comment and blank lines contribute an empty map
196 | lineP = entryP <|> commentP <|> blankP <?> "line"
197 |   where entryP   = do { c <- charP <* skipSpace; valueP <- readingP c <* skipSpace; valueP <* skipTrueSpace <* lineTerminator } -- character, field name, then that field's value
198 |         commentP = char '#' *> manyTill anyChar lineTerminator *> pure emptyReadingsMap -- '#' and everything up to the line terminator
199 |         blankP   = manyTill skipTrueSpace lineTerminator *> pure emptyReadingsMap -- nothing but what 'skipTrueSpace' accepts before the terminator
200 | 
201 | readingP :: Char -> Parser (Parser ReadingsMap) -- Matches a field name and returns (without running it) the parser for that field's value
202 | readingP c = field "kCantonese"   (\m -> emptyReadingsMap { kCantonese   = m }) (jyutpingP `sepBy1` skipTrueSpace)
203 |          <|> field "kDefinition"  (\m -> emptyReadingsMap { kDefinition  = m }) definitionsP
204 |          <|> field "kHangul"      (\m -> emptyReadingsMap { kHangul      = m }) (hangulP `sepBy1` skipTrueSpace)
205 |          <|> field "kHanyuPinlu"  (\m -> emptyReadingsMap { kHanyuPinlu  = m }) (hanyuPinluP `sepBy1` skipTrueSpace)
206 |          <|> field "kHanyuPinyin" (\m -> emptyReadingsMap { kHanyuPinyin = m }) (hanyuPinyinP `sepBy1` skipTrueSpace)
207 |          <|> field "kJapaneseKun" (\m -> emptyReadingsMap { kJapaneseKun = m }) (takeWhile1 isAsciiUpper `sepBy1` skipTrueSpace)
208 |          <|> field "kJapaneseOn"  (\m -> emptyReadingsMap { kJapaneseOn  = m }) (takeWhile1 isAsciiUpper `sepBy1` skipTrueSpace)
209 |          <|> field "kKorean"      (\m -> emptyReadingsMap { kKorean      = m }) (yaleP `sepBy1` skipTrueSpace)
210 |          <|> field "kMandarin"    (\m -> emptyReadingsMap { kMandarin    = m }) mandarinP
211 |          <|> field "kTang"        (\m -> emptyReadingsMap { kTang        = m }) (tangP `sepBy1` skipTrueSpace)
212 |          <|> field "kVietnamese"  (\m -> emptyReadingsMap { kVietnamese  = m }) (quocNguP `sepBy1` skipTrueSpace)
213 |          <|> field "kXHC1983"     (\m -> emptyReadingsMap { kXHC1983     = m }) (xhc1983P `sepBy1` skipTrueSpace)
214 |   where field name wrap valueP = string name *> pure (fmap (wrap . M.singleton c) valueP) -- the parsed value becomes a single-entry map for c in the chosen field
215 | 
216 | definitionsP :: Parser [CharDefinition] -- Non-empty list of definitions, split at semicolons; a definition stops at ';' or a line break
217 | definitionsP = sepBy1 (takeWhile1 (\ch -> not (ch == '\r' || ch == '\n' || ch == ';'))) (takeWhile1 (== ';') <* skipTrueSpace) -- Separator accepts runs of ';' because the entry for U+4156 mistakenly includes a double ;;
218 | 
219 | hangulP :: Parser Hangul.Phone -- A non-empty run of non-space characters, converted with 'Hangul.fromJamos'
220 | hangulP = fmap Hangul.fromJamos (takeWhile1 (\ch -> not (isSpace ch)))
221 | 
222 | hanyuPinluP :: Parser (Pinyin.Phone, OccurrenceCount) -- A reading immediately followed by its decimal count in parentheses
223 | hanyuPinluP = do { yin <- tonedPinyinP; n <- char '(' *> decimal <* char ')'; pure (yin, n) }
224 | 
225 | mandarinP :: Parser (Pinyin.Phone, Pinyin.Phone) -- One or two readings; a missing second reading defaults to the first
226 | mandarinP = do { simp <- accentedPinyinP; mbTrad <- optional (skipTrueSpace *> accentedPinyinP); pure (simp, fromMaybe simp mbTrad) }
227 | 
228 | accentedPinyinP :: Parser Pinyin.Phone -- Non-empty text up to the next space or comma, converted with 'Pinyin.fromAccented'
229 | accentedPinyinP = fmap Pinyin.fromAccented (takeWhile1 (\ch -> not (isSpace ch || ch == ',')))
230 | 
231 | hanyuPinyinP :: Parser ([HDZEntry], [Pinyin.Phone]) -- Entries, then ':', then readings; both lists are comma-separated and non-empty
232 | hanyuPinyinP = do { locs <- sepBy1 hdzEntryP (char ','); yins <- char ':' *> sepBy1 accentedPinyinP (char ','); pure (locs, yins) }
233 | 
234 | yaleP :: Parser KoreanYale.Phone -- A Yale reading is taken to be a non-empty run of upper-case ASCII letters
235 | yaleP = takeWhile1 yaleLetter where yaleLetter = isAsciiUpper
236 | 
237 | tangP :: Parser (CommonTangCharacter, Text.Text) -- First component is the result of 'canParse' on a leading '*'; second is the following non-space text
238 | tangP = do { starred <- canParse (char '*'); reading <- takeWhile1 (\ch -> not (isSpace ch)); pure (starred, reading) }
239 | 
240 | quocNguP :: Parser QuocNgu.Phone -- Any non-empty run of non-space characters
241 | quocNguP = takeWhile1 (\ch -> not (isSpace ch))
242 | 
243 | xhc1983P :: Parser ([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone]) -- Locations, then ':', then readings; both lists are comma-separated and non-empty
244 | xhc1983P = do { locs <- sepBy1 locP (char ','); yins <- char ':' *> sepBy1 accentedPinyinP (char ','); pure (locs, yins) }
245 |   where locP = do { loc <- hdzEntryP; starred <- canParse (char '*'); pure (loc, starred) } -- an entry plus the 'canParse' result for a trailing '*'
246 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | The library code and data files are all distributed under their own licenses.
  2 | 
  3 | 
  4 | === Code ===
  5 | 
  6 | Copyright (c) 2008, Maximilian Bolingbroke
  7 | All rights reserved.
  8 | 
  9 | Redistribution and use in source and binary forms, with or without modification, are permitted
 10 | provided that the following conditions are met:
 11 | 
 12 |     * Redistributions of source code must retain the above copyright notice, this list of
 13 |       conditions and the following disclaimer.
 14 |     * Redistributions in binary form must reproduce the above copyright notice, this list of
 15 |       conditions and the following disclaimer in the documentation and/or other materials
 16 |       provided with the distribution.
 17 |     * Neither the name of Maximilian Bolingbroke nor the names of other contributors may be used to
 18 |       endorse or promote products derived from this software without specific prior written permission.
 19 | 
 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 21 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 26 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 27 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | === CC-CEDICT ===
 31 | 
 32 | This work is licensed under a Creative Commons Attribution-Share Alike 3.0
 33 | License reproduced from http://creativecommons.org/licenses/by-sa/3.0/ below:
 34 | 
 35 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE
 36 | COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY
 37 | COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS
 38 | AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
 39 | 
 40 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE
 41 | TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY
 42 | BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS
 43 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND
 44 | CONDITIONS.
 45 | 
 46 | 1. Definitions
 47 | 
 48 |  a. "Adaptation" means a work based upon the Work, or upon the Work and
 49 |     other pre-existing works, such as a translation, adaptation,
 50 |     derivative work, arrangement of music or other alterations of a
 51 |     literary or artistic work, or phonogram or performance and includes
 52 |     cinematographic adaptations or any other form in which the Work may be
 53 |     recast, transformed, or adapted including in any form recognizably
 54 |     derived from the original, except that a work that constitutes a
 55 |     Collection will not be considered an Adaptation for the purpose of
 56 |     this License. For the avoidance of doubt, where the Work is a musical
 57 |     work, performance or phonogram, the synchronization of the Work in
 58 |     timed-relation with a moving image ("synching") will be considered an
 59 |     Adaptation for the purpose of this License.
 60 |  b. "Collection" means a collection of literary or artistic works, such as
 61 |     encyclopedias and anthologies, or performances, phonograms or
 62 |     broadcasts, or other works or subject matter other than works listed
 63 |     in Section 1(f) below, which, by reason of the selection and
 64 |     arrangement of their contents, constitute intellectual creations, in
 65 |     which the Work is included in its entirety in unmodified form along
 66 |     with one or more other contributions, each constituting separate and
 67 |     independent works in themselves, which together are assembled into a
 68 |     collective whole. A work that constitutes a Collection will not be
 69 |     considered an Adaptation (as defined below) for the purposes of this
 70 |     License.
 71 |  c. "Creative Commons Compatible License" means a license that is listed
 72 |     at http://creativecommons.org/compatiblelicenses that has been
 73 |     approved by Creative Commons as being essentially equivalent to this
 74 |     License, including, at a minimum, because that license: (i) contains
 75 |     terms that have the same purpose, meaning and effect as the License
 76 |     Elements of this License; and, (ii) explicitly permits the relicensing
 77 |     of adaptations of works made available under that license under this
 78 |     License or a Creative Commons jurisdiction license with the same
 79 |     License Elements as this License.
 80 |  d. "Distribute" means to make available to the public the original and
 81 |     copies of the Work or Adaptation, as appropriate, through sale or
 82 |     other transfer of ownership.
 83 |  e. "License Elements" means the following high-level license attributes
 84 |     as selected by Licensor and indicated in the title of this License:
 85 |     Attribution, ShareAlike.
 86 |  f. "Licensor" means the individual, individuals, entity or entities that
 87 |     offer(s) the Work under the terms of this License.
 88 |  g. "Original Author" means, in the case of a literary or artistic work,
 89 |     the individual, individuals, entity or entities who created the Work
 90 |     or if no individual or entity can be identified, the publisher; and in
 91 |     addition (i) in the case of a performance the actors, singers,
 92 |     musicians, dancers, and other persons who act, sing, deliver, declaim,
 93 |     play in, interpret or otherwise perform literary or artistic works or
 94 |     expressions of folklore; (ii) in the case of a phonogram the producer
 95 |     being the person or legal entity who first fixes the sounds of a
 96 |     performance or other sounds; and, (iii) in the case of broadcasts, the
 97 |     organization that transmits the broadcast.
 98 |  h. "Work" means the literary and/or artistic work offered under the terms
 99 |     of this License including without limitation any production in the
100 |     literary, scientific and artistic domain, whatever may be the mode or
101 |     form of its expression including digital form, such as a book,
102 |     pamphlet and other writing; a lecture, address, sermon or other work
103 |     of the same nature; a dramatic or dramatico-musical work; a
104 |     choreographic work or entertainment in dumb show; a musical
105 |     composition with or without words; a cinematographic work to which are
106 |     assimilated works expressed by a process analogous to cinematography;
107 |     a work of drawing, painting, architecture, sculpture, engraving or
108 |     lithography; a photographic work to which are assimilated works
109 |     expressed by a process analogous to photography; a work of applied
110 |     art; an illustration, map, plan, sketch or three-dimensional work
111 |     relative to geography, topography, architecture or science; a
112 |     performance; a broadcast; a phonogram; a compilation of data to the
113 |     extent it is protected as a copyrightable work; or a work performed by
114 |     a variety or circus performer to the extent it is not otherwise
115 |     considered a literary or artistic work.
116 |  i. "You" means an individual or entity exercising rights under this
117 |     License who has not previously violated the terms of this License with
118 |     respect to the Work, or who has received express permission from the
119 |     Licensor to exercise rights under this License despite a previous
120 |     violation.
121 |  j. "Publicly Perform" means to perform public recitations of the Work and
122 |     to communicate to the public those public recitations, by any means or
123 |     process, including by wire or wireless means or public digital
124 |     performances; to make available to the public Works in such a way that
125 |     members of the public may access these Works from a place and at a
126 |     place individually chosen by them; to perform the Work to the public
127 |     by any means or process and the communication to the public of the
128 |     performances of the Work, including by public digital performance; to
129 |     broadcast and rebroadcast the Work by any means including signs,
130 |     sounds or images.
131 |  k. "Reproduce" means to make copies of the Work by any means including
132 |     without limitation by sound or visual recordings and the right of
133 |     fixation and reproducing fixations of the Work, including storage of a
134 |     protected performance or phonogram in digital form or other electronic
135 |     medium.
136 | 
137 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce,
138 | limit, or restrict any uses free from copyright or rights arising from
139 | limitations or exceptions that are provided for in connection with the
140 | copyright protection under copyright law or other applicable laws.
141 | 
142 | 3. License Grant. Subject to the terms and conditions of this License,
143 | Licensor hereby grants You a worldwide, royalty-free, non-exclusive,
144 | perpetual (for the duration of the applicable copyright) license to
145 | exercise the rights in the Work as stated below:
146 | 
147 |  a. to Reproduce the Work, to incorporate the Work into one or more
148 |     Collections, and to Reproduce the Work as incorporated in the
149 |     Collections;
150 |  b. to create and Reproduce Adaptations provided that any such Adaptation,
151 |     including any translation in any medium, takes reasonable steps to
152 |     clearly label, demarcate or otherwise identify that changes were made
153 |     to the original Work. For example, a translation could be marked "The
154 |     original work was translated from English to Spanish," or a
155 |     modification could indicate "The original work has been modified.";
156 |  c. to Distribute and Publicly Perform the Work including as incorporated
157 |     in Collections; and,
158 |  d. to Distribute and Publicly Perform Adaptations.
159 |  e. For the avoidance of doubt:
160 | 
161 |      i. Non-waivable Compulsory License Schemes. In those jurisdictions in
162 |         which the right to collect royalties through any statutory or
163 |         compulsory licensing scheme cannot be waived, the Licensor
164 |         reserves the exclusive right to collect such royalties for any
165 |         exercise by You of the rights granted under this License;
166 |     ii. Waivable Compulsory License Schemes. In those jurisdictions in
167 |         which the right to collect royalties through any statutory or
168 |         compulsory licensing scheme can be waived, the Licensor waives the
169 |         exclusive right to collect such royalties for any exercise by You
170 |         of the rights granted under this License; and,
171 |    iii. Voluntary License Schemes. The Licensor waives the right to
172 |         collect royalties, whether individually or, in the event that the
173 |         Licensor is a member of a collecting society that administers
174 |         voluntary licensing schemes, via that society, from any exercise
175 |         by You of the rights granted under this License.
176 | 
177 | The above rights may be exercised in all media and formats whether now
178 | known or hereafter devised. The above rights include the right to make
179 | such modifications as are technically necessary to exercise the rights in
180 | other media and formats. Subject to Section 8(f), all rights not expressly
181 | granted by Licensor are hereby reserved.
182 | 
183 | 4. Restrictions. The license granted in Section 3 above is expressly made
184 | subject to and limited by the following restrictions:
185 | 
186 |  a. You may Distribute or Publicly Perform the Work only under the terms
187 |     of this License. You must include a copy of, or the Uniform Resource
188 |     Identifier (URI) for, this License with every copy of the Work You
189 |     Distribute or Publicly Perform. You may not offer or impose any terms
190 |     on the Work that restrict the terms of this License or the ability of
191 |     the recipient of the Work to exercise the rights granted to that
192 |     recipient under the terms of the License. You may not sublicense the
193 |     Work. You must keep intact all notices that refer to this License and
194 |     to the disclaimer of warranties with every copy of the Work You
195 |     Distribute or Publicly Perform. When You Distribute or Publicly
196 |     Perform the Work, You may not impose any effective technological
197 |     measures on the Work that restrict the ability of a recipient of the
198 |     Work from You to exercise the rights granted to that recipient under
199 |     the terms of the License. This Section 4(a) applies to the Work as
200 |     incorporated in a Collection, but this does not require the Collection
201 |     apart from the Work itself to be made subject to the terms of this
202 |     License. If You create a Collection, upon notice from any Licensor You
203 |     must, to the extent practicable, remove from the Collection any credit
204 |     as required by Section 4(c), as requested. If You create an
205 |     Adaptation, upon notice from any Licensor You must, to the extent
206 |     practicable, remove from the Adaptation any credit as required by
207 |     Section 4(c), as requested.
208 |  b. You may Distribute or Publicly Perform an Adaptation only under the
209 |     terms of: (i) this License; (ii) a later version of this License with
210 |     the same License Elements as this License; (iii) a Creative Commons
211 |     jurisdiction license (either this or a later license version) that
212 |     contains the same License Elements as this License (e.g.,
213 |     Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible
214 |     License. If you license the Adaptation under one of the licenses
215 |     mentioned in (iv), you must comply with the terms of that license. If
216 |     you license the Adaptation under the terms of any of the licenses
217 |     mentioned in (i), (ii) or (iii) (the "Applicable License"), you must
218 |     comply with the terms of the Applicable License generally and the
219 |     following provisions: (I) You must include a copy of, or the URI for,
220 |     the Applicable License with every copy of each Adaptation You
221 |     Distribute or Publicly Perform; (II) You may not offer or impose any
222 |     terms on the Adaptation that restrict the terms of the Applicable
223 |     License or the ability of the recipient of the Adaptation to exercise
224 |     the rights granted to that recipient under the terms of the Applicable
225 |     License; (III) You must keep intact all notices that refer to the
226 |     Applicable License and to the disclaimer of warranties with every copy
227 |     of the Work as included in the Adaptation You Distribute or Publicly
228 |     Perform; (IV) when You Distribute or Publicly Perform the Adaptation,
229 |     You may not impose any effective technological measures on the
230 |     Adaptation that restrict the ability of a recipient of the Adaptation
231 |     from You to exercise the rights granted to that recipient under the
232 |     terms of the Applicable License. This Section 4(b) applies to the
233 |     Adaptation as incorporated in a Collection, but this does not require
234 |     the Collection apart from the Adaptation itself to be made subject to
235 |     the terms of the Applicable License.
236 |  c. If You Distribute, or Publicly Perform the Work or any Adaptations or
237 |     Collections, You must, unless a request has been made pursuant to
238 |     Section 4(a), keep intact all copyright notices for the Work and
239 |     provide, reasonable to the medium or means You are utilizing: (i) the
240 |     name of the Original Author (or pseudonym, if applicable) if supplied,
241 |     and/or if the Original Author and/or Licensor designate another party
242 |     or parties (e.g., a sponsor institute, publishing entity, journal) for
243 |     attribution ("Attribution Parties") in Licensor's copyright notice,
244 |     terms of service or by other reasonable means, the name of such party
245 |     or parties; (ii) the title of the Work if supplied; (iii) to the
246 |     extent reasonably practicable, the URI, if any, that Licensor
247 |     specifies to be associated with the Work, unless such URI does not
248 |     refer to the copyright notice or licensing information for the Work;
249 |     and (iv) , consistent with Ssection 3(b), in the case of an
250 |     Adaptation, a credit identifying the use of the Work in the Adaptation
251 |     (e.g., "French translation of the Work by Original Author," or
252 |     "Screenplay based on original Work by Original Author"). The credit
253 |     required by this Section 4(c) may be implemented in any reasonable
254 |     manner; provided, however, that in the case of a Adaptation or
255 |     Collection, at a minimum such credit will appear, if a credit for all
256 |     contributing authors of the Adaptation or Collection appears, then as
257 |     part of these credits and in a manner at least as prominent as the
258 |     credits for the other contributing authors. For the avoidance of
259 |     doubt, You may only use the credit required by this Section for the
260 |     purpose of attribution in the manner set out above and, by exercising
261 |     Your rights under this License, You may not implicitly or explicitly
262 |     assert or imply any connection with, sponsorship or endorsement by the
263 |     Original Author, Licensor and/or Attribution Parties, as appropriate,
264 |     of You or Your use of the Work, without the separate, express prior
265 |     written permission of the Original Author, Licensor and/or Attribution
266 |     Parties.
267 |  d. Except as otherwise agreed in writing by the Licensor or as may be
268 |     otherwise permitted by applicable law, if You Reproduce, Distribute or
269 |     Publicly Perform the Work either by itself or as part of any
270 |     Adaptations or Collections, You must not distort, mutilate, modify or
271 |     take other derogatory action in relation to the Work which would be
272 |     prejudicial to the Original Author's honor or reputation. Licensor
273 |     agrees that in those jurisdictions (e.g. Japan), in which any exercise
274 |     of the right granted in Section 3(b) of this License (the right to
275 |     make Adaptations) would be deemed to be a distortion, mutilation,
276 |     modification or other derogatory action prejudicial to the Original
277 |     Author's honor and reputation, the Licensor will waive or not assert,
278 |     as appropriate, this Section, to the fullest extent permitted by the
279 |     applicable national law, to enable You to reasonably exercise Your
280 |     right under Section 3(b) of this License (right to make Adaptations)
281 |     but not otherwise.
282 | 
283 | 5. Representations, Warranties and Disclaimer
284 | 
285 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR
286 | OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY
287 | KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE,
288 | INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY,
289 | FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF
290 | LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS,
291 | WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION
292 | OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
293 | 
294 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE
295 | LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR
296 | ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES
297 | ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS
298 | BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
299 | 
300 | 7. Termination
301 | 
302 |  a. This License and the rights granted hereunder will terminate
303 |     automatically upon any breach by You of the terms of this License.
304 |     Individuals or entities who have received Adaptations or Collections
305 |     from You under this License, however, will not have their licenses
306 |     terminated provided such individuals or entities remain in full
307 |     compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will
308 |     survive any termination of this License.
309 |  b. Subject to the above terms and conditions, the license granted here is
310 |     perpetual (for the duration of the applicable copyright in the Work).
311 |     Notwithstanding the above, Licensor reserves the right to release the
312 |     Work under different license terms or to stop distributing the Work at
313 |     any time; provided, however that any such election will not serve to
314 |     withdraw this License (or any other license that has been, or is
315 |     required to be, granted under the terms of this License), and this
316 |     License will continue in full force and effect unless terminated as
317 |     stated above.
318 | 
319 | 8. Miscellaneous
320 | 
321 |  a. Each time You Distribute or Publicly Perform the Work or a Collection,
322 |     the Licensor offers to the recipient a license to the Work on the same
323 |     terms and conditions as the license granted to You under this License.
324 |  b. Each time You Distribute or Publicly Perform an Adaptation, Licensor
325 |     offers to the recipient a license to the original Work on the same
326 |     terms and conditions as the license granted to You under this License.
327 |  c. If any provision of this License is invalid or unenforceable under
328 |     applicable law, it shall not affect the validity or enforceability of
329 |     the remainder of the terms of this License, and without further action
330 |     by the parties to this agreement, such provision shall be reformed to
331 |     the minimum extent necessary to make such provision valid and
332 |     enforceable.
333 |  d. No term or provision of this License shall be deemed waived and no
334 |     breach consented to unless such waiver or consent shall be in writing
335 |     and signed by the party to be charged with such waiver or consent.
336 |  e. This License constitutes the entire agreement between the parties with
337 |     respect to the Work licensed here. There are no understandings,
338 |     agreements or representations with respect to the Work not specified
339 |     here. Licensor shall not be bound by any additional provisions that
340 |     may appear in any communication from You. This License may not be
341 |     modified without the mutual written agreement of the Licensor and You.
342 |  f. The rights granted under, and the subject matter referenced, in this
343 |     License were drafted utilizing the terminology of the Berne Convention
344 |     for the Protection of Literary and Artistic Works (as amended on
345 |     September 28, 1979), the Rome Convention of 1961, the WIPO Copyright
346 |     Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996
347 |     and the Universal Copyright Convention (as revised on July 24, 1971).
348 |     These rights and subject matter take effect in the relevant
349 |     jurisdiction in which the License terms are sought to be enforced
350 |     according to the corresponding provisions of the implementation of
351 |     those treaty provisions in the applicable national law. If the
352 |     standard suite of rights granted under applicable copyright law
353 |     includes additional rights not granted under this License, such
354 |     additional rights are deemed to be included in the License; this
355 |     License is not intended to restrict the license of any rights under
356 |     applicable law.
357 | 
358 | 
359 | === Unihan ===
360 | 
361 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
362 | 
363 | Unicode Data Files include all data files under the directories http://www.unicode.org/Public/,
364 | http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/. Unicode Data Files do not
365 | include PDF online code charts under the directory http://www.unicode.org/Public/. Software includes
366 | any source code published in the Unicode Standard or under the directories
367 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
368 | http://www.unicode.org/cldr/data/.
369 | 
370 | NOTICE TO USER: Carefully read the following legal agreement. BY DOWNLOADING, INSTALLING, COPYING OR
371 | OTHERWISE USING UNICODE INC.'S DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU
372 | UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT.
373 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
374 | 
375 | COPYRIGHT AND PERMISSION NOTICE
376 | 
377 | Copyright © 1991-2012 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in
378 | http://www.unicode.org/copyright.html.
379 | 
380 | Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data
381 | files and any associated documentation (the "Data Files") or Unicode software and any associated
382 | documentation (the "Software") to deal in the Data Files or Software without restriction, including
383 | without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies
384 | of the Data Files or Software, and to permit persons to whom the Data Files or Software are
385 | furnished to do so, provided that (a) the above copyright notice(s) and this permission notice
386 | appear with all copies of the Data Files or Software, (b) both the above copyright notice(s) and
387 | this permission notice appear in associated documentation, and (c) there is clear notice in each
388 | modified Data File or in the Software as well as in the documentation associated with the Data
389 | File(s) or Software that the data or software has been modified.
390 | 
391 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
392 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
393 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
394 | THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
395 | DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
396 | NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
397 | THE DATA FILES OR SOFTWARE.
398 | 
399 | Except as contained in this notice, the name of a copyright holder shall not be used in advertising
400 | or otherwise to promote the sale, use or other dealings in these Data Files or Software without
401 | prior written authorization of the copyright holder.
402 | 


--------------------------------------------------------------------------------