├── data
    ├── .gitignore
    └── Unihan
    │   └── Unihan_NumericValues.txt
├── Setup.lhs
├── .gitignore
├── CJK
    ├── Data
    │   ├── QuocNgu.hs
    │   ├── KoreanYale.hs
    │   ├── Hangul.hs
    │   ├── Jyutping.hs
    │   ├── Types.hs
    │   ├── Internal.hs
    │   ├── Unihan
    │   │   ├── NumericValues.hs
    │   │   ├── Variants.hs
    │   │   ├── RadicalStrokeCounts.hs
    │   │   ├── DictionaryLikeData.hs
    │   │   └── Readings.hs
    │   ├── Pinyin.hs
    │   └── CEDICT.hs
    └── Utilities.hs
├── tests
    └── Tests.hs
├── release
├── cjk.cabal
└── LICENSE

/data/.gitignore:
--------------------------------------------------------------------------------
1 | # Packed files
2 | cedict_1_0_ts_utf-8_mdbg.txt.gz
3 | Unihan.zip
4 | 

--------------------------------------------------------------------------------
/Setup.lhs:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env runhaskell
2 | 
3 | > import Distribution.Simple
4 | > main = defaultMain

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Build artifacts
2 | dist/
3 | *.hi
4 | *.o
5 | 
6 | # OS junk
7 | Thumbs.db
8 | .DS_Store
9 | 

--------------------------------------------------------------------------------
/CJK/Data/QuocNgu.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.QuocNgu where
2 | 
3 | import qualified Data.Text as Text
4 | 
5 | 
6 | -- | A phone is currently represented as unstructured text.
7 | --
8 | -- TODO: flesh out this definition
9 | type Phone = Text.Text
10 | 

--------------------------------------------------------------------------------
/CJK/Data/KoreanYale.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.KoreanYale where
2 | 
3 | import qualified Data.Text as Text
4 | 
5 | 
6 | -- | A phone is currently represented as unstructured text.
7 | --
8 | -- TODO: flesh out this definition
9 | type Phone = Text.Text
10 | 

--------------------------------------------------------------------------------
/CJK/Data/Hangul.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.Hangul where
2 | 
3 | import qualified Data.Text as Text
4 | import qualified Data.Text.ICU.Normalize as Text
5 | 
6 | 
7 | -- TODO: flesh out this definition
8 | type Phone = Char
9 | 
10 | -- | Composes a sequence of jamo into the single character it denotes.
11 | --
12 | -- The input is NFC-normalised; anything that does not collapse to exactly
13 | -- one character is rejected with 'error'.
14 | fromJamos :: Text.Text -> Phone
15 | fromJamos jamos
16 |   | [syllable] <- composed = syllable
17 |   | otherwise              = error $ "Certainly non-Korean phone " ++ composed
18 |   where
19 |     composed = Text.unpack (Text.normalize Text.NFC jamos)
20 | 

--------------------------------------------------------------------------------
/CJK/Data/Jyutping.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.Jyutping where
2 | 
3 | import qualified Data.Text as Text
4 | 
5 | 
6 | -- | The six tones, declared in the order of their numbers (see 'toneNumber').
7 | data Tone
8 |     = HighLevel
9 |     | MidRising
10 |     | MidLevel
11 |     | LowFalling
12 |     | LowRising
13 |     | LowLevel
14 |     deriving (Eq, Ord)
15 | 
16 | -- | A tone is shown as its number.
17 | instance Show Tone where
18 |     show t = show (toneNumber t)
19 | 
20 | -- | The number (1 to 6) conventionally written after the syllable.
21 | toneNumber :: Tone -> Int
22 | toneNumber t = case t of
23 |     HighLevel  -> 1
24 |     MidRising  -> 2
25 |     MidLevel   -> 3
26 |     LowFalling -> 4
27 |     LowRising  -> 5
28 |     LowLevel   -> 6
29 | 
30 | 
31 | -- | A syllable: its toneless spelling together with its tone.
32 | data Phone = Phone
33 |     { sound :: Text.Text
34 |     , tone  :: Tone
35 |     }
36 | 
37 | -- | Shown as the spelling immediately followed by the tone number.
38 | instance Show Phone where
39 |     show jyut = concat [Text.unpack (sound jyut), show (tone jyut)]
40 | 

--------------------------------------------------------------------------------
/CJK/Data/Types.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.Types where
2 | 
3 | import qualified Data.Text as Text
4 | 
5 | 
6 | newtype KangXiRadical = KangXi
7 |     { kangXiRadicalNumber :: Int
8 |       -- ^ Radical number in the range 1 to 214 inclusive
9 |     }
10 |     deriving (Show) -- Useful for debugging in GHCi
11 | 
12 | type StrokeCount = Int
13 | 
14 | -- | A radical (of whatever representation @a@) plus the strokes left over once it is removed.
15 | data RadicalStrokeCount a = RSC
16 |     { radical :: a
17 |       -- ^ The radical which is considered to form the main part of the character
18 |     , additionalStrokes :: StrokeCount
19 |       -- ^ The “additional strokes” value is the residual stroke-count, the count of all
20 |       -- strokes remaining after eliminating all strokes associated with the radical.
21 |     }
22 |     deriving (Show) -- Useful for debugging in GHCi
23 | 
24 | -- | Location of the associated information in the《漢語大字典》 Hànyǔ Dà Zìdiǎn
25 | type HDZEntry = Text.Text
26 | 
27 | -- | How to input the character in Cangjie
28 | type CangjieInputCode = Text.Text
29 | 

--------------------------------------------------------------------------------
/tests/Tests.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 | 
3 | import Control.Monad
4 | import Data.Maybe
5 | import System.Exit
6 | 
7 | import qualified CJK.Data.Unihan.DictionaryLikeData as DictionaryLikeData
8 | import qualified CJK.Data.Unihan.NumericValues as NumericValues
9 | import qualified CJK.Data.Unihan.RadicalStrokeCounts as RadicalStrokeCounts
10 | import qualified CJK.Data.Unihan.Readings as Readings
11 | import qualified CJK.Data.Unihan.Variants as Variants
12 | import qualified CJK.Data.CEDICT as CEDICT
13 | 
14 | 
15 | -- Just check a single data point from each module.
As long as the data parses
16 | -- successfully there is a 90% chance everything is working
17 | main :: IO ()
18 | main = do
19 |     checkNot "DictionaryLikeData"  $ isNothing $ DictionaryLikeData.cangjie '好'
20 |     checkNot "NumericValues"       $ isNothing $ NumericValues.numericValue '十'
21 |     checkNot "RadicalStrokeCounts" $ null (RadicalStrokeCounts.unicode '好')
22 |     checkNot "Readings"            $ null (Readings.mandarinBestEffort '好')
23 |     checkNot "Variants"            $ null (Variants.traditionalVariants '电')
24 |     checkNot "CEDICT"              $ null CEDICT.entries
25 | 
26 | -- | @checkNot name failed@ prints @name@ followed by the outcome, and exits the
27 | -- whole program with status 1 as soon as @failed@ is 'True'.
28 | checkNot :: String -> Bool -> IO ()
29 | checkNot msg p = do
30 |     putStr (msg ++ ": ")
31 |     if p then putStrLn "failure" >> exitWith (ExitFailure 1)
32 |          else putStrLn "success"

--------------------------------------------------------------------------------
/CJK/Data/Internal.hs:
--------------------------------------------------------------------------------
1 | module CJK.Data.Internal where
2 | 
3 | import qualified CJK.Data.Jyutping as Jyutping
4 | import qualified CJK.Data.Pinyin as Pinyin
5 | import CJK.Data.Types
6 | 
7 | import Control.Applicative
8 | 
9 | import Data.Char
10 | import Data.Attoparsec.Text
11 | 
12 | 
13 | -- | Parses a Jyutping syllable: a non-empty run of ASCII letters or square
14 | -- brackets, followed by a tone digit.
15 | jyutpingP :: Parser Jyutping.Phone
16 | jyutpingP = liftA2 Jyutping.Phone (takeWhile1 isSoundChar) jyutpingToneP
17 |   where
18 |     -- Brackets are accepted because some kCheungBauer says [ng]ai1
19 |     isSoundChar c = isAsciiUpper c || isAsciiLower c || c == '[' || c == ']'
20 | 
21 | -- | Parses a single tone digit, @1@ to @6@.
22 | jyutpingToneP :: Parser Jyutping.Tone
23 | jyutpingToneP = Jyutping.HighLevel  <$ char '1'
24 |             <|> Jyutping.MidRising  <$ char '2'
25 |             <|> Jyutping.MidLevel   <$ char '3'
26 |             <|> Jyutping.LowFalling <$ char '4'
27 |             <|> Jyutping.LowRising  <$ char '5'
28 |             <|> Jyutping.LowLevel   <$ char '6'
29 | 
30 | 
31 | -- | Parses a numbered Pinyin syllable: a non-empty run of ASCII letters or
32 | -- @ü@/@Ü@, followed by a tone digit.
33 | tonedPinyinP :: Parser Pinyin.Phone
34 | tonedPinyinP = liftA2 Pinyin.Phone (takeWhile1 isSoundChar) pinyinToneP
35 |   where
36 |     isSoundChar c = isAsciiUpper c || isAsciiLower c || c == 'ü' || c == 'Ü'
37 | 
38 | -- | Parses a single tone digit, @1@ to @5@ (@5@ being the neutral tone).
39 | pinyinToneP :: Parser Pinyin.Tone
40 | pinyinToneP = Pinyin.Flat          <$ char '1'
41 |           <|> Pinyin.Rising        <$ char '2'
42 |           <|> Pinyin.FallingRising <$ char '3'
43 |           <|> Pinyin.Falling       <$ char '4'
44 |           <|> Pinyin.Neutral       <$ char '5'
45 | 
46 | -- | Parses a Hànyǔ Dà Zìdiǎn location: a non-empty run of digits and dots.
47 | hdzEntryP :: Parser HDZEntry
48 | hdzEntryP = takeWhile1 (\c -> isDigit c || c == '.')
49 | 

--------------------------------------------------------------------------------
/release:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Interactive release helper: builds the sdist, tags the repository with the
4 | # version found in the tarball name, then uploads the tarball to Hackage.
5 | 
6 | echo "Have you updated the version number? Type 'yes' if you have!"
7 | read -r version_response
8 | 
9 | if [ "$version_response" != "yes" ]; then
10 |     echo "Go and update the version number"
11 |     exit 1
12 | fi
13 | 
14 | if ! sdist_output=$(runghc Setup.lhs sdist); then
15 |     echo "Cabal sdist failed, aborting"
16 |     exit 1
17 | fi
18 | 
19 | # Want to find a line like:
20 | #  Source tarball created: dist/ansi-terminal-0.1.tar.gz
21 | 
22 | # Test this with:
23 | #  runghc Setup.lhs sdist | sed -n ...
24 | # -n together with the p flag prints only the line where the substitution
25 | # succeeded, so an empty result means the line was not found.
26 | filename=$(echo "$sdist_output" | sed -n 's/.*Source tarball created: \([^ ]*\).*/\1/p')
27 | echo "Filename: $filename"
28 | 
29 | if [ -z "$filename" ]; then
30 |     echo "Could not find filename, aborting"
31 |     exit 1
32 | fi
33 | 
34 | # Test this with:
35 | #  echo dist/ansi-terminal-0.1.tar.gz | sed -n ...
36 | version=$(echo "$filename" | sed -n 's/^[^0-9]*\([0-9.]*\)\.tar\.gz$/\1/p')
37 | echo "Version: $version"
38 | 
39 | if [ -z "$version" ]; then
40 |     echo "Could not find version, aborting"
41 |     exit 1
42 | fi
43 | 
44 | echo "This is your last chance to abort! I'm going to upload in 10 seconds"
45 | sleep 10
46 | 
47 | if ! git tag "v$version"; then
48 |     echo "Git tag failed, aborting"
49 |     exit 1
50 | fi
51 | 
52 | # You need to have stored your Hackage username and password as directed by cabal upload
53 | # I use -v2 because otherwise the error messages can be cryptic :-)
54 | if ! cabal upload -v2 "$filename"; then
55 |     echo "Hackage upload failed, aborting"
56 |     exit 1
57 | fi
58 | 
59 | # Success!
60 | exit 0
61 | 

--------------------------------------------------------------------------------
/CJK/Utilities.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE CPP #-}
2 | module CJK.Utilities where
3 | 
4 | import Control.Applicative
5 | 
6 | import Data.Char
7 | import qualified Data.ByteString.Lazy as BS
8 | 
9 | import qualified Data.Text as Text
10 | import qualified Data.Text.Lazy as TextL
11 | import qualified Data.Text.Lazy.Encoding as TextL
12 | 
13 | import qualified Data.Attoparsec.Text as DAT
14 | import qualified Data.Attoparsec.Text.Lazy as DATL
15 | 
16 | import Paths_cjk
17 | 
18 | 
19 | -- | Reads one of the package's data files (path relative to the package data
20 | -- directory) as lazily-decoded UTF-8 text.
21 | readUTF8DataFile :: FilePath -> IO TextL.Text
22 | readUTF8DataFile fp = do
23 |     full_fp <- getDataFileName fp
24 |     readUTF8File full_fp
25 | 
26 | -- | Reads a file lazily and decodes it as UTF-8.
27 | readUTF8File :: FilePath -> IO TextL.Text
28 | readUTF8File = fmap TextL.decodeUtf8 . BS.readFile
29 | 
30 | -- | Runs a parser over lazy text, calling 'error' (prefixed with
31 | -- @parseLazy: @) if the parse fails.
32 | parseLazy :: DAT.Parser a -> TextL.Text -> a
33 | #ifndef MIN_VERSION_attoparsec
34 | #define MIN_VERSION_attoparsec(A,B,C) 0
35 | #endif
36 | #if MIN_VERSION_attoparsec(0,10,4)
37 | -- Lazy parse is bugged in attoparsec-0.10.3 and below (see ticket #10/#21),
38 | -- so it is only used from 0.10.4 onwards
39 | parseLazy p s = case DATL.eitherResult (DATL.parse p s) of
40 |     Left err -> error $ "parseLazy: " ++ err
41 |     Right x  -> x
42 | #else
43 | -- Fallback: force the whole input to strict text and parse that
44 | parseLazy p s = case DAT.parseOnly p (TextL.toStrict s) of
45 |     Left err -> error $ "parseLazy: " ++ err
46 |     Right x  -> x
47 | #endif
48 | 
49 | 
50 | -- | Parses a code point written as @U+@ followed by hexadecimal digits.
51 | charP :: DAT.Parser Char
52 | charP = fmap chr $ DAT.string (Text.pack "U+") *> DAT.hexadecimal
53 | 
54 | -- | Runs the parser and reports whether it succeeded, discarding its result.
55 | canParse :: DAT.Parser a -> DAT.Parser Bool
56 | canParse p = p *> pure True <|> pure False
57 | 
58 | -- | Succeeds at an end of line or at the end of the input.
59 | lineTerminator :: DAT.Parser ()
60 | lineTerminator = DAT.endOfLine <|> DAT.endOfInput
61 | 
62 | -- | Skips whitespace without crossing a line boundary. Always succeeds, even
63 | -- when there is nothing to skip.
64 | skipTrueSpace :: DAT.Parser ()
65 | skipTrueSpace = DAT.skipWhile isTrueSpace
66 | 
67 | -- | Whitespace other than carriage return and line feed.
68 | isTrueSpace :: Char -> Bool
69 | isTrueSpace c = isSpace c && c /= '\r' && c /= '\n'
70 | 
71 | 
72 | -- | Four-argument analogue of 'liftA3'.
73 | liftA4 :: Applicative t => (a -> b -> c -> d -> e) -> t a -> t b -> t c -> t d -> t e
74 | liftA4 f ma mb mc md = f <$> ma <*> mb <*> mc <*> md
75 | 

--------------------------------------------------------------------------------
/cjk.cabal:
--------------------------------------------------------------------------------
1 | name:                cjk
2 | version:             0.1.0.1
3 | synopsis:            Data about Chinese, Japanese and Korean characters and languages
4 | description:         A Haskell interface to the most important information from the Unicode Unihan character
5 |                      database and CC-CEDICT free Chinese-English dictionary.
6 |                      .
7 |                      Contributions of data from more sources are very welcome!
8 | homepage:            http://github.com/batterseapower/cjk
9 | license:             BSD3
10 | license-file:        LICENSE
11 | author:              Max Bolingbroke
12 | maintainer:          Max Bolingbroke
13 | category:            Text
14 | build-type:          Simple
15 | cabal-version:       >=1.8
16 | data-files:          data/cedict_1_0_ts_utf-8_mdbg.txt
17 |                      data/Unihan/*.txt
18 | 
19 | library
20 |   exposed-modules:     CJK.Data.Unihan.DictionaryLikeData
21 |                        CJK.Data.Unihan.NumericValues
22 |                        CJK.Data.Unihan.RadicalStrokeCounts
23 |                        CJK.Data.Unihan.Readings
24 |                        CJK.Data.Unihan.Variants
25 |                        CJK.Data.CEDICT
26 |                        CJK.Data.Hangul
27 |                        CJK.Data.Jyutping
28 |                        CJK.Data.KoreanYale
29 |                        CJK.Data.Pinyin
30 |                        CJK.Data.QuocNgu
31 |                        CJK.Data.Types
32 |   other-modules:       CJK.Data.Internal
33 |                        CJK.Utilities
34 |                        -- I don't think I should have to put this here, but if I don't then any executables
35 |                        -- linking against the cjk library will fail to find symbols exported by this module
36 |                        Paths_cjk
37 |   build-depends:       base >=4.5 && < 5
38 |                      , containers >=0.4.2
39 |                      , bytestring >=0.9
40 |                      , text >=0.11
41 |                      , text-icu >=0.6.3.5
42 |                      , attoparsec >=0.10.3
43 | 
44 | test-suite tests
45 |   type:           exitcode-stdio-1.0
46 |   hs-source-dirs: tests
47 |   main-is:        Tests.hs
48 |   build-depends:  base
49 |                 , cjk
50 | 
51 | source-repository head
52 |   type:     git
53 |   location: https://github.com/batterseapower/cjk.git
54 | 

--------------------------------------------------------------------------------
/CJK/Data/Unihan/NumericValues.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | module CJK.Data.Unihan.NumericValues (
3 |     NumericUse(..), numericValue
4 |   ) where
5 | 
6 | import CJK.Utilities
7 | 
8 | import Control.Applicative
9 | 
10 | import qualified Data.Text as Text
11 | import qualified Data.Text.Lazy as TextL
12 | import Data.Attoparsec.Text
13 | 
14 | import Data.Char
15 | import Data.Maybe
16 | import qualified Data.Map as M
17 | import Data.List
18 | 
19 | import System.IO.Unsafe
20 | 
21 | -- | The context in which a character is used as a numeral
22 | data NumericUse = AccountingUse -- ^ Used in the writing of accounting numerals (to prevent fraud)
23 |                 | OtherUse      -- ^ Used in certain unusual, specialized contexts
24 |                 | PrimaryUse    -- ^ Used in the writing of numbers in the standard fashion
25 |                 deriving (Eq, Show)
26 | 
27 | -- | The value of the character and the contexts in which it is used
28 | numericValue :: Char -> Maybe (NumericUse, Integer)
29 | numericValue c = M.lookup c numericValues
30 | 
31 | -- | Numeric data for every character listed in the data file
32 | type NumericValuesMap = M.Map Char (NumericUse, Integer)
33 | 
34 | emptyNumericValuesMap :: NumericValuesMap
35 | emptyNumericValuesMap = M.empty
36 | 
37 | unionNumericValuesMap :: NumericValuesMap -> NumericValuesMap -> NumericValuesMap
38 | unionNumericValuesMap = M.unionWith (error "unionNumericValuesMap: impossible") -- NOTE: the map is keyed by character alone, so this assumes no character carries more than one numeric field (true of the bundled data file)
39 | 
40 | {-# NOINLINE contents #-}
41 | contents :: TextL.Text
42 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_NumericValues.txt")
43 | 
44 | numericValues :: NumericValuesMap
45 | numericValues = parseLazy fileP contents
46 | 
47 | -- | Parses every line up to the end of input and merges the per-line maps
48 | fileP :: Parser NumericValuesMap
49 | fileP = fmap (foldl' unionNumericValuesMap emptyNumericValuesMap) (lineP `manyTill` endOfInput)
50 | 
51 | -- | Parses one line of the data file, which is one of:
52 | --
53 | --  * a data line: code point, field name, value;
54 | --  * a comment line starting with @#@;
55 | --  * a line containing only whitespace.
56 | --
57 | -- Any other line is a parse error.
58 | lineP :: Parser NumericValuesMap
59 | lineP = do { c <- charP <* skipSpace; dataP <- numericValueP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator }
60 |     <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyNumericValuesMap
61 |     -- 'skipTrueSpace' succeeds without consuming input, so it must not be
62 |     -- repeated with 'manyTill': on a malformed line that would never terminate
63 |     <|> skipTrueSpace *> lineTerminator *> pure emptyNumericValuesMap
64 |     <?> "line"
65 | 
66 | -- | Parses the field name and returns the parser for that field's value.
67 | --
68 | -- The value parser is returned rather than run so that the caller can skip
69 | -- the whitespace separating the field name from the value.
70 | numericValueP :: Char -> Parser (Parser NumericValuesMap)
71 | numericValueP c = string "kAccountingNumeric" *> pure (liftA (mk AccountingUse) decimal)
72 |               <|> string "kOtherNumeric"      *> pure (liftA (mk OtherUse)      decimal)
73 |               <|> string "kPrimaryNumeric"    *> pure (liftA (mk PrimaryUse)    decimal)
74 |   where mk use x = M.singleton c (use, x)

--------------------------------------------------------------------------------
/data/Unihan/Unihan_NumericValues.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Unihan_NumericValues.txt
3 | # Date: 2012-08-17 17:13:30 GMT [JHJ]
4 | # Unicode version: 6.2.0
5 | #
6 | # Unicode Character Database
7 | # Copyright (c) 1991-2012 Unicode, Inc.
8 | # For terms of use, see http://www.unicode.org/terms_of_use.html
9 | # For documentation, see http://www.unicode.org/reports/tr44/
10 | #
11 | # This file contains data on the following fields from the Unihan database:
12 | # kAccountingNumeric
13 | # kOtherNumeric
14 | # kPrimaryNumeric
15 | #
16 | # For details on the file format, see http://www.unicode.org/reports/tr38/
17 | #
18 | U+3405 kOtherNumeric 5
19 | U+3483 kOtherNumeric 2
20 | U+382A kOtherNumeric 5
21 | U+3B4D kOtherNumeric 7
22 | U+4E00 kPrimaryNumeric 1
23 | U+4E03 kPrimaryNumeric 7
24 | U+4E07 kPrimaryNumeric 10000
25 | U+4E09 kPrimaryNumeric 3
26 | U+4E5D kPrimaryNumeric 9
27 | U+4E8C kPrimaryNumeric 2
28 | U+4E94 kPrimaryNumeric 5
29 | U+4E96 kOtherNumeric 4
30 | U+4EBF kPrimaryNumeric 100000000
31 | U+4EC0 kOtherNumeric 10
32 | U+4EDF kAccountingNumeric 1000
33 | U+4EE8 kOtherNumeric 3
34 | U+4F0D kAccountingNumeric 5
35 | U+4F70 kAccountingNumeric 100
36 | U+5104 kPrimaryNumeric 100000000
37 | U+5146 kPrimaryNumeric 1000000000000
38 | U+5169 kOtherNumeric 2
39 | U+516B kPrimaryNumeric 8
40 | U+516D kPrimaryNumeric 6
41 | U+5341 kPrimaryNumeric 10
42 | U+5343 kPrimaryNumeric 1000
43 | U+5344 kOtherNumeric 20
44 | U+5345 kOtherNumeric 30
45 | U+534C kOtherNumeric 40
46 | U+53C1 kAccountingNumeric 3
47 | U+53C2 kAccountingNumeric 3
48 | U+53C3 kAccountingNumeric 3
49 | U+53C4 kOtherNumeric 3
50 | U+56DB kPrimaryNumeric 4
51 | U+58F1 kAccountingNumeric 1
52 | U+58F9 kAccountingNumeric 1
53 | U+5E7A kOtherNumeric 1
54 | U+5EFE kOtherNumeric 9
55 | U+5EFF kOtherNumeric 20
56 | U+5F0C kAccountingNumeric 1
57 | U+5F0D kAccountingNumeric 2
58 | U+5F0E kAccountingNumeric 3
59 | U+5F10 kAccountingNumeric 2
60 | U+62FE kAccountingNumeric 10
61 | U+634C kAccountingNumeric 8
62 | U+67D2 kAccountingNumeric 7
63 | U+6F06 kAccountingNumeric 7
64 | U+7396 kAccountingNumeric 9
65 | U+767E kPrimaryNumeric 100
66 | U+8086 kAccountingNumeric 4
67 | U+842C kAccountingNumeric 10000
68 | U+8CAE kAccountingNumeric 2
69 | U+8CB3 kAccountingNumeric 2
70 | U+8D30 kAccountingNumeric 2
71 | U+9621 kAccountingNumeric 1000
72 | U+9646 kAccountingNumeric 6
73 | U+964C kAccountingNumeric 100
74 | U+9678 kAccountingNumeric 6
75 | U+96F6 kPrimaryNumeric 0
76 | U+20001 kOtherNumeric 7
77 | U+20064 kOtherNumeric 4
78 | U+200E2 kOtherNumeric 4
79 | U+20121 kOtherNumeric 5
80 | U+2092A kOtherNumeric 1
81 | U+20983 kOtherNumeric 30
82 | U+2098C kOtherNumeric 40
83 | U+2099C kOtherNumeric 40
84 | U+20AEA kOtherNumeric 6
85 | U+20AFD kOtherNumeric 3
86 | U+20B19 kOtherNumeric 3
87 | U+22390 kOtherNumeric 2
88 | U+22998 kOtherNumeric 3
89 | U+23B1B kOtherNumeric 3
90 | U+2626D kOtherNumeric 4
91 | 
92 | # EOF
93 | 

--------------------------------------------------------------------------------
/CJK/Data/Pinyin.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns, PatternGuards #-}
2 | module CJK.Data.Pinyin where
3 | 
4 | import qualified Data.Text as Text
5 | import qualified Data.Text.ICU.Normalize as Text
6 | import Data.Maybe
7 | 
8 | -- | The tone of a Pinyin syllable, declared in the order of the tone numbers
9 | data Tone = Flat          -- ^ Tone 1
10 |           | Rising        -- ^ Tone 2
11 |           | FallingRising -- ^ Tone 3
12 |           | Falling       -- ^ Tone 4
13 |           | Neutral       -- ^ Tone 5; the only tone written without an accent
14 |           deriving (Eq, Ord)
15 | 
16 | instance Show Tone where
17 |     show = show . toneNumber -- a tone is shown as its number
18 | 
19 | toneNumber :: Tone -> Int
20 | toneNumber Flat = 1
21 | toneNumber Rising = 2
22 | toneNumber FallingRising = 3
23 | toneNumber Falling = 4
24 | toneNumber Neutral = 5
25 | 
26 | -- | Returns the Unicode combining character used to produce the accent for this tone. Returns Nothing if no accent is required.
27 | toneCombiningMark :: Tone -> Maybe Char
28 | toneCombiningMark Flat = Just '\x304'          -- U+0304 combining macron
29 | toneCombiningMark Rising = Just '\x301'        -- U+0301 combining acute accent
30 | toneCombiningMark FallingRising = Just '\x30C' -- U+030C combining caron
31 | toneCombiningMark Falling = Just '\x300'       -- U+0300 combining grave accent
32 | toneCombiningMark Neutral = Nothing
33 | 
34 | -- | Returns the tone associated with this Unicode combining character, if any.
35 | combiningMarkTone :: Char -> Maybe Tone
36 | combiningMarkTone '\x304' = Just Flat
37 | combiningMarkTone '\x301' = Just Rising
38 | combiningMarkTone '\x30C' = Just FallingRising
39 | combiningMarkTone '\x300' = Just Falling
40 | combiningMarkTone _       = Nothing
41 | 
42 | 
43 | -- | A syllable: its toneless spelling together with its tone.
44 | data Phone = Phone {
45 |     sound :: Text.Text,
46 |     tone :: Tone
47 |   }
48 | 
49 | -- | Shown in numbered form: the spelling immediately followed by the tone number.
50 | instance Show Phone where
51 |     show yin = Text.unpack (sound yin) ++ show (tone yin)
52 | 
53 | -- | Splits accented Pinyin into its toneless spelling and its tone.
54 | --
55 | -- The input is decomposed (NFD) and every combining mark that denotes a tone
56 | -- is removed from the spelling. A syllable without any tone mark is 'Neutral'.
57 | -- Calls 'error' if two different tone marks are present.
58 | fromAccented :: Text.Text -> Phone
59 | fromAccented s = go [] Nothing $ Text.unpack $ Text.normalize Text.NFD s
60 |   where
61 |     -- tser accumulates the toneless spelling in reverse
62 |     go !tser !mb_tone []       = Phone { sound = Text.pack (reverse tser), tone = fromMaybe Neutral mb_tone }
63 |     go !tser !mb_tone (c:rest) = case combiningMarkTone c of
64 |         Just new -> go tser (merge mb_tone new) rest
65 |         Nothing  -> go (c:tser) mb_tone rest
66 | 
67 |     merge (Just old) new
68 |       | old /= new = error $ "Conflicting tones " ++ show old ++ " and " ++ show new ++ " in " ++ Text.unpack s
69 |     merge _ new = Just new -- Allow multiple tones of the same type, even if it is technically incorrect
70 | 
71 | -- | Renders a syllable with its tone written as an accent on one of the vowels.
72 | --
73 | -- The accent is placed within the first run of vowels according to these rules:
74 | --
75 | --  1. If there is only one vowel, it takes the diacritic.
76 | --  2. If there is more than one vowel, then the first of {a}, {e}, or {o} takes
77 | --     the diacritic (this covers the cluster {ao}, where {a} takes it).
78 | --  3. If the vowel cluster is {iu} or {ui}, the last letter takes the diacritic.
79 | --
80 | -- Failing all of those the first vowel takes it. The spelling is composed (NFC)
81 | -- first and @ü@ (and @ê@) count as vowels: otherwise the tone of syllables such
82 | -- as @lü@ would be dropped, or the accent would be ordered before the diaeresis.
83 | toAccented :: Phone -> Text.Text
84 | toAccented yin = Text.normalize Text.NFC $ Text.pack $ go $ Text.unpack $ Text.normalize Text.NFC $ sound yin
85 |   where
86 |     go cs = case span isVowel cs of
87 |         ([],  [])     -> [] -- All pinyin contain a vowel, so this can only happen when the pinyin is in fact invalid
88 |         ([],  c:rest) -> c : go rest
89 |         (vws, rest)   -> go' vws ++ rest
90 | 
91 |     go' :: String -> String
92 |     go' [] = [] -- never reached: go only passes non-empty vowel runs
93 |     -- 1. If there is only one vowel, it takes the diacritic.
94 |     go' [vw] = vw : mark
95 |     -- 2. If there is more than one vowel, then the vowels {a}, {e}, or {o} take the diacritic.
96 |     go' vws | (before, vw:after) <- break isOpenVowel vws = before ++ (vw : mark ++ after)
97 |     -- 3. If the vowel cluster is {iu} or {ui}, the last letter takes the diacritic.
98 |     go' [vw1, vw2] | (isI vw1 && isU vw2) || (isU vw1 && isI vw2) = [vw1, vw2] ++ mark
99 |     -- Default to just after the first vowel
100 |     go' (vw:vws) = vw : mark ++ vws
101 | 
102 |     isA, isE, isI, isO, isU, isUmlautU, isOpenVowel, isVowel :: Char -> Bool
103 |     isA       = (`elem` "aA")
104 |     isE       = (`elem` "eEêÊ")
105 |     isI       = (`elem` "iI")
106 |     isO       = (`elem` "oO")
107 |     isU       = (`elem` "uU")
108 |     isUmlautU = (`elem` "üÜ")
109 | 
110 |     isOpenVowel c = isA c || isE c || isO c
111 |     isVowel c     = isOpenVowel c || isI c || isU c || isUmlautU c
112 | 
113 |     -- Empty for the neutral tone, which is written without an accent
114 |     mark = maybeToList (toneCombiningMark (tone yin))
115 | 

--------------------------------------------------------------------------------
/CJK/Data/Unihan/Variants.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | module CJK.Data.Unihan.Variants (
3 |     SemanticVariantType(..),
4 |     VariantSource, VariantCitation, Variant,
5 |     compatibilityVariants, zVariants,
6 |     semanticVariants, specializedSemanticVariants,
7 |     simplifiedVariants, traditionalVariants
8 |   ) where
9 | 
10 | import CJK.Utilities
11 | 
12 | import Control.Applicative
13 | 
14 | import qualified Data.Text as Text
15 | import qualified Data.Text.Lazy as TextL
16 | import Data.Attoparsec.Text
17 | 
18 | import Data.Char
19 | import Data.Maybe
20 | import qualified Data.Map as M
21 | import Data.List
22 | 
23 | import System.IO.Unsafe
24 | 
25 | -- | What a cited source says about the relationship between a character and its variant
26 | data SemanticVariantType = T -- ^ T for tòng, U+540C 同. The indicated source explicitly indicates the two are the same (e.g., by saying that the one character is “the same as” the other).
27 |                          | B -- ^ B for bù, U+4E0D 不. The source explicitly indicates that the two are used improperly one for the other.
28 |                          | Z -- ^ Z for zhèng, U+6B63 正. The source explicitly indicates that the given character is the preferred form
29 |                          | F -- ^ F for fán, U+7E41 繁. The source explicitly indicates that the given character is the traditional form.
30 |                          | J -- ^ J for jiǎn U+7C21 簡/U+7B80 简. The source explicitly indicates that the given character is the simplified form.
31 |                          deriving (Eq, Ord, Show)
32 | 
33 | type VariantSource = Text.Text
34 | type VariantCitation = [(VariantSource, [SemanticVariantType])]
35 | 
36 | type Variant = (Char, VariantCitation)
37 | 
38 | -- | The compatibility decomposition for this ideograph
39 | compatibilityVariants :: Char -> [Char]
40 | compatibilityVariants c = case variants of VMS mp _ _ _ _ _ -> M.findWithDefault [] c mp
41 | 
42 | -- | A semantic variant is an x- or y-variant with similar or identical meaning which can generally be used in place of the indicated character
43 | semanticVariants :: Char -> [Variant]
44 | semanticVariants c = case variants of VMS _ mp _ _ _ _ -> M.findWithDefault [] c mp
45 | 
46 | -- | Simplified Chinese variant(s) for this character
47 | simplifiedVariants :: Char -> [Char]
48 | simplifiedVariants c = case variants of VMS _ _ mp _ _ _ -> M.findWithDefault [] c mp
49 | 
50 | -- | A specialized semantic variant is an x- or y-variant with similar or identical meaning only in certain contexts (such as accountants’ numerals)
51 | specializedSemanticVariants :: Char -> [Variant]
52 | specializedSemanticVariants c = case variants of VMS _ _ _ mp _ _ -> M.findWithDefault [] c mp
53 | 
54 | -- | Traditional Chinese variant(s) for this character
55 | traditionalVariants :: Char -> [Char]
56 | traditionalVariants c = case variants of VMS _ _ _ _ mp _ -> M.findWithDefault [] c mp
57 | 
58 | -- | The z-variant(s) for this character
59 | zVariants :: Char -> [Variant]
60 | zVariants c = case variants of VMS _ _ _ _ _ mp -> M.findWithDefault [] c mp
61 | 
62 | 
63 | type VariantMap = M.Map Char [Char]
64 | type CitedVariantMap = M.Map Char [Variant]
65 | 
66 | -- | One map per field, in this order: kCompatibilityVariant, kSemanticVariant,
67 | -- kSimplifiedVariant, kSpecializedSemanticVariant, kTraditionalVariant, kZVariant.
68 | data VariantsMap = VMS !VariantMap !CitedVariantMap !VariantMap !CitedVariantMap !VariantMap !CitedVariantMap
69 |                  deriving (Show) -- Useful for debugging in GHCi
70 | 
71 | emptyVariantsMap :: VariantsMap
72 | emptyVariantsMap = VMS M.empty M.empty M.empty M.empty M.empty M.empty
73 | 
74 | -- | Field-wise union; variants recorded for the same character are concatenated.
75 | unionVariantsMap :: VariantsMap -> VariantsMap -> VariantsMap
76 | unionVariantsMap (VMS a1 a2 a3 a4 a5 a6) (VMS b1 b2 b3 b4 b5 b6)
77 |   = VMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4 b4) (plus a5 b5) (plus a6 b6)
78 |   where plus x y = M.unionWith (++) x y
79 | 
80 | 
81 | {-# NOINLINE contents #-}
82 | contents :: TextL.Text
83 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_Variants.txt")
84 | 
85 | variants :: VariantsMap
86 | variants = parseLazy fileP contents
87 | 
88 | 
89 | -- | Parses every line up to the end of input and merges the per-line maps.
90 | fileP :: Parser VariantsMap
91 | fileP = fmap (foldl' unionVariantsMap emptyVariantsMap) (lineP `manyTill` endOfInput)
92 | 
93 | -- | Parses one line of the data file: a data line (code point, field name,
94 | -- values), a comment line starting with @#@, or a line containing only
95 | -- whitespace. Any other line is a parse error.
96 | lineP :: Parser VariantsMap
97 | lineP = do { c <- charP <* skipSpace; dataP <- variantP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator }
98 |     <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyVariantsMap
99 |     -- 'skipTrueSpace' succeeds without consuming input, so it must not be
100 |     -- repeated with 'manyTill': on a malformed line that would never terminate
101 |     <|> skipTrueSpace *> lineTerminator *> pure emptyVariantsMap
102 |     <?> "line"
103 | 
104 | -- | Parses the field name and returns the parser for that field's values, so
105 | -- that the caller can skip the whitespace in between.
106 | variantP :: Char -> Parser (Parser VariantsMap)
107 | variantP c = string "kCompatibilityVariant"       *> pure (liftA (\x -> VMS (mk x) M.empty M.empty M.empty M.empty M.empty) charsP)
108 |          <|> string "kSemanticVariant"            *> pure (liftA (\x -> VMS M.empty (mk x) M.empty M.empty M.empty M.empty) variantsP)
109 |          <|> string "kSimplifiedVariant"          *> pure (liftA (\x -> VMS M.empty M.empty (mk x) M.empty M.empty M.empty) charsP)
110 |          <|> string "kSpecializedSemanticVariant" *> pure (liftA (\x -> VMS M.empty M.empty M.empty (mk x) M.empty M.empty) variantsP)
111 |          <|> string "kTraditionalVariant"         *> pure (liftA (\x -> VMS M.empty M.empty M.empty M.empty (mk x) M.empty) charsP)
112 |          <|> string "kZVariant"                   *> pure (liftA (\x -> VMS M.empty M.empty M.empty M.empty M.empty (mk x)) variantsP)
113 |          <?> "variant"
114 |   where mk x = M.singleton c x
115 | 
116 | -- | One or more code points, optionally separated by whitespace on the same line.
117 | charsP :: Parser [Char]
118 | charsP = charP `sepBy1` skipTrueSpace
119 | 
120 | -- | One or more code points, each followed by its (possibly empty) citation.
121 | variantsP :: Parser [Variant]
122 | variantsP = liftA2 (,) charP variantCitationP `sepBy1` skipTrueSpace
123 | 
124 | semanticVariantTypeP :: Parser SemanticVariantType
125 | semanticVariantTypeP = char 'T' *> pure T
126 |                    <|> char 'B' *> pure B
127 |                    <|> char 'Z' *> pure Z
128 |                    <|> char 'F' *> pure F
129 |                    <|> char 'J' *> pure J
130 |                    <?> "semantic variant type"
131 | 
132 | -- | Parses @<@ followed by comma-separated sources, each optionally followed
133 | -- by @:@ and one or more variant type letters. No @<@ means no citation.
134 | variantCitationP :: Parser VariantCitation
135 | variantCitationP = char '<' *> (entryP `sepBy1` char ',')
136 |                <|> pure [] -- Z-variants are commonly uncited
137 |                <?> "variant citation"
138 |   where entryP = liftA2 (\which mb_xs -> (which, fromMaybe [] mb_xs)) sourceP (optional (char ':' *> many1 semanticVariantTypeP))
139 |         sourceP = takeWhile1 isAlphaNum
140 | 

--------------------------------------------------------------------------------
/CJK/Data/Unihan/RadicalStrokeCounts.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | module CJK.Data.Unihan.RadicalStrokeCounts (
3 |     -- * Dictionary- and standard-consistent radicals
4 |     IsSimplifiedKangXi, unicode, kangXi, kanWa,
5 | 
6 |     -- * Language-consistent radicals
7 |     korean, japanese,
8 | 
9 |     -- * Font-consistent radicals
10 |     AdobeJapan1_6(..), adobeJapan1_6
11 |   ) where
12 | 
13 | import CJK.Data.Types
14 | import CJK.Utilities
15 | 
16 | import Control.Applicative
17 | 
18 | import qualified Data.Text as Text
19 | import qualified Data.Text.Lazy as TextL
20 | import Data.Attoparsec.Text
21 | 
22 | import Data.Char
23 | import Data.Maybe
24 | import qualified Data.Map as M
25 | import Data.List
26 | 
27 | import System.IO.Unsafe
28 | 
29 | 
30 | -- | Whether the character is formed from the simplified version of the radical
31 | type IsSimplifiedKangXi = Bool
32 | 
33 | data AdobeJapan1_6 = AJ1_6 {
34 |     aJ1_6IsDirect :: Bool, -- ^ True if the Unicode code point maps directly to the Adobe-Japan1-6 CID, or False if it is a variant form which is not directly encoded
35 |     aJ1_6CID :: Int, -- ^ The ID of the character in the Adobe-Japan1-6 font
36 |     aJ1_6RadicalStrokeCount :: RadicalStrokeCount (KangXiRadical, StrokeCount) -- ^ The radical, paired with the stroke count of the form it takes in the glyph, and the additional strokes
37 |   } deriving (Show) -- Useful for debugging in GHCi
38 | 
39 | -- | Radical/stroke count in the Adobe-Japan1-6 font
40 | --
41 | -- This data is unusual in that it explicitly includes the stroke count for the form that the radical takes in the glyph.
42 | adobeJapan1_6 :: Char -> [AdobeJapan1_6]
43 | adobeJapan1_6 c = M.findWithDefault [] c (kRSAdobe_Japan1_6 strokeCounts)
44 | 
45 | -- | Radical/stroke counts usually used in Japanese
46 | japanese :: Char -> [RadicalStrokeCount KangXiRadical]
47 | japanese c = M.findWithDefault [] c (kRSJapanese strokeCounts)
48 | 
49 | -- | Radical/stroke counts consistent with the KangXi dictionary
50 | kangXi :: Char -> [RadicalStrokeCount KangXiRadical]
51 | kangXi c = M.findWithDefault [] c (kRSKangXi strokeCounts)
52 | 
53 | -- | Radical/stroke counts consistent with the Morohashi dictionary
54 | kanWa :: Char -> [RadicalStrokeCount KangXiRadical]
55 | kanWa c = M.findWithDefault [] c (kRSKanWa strokeCounts)
56 | 
57 | -- | Radical/stroke counts usually used in Korean
58 | korean :: Char -> [RadicalStrokeCount KangXiRadical]
59 | korean c = M.findWithDefault [] c (kRSKorean strokeCounts)
60 | 
61 | -- | Radical/stroke count consistent with ISO/IEC 10646
62 | --
63 | -- The first value in the returned list, if any, is equal to the normative radical-stroke value defined in ISO/IEC 10646.
64 | unicode :: Char -> [RadicalStrokeCount (KangXiRadical, IsSimplifiedKangXi)] 65 | unicode c = M.findWithDefault [] c (kRSUnicode strokeCounts) 66 | 67 | 68 | data StrokeCountsMap = SMS { 69 | kRSAdobe_Japan1_6 :: !(M.Map Char [AdobeJapan1_6]), 70 | kRSJapanese :: !(M.Map Char [RadicalStrokeCount KangXiRadical]), 71 | kRSKangXi :: !(M.Map Char [RadicalStrokeCount KangXiRadical]), 72 | kRSKanWa :: !(M.Map Char [RadicalStrokeCount KangXiRadical]), 73 | kRSKorean :: !(M.Map Char [RadicalStrokeCount KangXiRadical]), 74 | kRSUnicode :: !(M.Map Char [RadicalStrokeCount (KangXiRadical, IsSimplifiedKangXi)]) 75 | } deriving (Show) -- Useful for debugging in GHCi 76 | 77 | emptyStrokeCountsMap :: StrokeCountsMap 78 | emptyStrokeCountsMap = SMS M.empty M.empty M.empty M.empty M.empty M.empty 79 | 80 | unionStrokeCountsMap :: StrokeCountsMap -> StrokeCountsMap -> StrokeCountsMap 81 | unionStrokeCountsMap (SMS a1 a2 a3 a4 a5 a6) (SMS b1 b2 b3 b4 b5 b6) 82 | = SMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4 b4) (plus a5 b5) (plus a6 b6) 83 | where plus = M.unionWith (error "unionStrokeCountsMap: impossible") -- There is at most one line for each (character, field name) combination 84 | 85 | {-# NOINLINE contents #-} -- Keep the unsafePerformIO file read from being duplicated by inlining 86 | contents :: TextL.Text 87 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_RadicalStrokeCounts.txt") 88 | 89 | strokeCounts :: StrokeCountsMap 90 | strokeCounts = parseLazy fileP contents 91 | 92 | 93 | fileP :: Parser StrokeCountsMap 94 | fileP = fmap (foldl' unionStrokeCountsMap emptyStrokeCountsMap) (lineP `manyTill` endOfInput) -- Merge the per-line maps, left to right, until end of input 95 | 96 | lineP :: Parser StrokeCountsMap 97 | lineP = do { c <- charP <* skipSpace; dataP <- strokeCountP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator } -- Data line: character, field name, then the payload parser chosen by the field name 98 | <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyStrokeCountsMap -- Comment line: contributes nothing 99 | <|> manyTill skipTrueSpace lineTerminator *> pure emptyStrokeCountsMap -- Otherwise only skipTrueSpace input may precede the line terminator 100 | <?> "line" -- Names the whole alternation in parse-failure messages 101 | 102 | strokeCountP :: Char -> Parser (Parser StrokeCountsMap) 103 | 
strokeCountP c = string "kRSAdobe_Japan1_6" *> pure (liftA (\x -> emptyStrokeCountsMap { kRSAdobe_Japan1_6 = mk x }) (rsAdobe_Japan1_6P `sepBy1` skipTrueSpace)) 104 | <|> string "kRSJapanese" *> pure (liftA (\x -> emptyStrokeCountsMap { kRSJapanese = mk x }) (radicalStrokeCountP `sepBy1` skipTrueSpace)) 105 | <|> string "kRSKangXi" *> pure (liftA (\x -> emptyStrokeCountsMap { kRSKangXi = mk x }) (radicalStrokeCountP `sepBy1` skipTrueSpace)) 106 | <|> string "kRSKanWa" *> pure (liftA (\x -> emptyStrokeCountsMap { kRSKanWa = mk x }) (radicalStrokeCountP `sepBy1` skipTrueSpace)) 107 | <|> string "kRSKorean" *> pure (liftA (\x -> emptyStrokeCountsMap { kRSKorean = mk x }) (radicalStrokeCountP `sepBy1` skipTrueSpace)) 108 | <|> string "kRSUnicode" *> pure (liftA (\x -> emptyStrokeCountsMap { kRSUnicode = mk x }) (rsUnicodeP `sepBy1` skipTrueSpace)) 109 | where mk x = M.singleton c x 110 | 111 | rsAdobe_Japan1_6P :: Parser AdobeJapan1_6 112 | rsAdobe_Japan1_6P = liftA3 AJ1_6 isDirectP (char '+' *> decimal) (char '+' *> rscP) 113 | where isDirectP = char 'C' *> pure True 114 | <|> char 'V' *> pure False 115 | rscP = liftA3 (\kx kxn n -> RSC (KangXi kx, kxn) n) decimal (char '.' *> decimal) (char '.' *> decimal) 116 | 117 | rsUnicodeP :: Parser (RadicalStrokeCount (KangXiRadical, IsSimplifiedKangXi)) 118 | rsUnicodeP = liftA3 (\kx is_simp n -> RSC (KangXi kx, is_simp) n) decimal (canParse (char '\'')) (char '.' *> decimal) 119 | 120 | radicalStrokeCountP :: Parser (RadicalStrokeCount KangXiRadical) 121 | radicalStrokeCountP = liftA2 RSC (fmap KangXi decimal) (char '.' 
*> decimal) 122 | -------------------------------------------------------------------------------- /CJK/Data/CEDICT.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, PatternGuards #-} 2 | module CJK.Data.CEDICT ( 3 | Reading, showReading, showSpacelessReading, showReadingAccented, showSpacelessReadingAccented, 4 | Word(..), showHeadWord, 5 | DefinitionToken(..), WordDefinition(..), Definition(..), 6 | entries 7 | ) where 8 | 9 | import CJK.Utilities 10 | import CJK.Data.Internal 11 | import CJK.Data.Pinyin 12 | 13 | import Control.Applicative 14 | import Data.Maybe 15 | import Data.List (intercalate) 16 | 17 | import qualified Data.ByteString.Lazy as BS 18 | 19 | import qualified Data.Text as Text 20 | import qualified Data.Text.Lazy as TextL 21 | 22 | import Data.Char 23 | import Data.Monoid 24 | import Data.Attoparsec.Combinator 25 | import Data.Attoparsec.Text hiding (parse, eitherResult) 26 | import Data.Attoparsec.Text.Lazy 27 | 28 | import System.IO.Unsafe 29 | 30 | import Prelude hiding (takeWhile) 31 | 32 | 33 | type Reading = [Either Text.Text Phone] 34 | 35 | showReading :: Reading -> String 36 | showReading yins = intercalate " " (map (either Text.unpack show) yins) 37 | 38 | showSpacelessReading :: Reading -> String 39 | showSpacelessReading yins = concat (map (either Text.unpack show) yins) 40 | 41 | showReadingAccented :: Reading -> String 42 | showReadingAccented yins = intercalate " " (map (either Text.unpack (Text.unpack . toAccented)) yins) 43 | 44 | showSpacelessReadingAccented :: Reading -> String 45 | showSpacelessReadingAccented yins = concat (map (either Text.unpack (Text.unpack . 
toAccented)) yins) 46 | 47 | 48 | data Word = Word { 49 | traditional :: [Char], 50 | simplified :: [Char], 51 | reading :: Reading 52 | } 53 | 54 | bracketed s = "[" ++ s ++ "]" 55 | 56 | instance Show Word where 57 | show word | simplified word == traditional word = traditional word ++ bracketed (showReading (reading word)) 58 | | otherwise = traditional word ++ "|" ++ simplified word ++ bracketed (showReading (reading word)) 59 | 60 | -- | Show a word as in the head of a dictionary entry 61 | showHeadWord :: Word -> String 62 | showHeadWord word = traditional word ++ " " ++ simplified word ++ " " ++ bracketed (showReading (reading word)) 63 | 64 | mkWord :: [Char] -> [Char] -> Reading -> Word 65 | -- Fix problems in dictionary: 66 | mkWord trad simp yins 67 | | ntrad /= nsimp = error $ "mkWord: differing numbers of traditional and simplified characters (" ++ show trad ++ " vs. " ++ show simp ++ ")" 68 | | otherwise = case (trad, simp) of 69 | ("中國左翼作家聯盟", "中国左翼作家联盟") | nyins == 6 -> Word trad simp ([Right (Phone "Zhong" Flat), Right (Phone "guo" Rising)] ++ yins) 70 | ("甘孜藏族自治州甘孜藏族自治州", "甘孜藏族自治州甘孜藏族自治州") | nyins == 7 -> Word "甘孜藏族自治州" "甘孜藏族自治州" yins 71 | ("睿宗", "睿宗") | nyins == 3 -> Word trad simp (tail yins) 72 | ("泰米爾納德", "泰米尔纳德") | nyins == 6 -> Word trad simp (init yins) 73 | ("Zhou周文王", "Zhou周文王") | nyins == 3 -> Word "周文王" "周文王" yins 74 | ("美國51區", "美国51区") | nyins == 6 -> Word "美國五十一區" "美国五十一区" yins 75 | _ -- Check for missing 市 suffix which is present in yins in examples like 棗莊|枣庄 76 | | ntrad + 1 == nyins, Right (Phone "shi" Falling) <- last yins -> Word trad simp (init yins) 77 | -- Check for 市 suffix which is missing in yins in examples like 鹿泉市 78 | | ntrad == nyins + 1, '市' <- last trad -> Word trad simp (yins ++ [Right (Phone "shi" Falling)]) 79 | -- Last-ditch check for an unhandled error 80 | | ntrad /= nyins -> error $ "mkWord: differing numbers of characters and readings (" ++ show trad ++ " vs. 
" ++ bracketed (showReading yins) ++ ")" 81 | | otherwise -> Word trad simp yins 82 | where ntrad = length trad 83 | nsimp = length simp 84 | nyins = length yins 85 | 86 | 87 | data DefinitionToken = PlainToken Text.Text 88 | | WordToken Word 89 | 90 | instance Show DefinitionToken where 91 | show (PlainToken text) = Text.unpack text 92 | show (WordToken word) = show word 93 | 94 | 95 | data WordDefinition = WordClassifiers [Word] 96 | | WordDefinition [DefinitionToken] 97 | 98 | instance Show WordDefinition where 99 | show (WordClassifiers wrds) = "CL:" ++ intercalate "," (map show wrds) 100 | show (WordDefinition tokens) = concatMap show tokens 101 | 102 | 103 | data Definition = Definition { 104 | word :: Word, 105 | definitions :: [WordDefinition] 106 | } 107 | 108 | instance Show Definition where 109 | show definition = showHeadWord (word definition) ++ " /" ++ intercalate "/" (map show (definitions definition)) ++ "/" 110 | 111 | 112 | {-# NOINLINE contents #-} 113 | contents :: TextL.Text 114 | contents = unsafePerformIO (readUTF8DataFile "data/cedict_1_0_ts_utf-8_mdbg.txt") 115 | 116 | entries :: [Definition] 117 | entries = parseLazy fileP contents 118 | 119 | fileP :: Parser [Definition] 120 | fileP = fmap catMaybes (many lineP) 121 | 122 | lineP :: Parser (Maybe Definition) 123 | lineP = char '#' *> manyTill anyChar lineTerminator *> pure Nothing 124 | <|> liftA4 (\trad simp yins defs -> Just (Definition { word = mkWord trad simp yins, definitions = defs })) nonSpaceP nonSpaceP (readingP <* space) definitionsP <* lineTerminator 125 | 126 | readingP :: Parser Reading 127 | readingP = char '[' *> (yinP `sepBy1` space) <* char ']' 128 | 129 | yinP :: Parser (Either Text.Text Phone) 130 | yinP = liftA Right tonedPinyinP 131 | <|> liftA Left (takeWhile1 (\c -> not (isSpace c) && c /= ']')) -- CEDICT explicitly writes tone 5, so any missing tones must be for non-Chinese 132 | 133 | toneP :: Parser Tone 134 | toneP = char '1' *> pure Flat 135 | <|> char '2' *> 
pure Rising 136 | <|> char '3' *> pure FallingRising 137 | <|> char '4' *> pure Falling 138 | <|> char '5' *> pure Neutral 139 | 140 | definitionsP :: Parser [WordDefinition] 141 | definitionsP = char '/' *> many1 (definitionP <* char '/') 142 | 143 | definitionP :: Parser WordDefinition 144 | definitionP = liftA WordClassifiers (string "CL:" *> (wordP `sepBy1` (char ',' >> skipWhile isSpace))) -- In entries like 個|个[ge4] or CL:個|个[ge4],隻|只[zhi1] the characters do not have to have a space before them, so special case it 145 | <|> liftA WordDefinition (many tokenP) 146 | 147 | tokenP :: Parser DefinitionToken 148 | tokenP = liftA WordToken wordP 149 | <|> liftA3 (\hoklo chars end -> PlainToken (hoklo <> chars <> end)) (string "Hoklo:") (takeWhile (/= ']')) (string "]") -- There are two rogue entries containing Hoklo: 無甚物[bô-siáⁿ-mi̍h] 150 | <|> liftA PlainToken (takeWhile1 (\c -> not (isSpace c || c == '(') && c /= '/')) 151 | <|> liftA PlainToken (takeWhile1 (\c -> isTrueSpace c || c == '(')) 152 | 153 | wordP :: Parser Word 154 | wordP = liftA3 (\trad mb_simp yins -> mkWord trad (fromMaybe trad mb_simp) yins) chineseP (optional (char '|' *> chineseP)) readingP 155 | where 156 | chineseP :: Parser [Char] 157 | chineseP = many1 (satisfy (\c -> not (isSpace c) && c /= '/' && c /= '|' && c /= '[')) 158 | 159 | nonSpaceP :: Parser [Char] 160 | nonSpaceP = many1 (satisfy (not . 
isSpace)) <* space 161 | -------------------------------------------------------------------------------- /CJK/Data/Unihan/DictionaryLikeData.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module CJK.Data.Unihan.DictionaryLikeData ( 3 | cangjie, 4 | CheungBauer(..), cheungBauer, 5 | cihai, 6 | Fenn(..), fenn, 7 | fourCornerCode, 8 | frequency, 9 | gradeLevel, 10 | hdzRadBreak, 11 | hkGlyph, 12 | phonetic, 13 | totalStrokes 14 | ) where 15 | 16 | import qualified CJK.Data.Jyutping as Jyutping 17 | import CJK.Data.Internal 18 | import CJK.Data.Types 19 | import CJK.Utilities 20 | 21 | import Control.Applicative 22 | 23 | import qualified Data.Text as Text 24 | import qualified Data.Text.Lazy as TextL 25 | import Data.Attoparsec.Text 26 | 27 | import Data.Char 28 | import Data.Maybe 29 | import qualified Data.Map as M 30 | import Data.Monoid 31 | import Data.List 32 | 33 | import System.IO.Unsafe 34 | 35 | 36 | data CheungBauer = CB { 37 | cbRadicalStrokeCount :: RadicalStrokeCount KangXiRadical, 38 | cbCangjie :: Maybe CangjieInputCode, 39 | cbReading :: [Jyutping.Phone] -- ^ Readings are in alphabetical order 40 | } deriving (Show) -- Useful for debugging in GHCi 41 | 42 | data Fenn = Fenn { 43 | fennSoothill :: Maybe Int, -- ^ Soothill number of the character's phonetic, if any 44 | fennFrequency :: Maybe Int -- ^ Number from 1 to 11 indicating roughly which group of 500 most popular characters this character is included in (i.e. 1 is the first 500 characters, 2 the next 500 characters etc). Nothing if the character is rare. 45 | } deriving (Show) -- Useful for debugging in GHCi 46 | 47 | -- | The cangjie input code for the character 48 | cangjie :: Char -> Maybe CangjieInputCode 49 | cangjie c = M.lookup c (kCangjie dictionaryLikes) 50 | 51 | -- | Data regarding the character in Cheung Kwan-hin and Robert S. 
Bauer, _The Representation of Cantonese with Chinese Characters_, Journal of Chinese Linguistics, Monograph Series Number 18, 2002 52 | cheungBauer :: Char -> [CheungBauer] 53 | cheungBauer c = M.findWithDefault [] c (kCheungBauer dictionaryLikes) 54 | 55 | -- | The position(s) of this character in the Cihai (辭海) dictionary, single volume edition, published in Hong Kong by the Zhonghua Bookstore, 1983 (reprint of the 1947 edition), ISBN 962-231-005-2. 56 | -- 57 | -- The position is indicated by a decimal number. The digits to the left of the decimal are the page number. The first digit after the decimal is the row on the page, and the remaining two digits after the decimal are the position on the row. 58 | cihai :: Char -> [Text.Text] 59 | cihai c = M.findWithDefault [] c (kCihaiT dictionaryLikes) 60 | 61 | -- | Data on the character from The Five Thousand Dictionary (aka Fenn’s Chinese-English Pocket Dictionary) by Courtenay H. Fenn, Cambridge, Mass.: Harvard University Press, 1979. 62 | fenn :: Char -> [Fenn] 63 | fenn c = M.findWithDefault [] c (kFenn dictionaryLikes) 64 | 65 | -- | The four-corner code(s) for the character 66 | -- 67 | -- The four-corner system assigns each character a four-digit code from 0 through 9. The digit is derived from the “shape” of the four corners of the character (upper-left, upper-right, lower-left, lower-right). An optional fifth digit 68 | -- can be used to further distinguish characters; the fifth digit is derived from the shape in the character’s center or region immediately to the left of the fourth corner. 69 | -- 70 | -- The four-corner system is now used only rarely. Full descriptions are available online, e.g., at . 
71 | fourCornerCode :: Char -> [Text.Text] 72 | fourCornerCode c = M.findWithDefault [] c (kFourCornerCode dictionaryLikes) 73 | 74 | -- | A rough frequency measurement for the character based on analysis of traditional Chinese USENET postings; characters with a kFrequency of 1 are the most common, those with a kFrequency of 2 are less common, and so on, through a kFrequency of 5. 75 | frequency :: Char -> Maybe Int 76 | frequency c = M.lookup c (kFrequency dictionaryLikes) 77 | 78 | -- | The primary grade in the Hong Kong school system by which a student is expected to know the character; this data is derived from 朗文初級中文詞典, Hong Kong: Longman, 2001 79 | gradeLevel :: Char -> Maybe Int 80 | gradeLevel c = M.lookup c (kGradeLevel dictionaryLikes) 81 | 82 | -- | Does 《漢語大字典》 Hanyu Da Zidian have a radical break beginning at this character’s position? If so, returns the radical and the Hanyu Da Zidian position as in the kHanyu field. 83 | hdzRadBreak :: Char -> Maybe (Char, HDZEntry) 84 | hdzRadBreak c = M.lookup c (kHDZRadBreak dictionaryLikes) 85 | 86 | -- | The index of the character in 常用字字形表 (二零零零年修訂本),香港: 香港教育學院, 2000, ISBN 962-949-040-4. This publication gives the “proper” shapes for 4759 characters as used in the Hong Kong school system 87 | hkGlyph :: Char -> [Int] 88 | hkGlyph c = M.findWithDefault [] c (kHKGlyph dictionaryLikes) 89 | 90 | -- | The phonetic index for the character from _Ten Thousand Characters: An Analytic Dictionary_, by G. Hugh Casey, S.J. Hong Kong: Kelley and Walsh, 1980 91 | phonetic :: Char -> [Text.Text] 92 | phonetic c = M.findWithDefault [] c (kPhonetic dictionaryLikes) 93 | 94 | -- | The total number of strokes in the character (including the radical), that is, the stroke count most commonly associated with the character in modern text using customary fonts. 
95 | -- 96 | -- The first value is preferred for zh-Hans (CN) and the second is preferred for zh-Hant (TW) 97 | totalStrokes :: Char -> Maybe (StrokeCount, StrokeCount) 98 | totalStrokes c = M.lookup c (kTotalStrokes dictionaryLikes) 99 | 100 | 101 | data DictionaryLikesMap = DMS { -- One map per Unihan field; fields are strict so the foldl' in fileP does not pile up unevaluated unions 102 | kCangjie :: !(M.Map Char CangjieInputCode), 103 | kCheungBauer :: !(M.Map Char [CheungBauer]), 104 | kCihaiT :: !(M.Map Char [Text.Text]), 105 | kFenn :: !(M.Map Char [Fenn]), 106 | kFourCornerCode :: !(M.Map Char [Text.Text]), 107 | kFrequency :: !(M.Map Char Int), 108 | kGradeLevel :: !(M.Map Char Int), 109 | kHDZRadBreak :: !(M.Map Char (Char, HDZEntry)), 110 | kHKGlyph :: !(M.Map Char [Int]), 111 | kPhonetic :: !(M.Map Char [Text.Text]), 112 | kTotalStrokes :: !(M.Map Char (StrokeCount, StrokeCount)) 113 | } deriving (Show) -- Useful for debugging in GHCi 114 | 115 | emptyDictionaryLikesMap :: DictionaryLikesMap 116 | emptyDictionaryLikesMap = DMS M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty 117 | 118 | unionDictionaryLikesMap :: DictionaryLikesMap -> DictionaryLikesMap -> DictionaryLikesMap 119 | unionDictionaryLikesMap (DMS a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11) (DMS b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11) 120 | = DMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4 b4) (plus a5 b5) (plus a6 b6) 121 | (plus a7 b7) (plus a8 b8) (plus a9 b9) (plus a10 b10) (plus a11 b11) 122 | where plus = M.unionWith (error "unionDictionaryLikesMap: impossible") -- There is at most one line for each (character, field name) combination 123 | 124 | {-# NOINLINE contents #-} -- Keep the unsafePerformIO file read from being duplicated by inlining 125 | contents :: TextL.Text 126 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_DictionaryLikeData.txt") 127 | 128 | dictionaryLikes :: DictionaryLikesMap 129 | dictionaryLikes = parseLazy fileP contents 130 | 131 | 132 | fileP :: Parser DictionaryLikesMap 133 | fileP = fmap (foldl' unionDictionaryLikesMap emptyDictionaryLikesMap) (lineP `manyTill` endOfInput) -- Merge the per-line maps, left to right, until end of input 134 | 135 | lineP :: Parser DictionaryLikesMap 136 
| lineP = do { c <- charP <* skipSpace; dataP <- dictionaryLikeP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator } -- Data line: character, field name, then the payload parser chosen by the field name 137 | <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyDictionaryLikesMap -- Comment line: contributes nothing 138 | <|> manyTill skipTrueSpace lineTerminator *> pure emptyDictionaryLikesMap -- Otherwise only skipTrueSpace input may precede the line terminator 139 | <?> "line" -- Names the whole alternation in parse-failure messages 140 | 141 | dictionaryLikeP :: Char -> Parser (Parser DictionaryLikesMap) 142 | dictionaryLikeP c = string "kCangjie" *> pure (liftA (\x -> emptyDictionaryLikesMap { kCangjie = mk x }) cangjieP) 143 | <|> string "kCheungBauer" *> pure (liftA (\x -> emptyDictionaryLikesMap { kCheungBauer = mk x }) (cheungBauerP `sepBy1` skipTrueSpace)) 144 | <|> string "kCihaiT" *> pure (liftA (\x -> emptyDictionaryLikesMap { kCihaiT = mk x }) (takeWhile1 (\ch -> isDigit ch || ch == '.') `sepBy1` skipTrueSpace)) 145 | <|> string "kFenn" *> pure (liftA (\x -> emptyDictionaryLikesMap { kFenn = mk x }) (fennP `sepBy1` skipTrueSpace)) 146 | <|> string "kFourCornerCode" *> pure (liftA (\x -> emptyDictionaryLikesMap { kFourCornerCode = mk x }) (takeWhile1 (\ch -> isDigit ch || ch == '.') `sepBy1` skipTrueSpace)) 147 | <|> string "kFrequency" *> pure (liftA (\x -> emptyDictionaryLikesMap { kFrequency = mk x }) decimal) 148 | <|> string "kGradeLevel" *> pure (liftA (\x -> emptyDictionaryLikesMap { kGradeLevel = mk x }) decimal) 149 | <|> string "kHDZRadBreak" *> pure (liftA (\x -> emptyDictionaryLikesMap { kHDZRadBreak = mk x }) hdzRadBreakP) 150 | <|> string "kHKGlyph" *> pure (liftA (\x -> emptyDictionaryLikesMap { kHKGlyph = mk x }) (decimal `sepBy1` skipTrueSpace)) 151 | <|> string "kPhonetic" *> pure (liftA (\x -> emptyDictionaryLikesMap { kPhonetic = mk x }) (takeWhile1 (\ch -> isDigit ch || isAsciiUpper ch || ch == '*') `sepBy1` skipTrueSpace)) 152 | <|> string "kTotalStrokes" *> pure (liftA (\x -> emptyDictionaryLikesMap { kTotalStrokes = mk x }) totalStrokesP) 153 | where mk x = M.singleton c x -- Each line yields a one-key map for the line's character c 154 | 155 | cangjieP :: Parser CangjieInputCode 156 | cangjieP = takeWhile1 isAsciiUpper 157 | 158 | 
cheungBauerP :: Parser CheungBauer 159 | cheungBauerP = liftA3 CB rscP (char ';' *> optional cangjieP) (char ';' *> liftA concat (jyutpingPatternP `sepBy1` char ',')) 160 | where rscP = liftA2 RSC (fmap KangXi decimal) (char '/' *> decimal) 161 | 162 | jyutpingPatternP :: Parser [Jyutping.Phone] 163 | jyutpingPatternP = liftA2 (\sounds tones -> [Jyutping.Phone sound tone | sound <- sounds, tone <- tones]) soundP toneP 164 | where 165 | -- Some kCheungBauer says [ng]ai1 166 | soundP = liftA2 (\opt nexts -> [here | next <- nexts, here <- [next, opt <> next]]) (char '[' *> takeWhile1 (/= ']') <* char ']') soundP 167 | <|> liftA (\x -> [x]) (takeWhile1 (\c -> isAsciiUpper c || isAsciiLower c)) 168 | 169 | -- Some kCheungBauer says min6/2 170 | toneP = jyutpingToneP `sepBy1` char '/' 171 | 172 | fennP :: Parser Fenn 173 | fennP = liftA2 Fenn groupP (optional (char 'a') *> frequencyP) -- Can't find any info on what the optional 'a' means 174 | where groupP = char '0' *> pure Nothing -- Characters which have a frequency letter but no Soothill phonetic group 175 | <|> fmap Just decimal 176 | frequencyP = char 'A' *> return (Just 1) 177 | <|> char 'B' *> return (Just 2) 178 | <|> char 'C' *> return (Just 3) 179 | <|> char 'D' *> return (Just 4) 180 | <|> char 'E' *> return (Just 5) 181 | <|> char 'F' *> return (Just 6) 182 | <|> char 'G' *> return (Just 7) 183 | <|> char 'H' *> return (Just 8) 184 | <|> char 'I' *> return (Just 9) 185 | <|> char 'J' *> return (Just 10) 186 | <|> char 'K' *> return (Just 11) 187 | <|> char 'P' *> return Nothing -- Conflate these two cases: 188 | <|> char '*' *> return Nothing -- who really cares? 
189 | 190 | hdzRadBreakP :: Parser (Char, HDZEntry) 191 | hdzRadBreakP = liftA2 (,) anyChar (char '[' *> string "U+" *> takeWhile1 isHexDigit *> char ']' *> char ':' *> hdzEntryP) 192 | 193 | totalStrokesP :: Parser (Int, Int) 194 | totalStrokesP = liftA2 (\simp mb_trad -> (simp, fromMaybe simp mb_trad)) decimal (skipTrueSpace *> optional decimal) 195 | -------------------------------------------------------------------------------- /CJK/Data/Unihan/Readings.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module CJK.Data.Unihan.Readings ( 3 | CharDefinition, definition, 4 | -- * Mandarin 5 | OccurrenceCount, IsHDZSubstitution, 6 | mandarinBestEffort, mandarin, hanyuPinlu, hanyuPinyin, xhc1983, 7 | -- * Cantonese 8 | cantonese, 9 | -- * Ancient Chinese 10 | CommonTangCharacter, tang, 11 | -- * Korean 12 | hangul, korean, 13 | -- * Japanese 14 | japaneseKun, japaneseOn, 15 | -- * Vietnamese 16 | vietnamese 17 | ) where 18 | 19 | import qualified CJK.Data.Hangul as Hangul 20 | import qualified CJK.Data.Jyutping as Jyutping 21 | import qualified CJK.Data.KoreanYale as KoreanYale 22 | import qualified CJK.Data.Pinyin as Pinyin 23 | import qualified CJK.Data.QuocNgu as QuocNgu 24 | import CJK.Data.Internal 25 | import CJK.Data.Types 26 | import CJK.Utilities 27 | 28 | import Control.Applicative 29 | 30 | import qualified Data.Text as Text 31 | import qualified Data.Text.Lazy as TextL 32 | import Data.Attoparsec.Text 33 | 34 | import Data.Char 35 | import Data.Maybe 36 | import qualified Data.Map as M 37 | import Data.List 38 | 39 | import System.IO.Unsafe 40 | 41 | 42 | type CharDefinition = Text.Text 43 | 44 | -- | The sum total of the frequencies of the pronunciations of the character as given in 《現代漢語頻率詞典》 45 | type OccurrenceCount = Int 46 | 47 | -- | Whether the word or morpheme represented in toto or in part by the given character with the given reading occurs more than 48 | -- four 
times in the seven hundred poems covered by "T’ang Poetic Vocabulary" by Hugh M. Stimson, Far Eastern Publications, Yale Univ. 1976 49 | type CommonTangCharacter = Bool 50 | 51 | -- | Whether this reference had an encoded variant substituted for an unencoded character used by the Hànyǔ Dà Zìdiǎn 52 | type IsHDZSubstitution = Bool 53 | 54 | 55 | -- | Returns how to pronounce an ideograph in Mandarin, making the best effort to use all of the Unihan data to get a good answer. 56 | -- Readings are returned in approximate frequency order, with duplicates (same sound ignoring case, same tone) removed. 57 | -- 58 | -- This algorithm is based on the Unihan FAQ, which states that the best way is to use the kHanyuPinlu, kXHC1983, 59 | -- and kHanyuPinyin fields in that order. The kMandarin field may have some readings the other three do not but should be used with caution. The kHanyuPinlu 60 | -- field lists the most common readings for ideographs in order of frequency of use and is the most useful for most purposes. The kXHC1983 61 | -- field contains the most important readings for characters in modern use, and the kHanyuPinyin field contains an exhaustive set of readings 62 | -- for a large set of characters, but includes obscure readings of historic interest only 63 | mandarinBestEffort :: Char -> [Pinyin.Phone] 64 | mandarinBestEffort c = nubBy eq $ map fst (hanyuPinlu c) ++ 65 | concatMap snd (xhc1983 c) ++ 66 | concatMap snd (hanyuPinyin c) ++ 67 | (case mandarin c of Nothing -> []; Just (simp, trad) -> [simp, trad]) -- NB: technically an abuse since this data is differentiated by mainland/Taiwan 68 | where yin1 `eq` yin2 = Text.toLower (Pinyin.sound yin1) == Text.toLower (Pinyin.sound yin2) && Pinyin.tone yin1 == Pinyin.tone yin2 -- Same reading when the sounds match case-insensitively and the tones are equal 69 | 70 | -- | The Cantonese pronunciation(s) for this character using the jyutping romanization. 71 | -- Cantonese pronunciations are sorted alphabetically, not in order of frequency. 72 | -- 73 | -- Cantonese data are derived from the following sources: 74 | -- * Casey, G. Hugh, S.J. 
Ten Thousand Characters: An Analytic Dictionary. Hong Kong: Kelley and Walsh,1980 (kPhonetic). 75 | -- * Cheung Kwan-hin and Robert S. Bauer, The Representation of Cantonese with Chinese Characters, Journal of Chinese Linguistics Monograph Series Number 18, 2002. 76 | -- * Roy T. Cowles, A Pocket Dictionary of Cantonese, Hong Kong: University Press, 1999 (kCowles). 77 | -- * Sidney Lau, A Practical Cantonese-English Dictionary, Hong Kong: Government Printer, 1977 (kLau). 78 | -- * Bernard F. Meyer and Theodore F. Wempe, Student’s Cantonese-English Dictionary, Maryknoll, New York: Catholic Foreign Mission Society of America, 1947 (kMeyerWempe). 79 | -- * 饒秉才, ed. 廣州音字典, Hong Kong: Joint Publishing (H.K.) Co., Ltd., 1989. 80 | -- * 中華新字典, Hong Kong:中華書局, 1987. 81 | -- * 黃港生, ed. 商務新詞典, Hong Kong: The Commercial Press, 1991. 82 | -- * 朗文初級中文詞典, Hong Kong: Longman, 2001. 83 | cantonese :: Char -> [Jyutping.Phone] 84 | cantonese c = M.findWithDefault [] c (kCantonese readings) 85 | 86 | -- | An English definition for this character. Definitions are for modern written Chinese and are usually (but not always) the 87 | -- same as the definition in other Chinese dialects or non-Chinese languages. In some cases, synonyms are indicated. Fuller variant 88 | -- information can be found using the various variant fields. 89 | -- 90 | -- Definitions specific to non-Chinese languages or Chinese dialects other than modern Mandarin are marked, e.g., (Cant.) or (J). 91 | -- Minor definitions are separated by commas. 92 | definition :: Char -> [CharDefinition] 93 | definition c = M.findWithDefault [] c (kDefinition readings) 94 | 95 | -- | The modern Korean pronunciation(s) for this character in Hangul. 96 | hangul :: Char -> [Hangul.Phone] 97 | hangul c = M.findWithDefault [] c (kHangul readings) 98 | 99 | -- | The Pronunciations and Frequencies of this character, based in part on those appearing in 100 | -- 《現代漢語頻率詞典》 (XDHYPLCD) [Modern Standard Beijing Chinese Frequency Dictionary]. 
101 | -- 102 | -- Where more than one pronunciation exists, these are sorted by descending frequency. 103 | -- The occurrence count indicates the sum total of the frequencies of the pronunciations of the character as given in HYPLCD. 104 | -- 105 | -- You may want to use 'mandarinBestEffort' instead of this function. 106 | hanyuPinlu :: Char -> [(Pinyin.Phone, OccurrenceCount)] 107 | hanyuPinlu c = M.findWithDefault [] c (kHanyuPinlu readings) 108 | 109 | -- | The 漢語拼音 Hànyǔ Pīnyīn reading(s) appearing in the edition of 《漢語大字典》 Hànyǔ Dà Zìdiǎn (HDZ). 110 | -- 111 | -- Where multiple pīnyīn readings are associated with a given mapping, these are ordered as in HDZ 112 | -- (for the most part reflecting relative commonality). 113 | -- 114 | -- Individual entries are in same order as they are found in the Hanyu Da Zidian. This is true both for 115 | -- the locations and the individual readings. While this is generally in the order of utility for modern Chinese, such is not invariably the case. 116 | -- 117 | -- You may want to use 'mandarinBestEffort' instead of this function. 118 | hanyuPinyin :: Char -> [([HDZEntry], [Pinyin.Phone])] 119 | hanyuPinyin c = M.findWithDefault [] c (kHanyuPinyin readings) 120 | 121 | -- | The Japanese kun'yomi pronunciation of this character, in an undefined romanization system. 122 | -- It is recommended that you use kanjidic2 instead of this data. 123 | japaneseKun :: Char -> [Text.Text] 124 | japaneseKun c = M.findWithDefault [] c (kJapaneseKun readings) 125 | 126 | -- | The Japanese on'yomi pronunciation of this character, in an undefined romanization system. 127 | -- It is recommended that you use kanjidic2 instead of this data. 128 | japaneseOn :: Char -> [Text.Text] 129 | japaneseOn c = M.findWithDefault [] c (kJapaneseOn readings) 130 | 131 | -- | The Korean pronunciation(s) of this character, using the Yale romanization system. 
132 | korean :: Char -> [KoreanYale.Phone] 133 | korean c = M.findWithDefault [] c (kKorean readings) 134 | 135 | -- | The most customary pinyin reading for this character; that is, the reading most commonly used in modern text, 136 | -- with some preference given to readings most likely to be in sorted lists. 137 | -- 138 | -- The first value returned is preferred for zh-Hans (CN) and the second is preferred for 139 | -- zh-Hant (TW). Commonly, they will be exactly the same. 140 | -- 141 | -- You may want to use 'mandarinBestEffort' instead of this function. 142 | mandarin :: Char -> Maybe (Pinyin.Phone, Pinyin.Phone) 143 | mandarin c = M.lookup c (kMandarin readings) 144 | 145 | -- | The Tang dynasty pronunciation(s) of this character, in an undefined romanization. 146 | tang :: Char -> [(CommonTangCharacter, Text.Text)] 147 | tang c = M.findWithDefault [] c (kTang readings) 148 | 149 | -- | The character’s pronunciation(s) in Quốc ngữ. 150 | vietnamese :: Char -> [QuocNgu.Phone] 151 | vietnamese c = M.findWithDefault [] c (kVietnamese readings) 152 | 153 | -- | One or more Hànyǔ Pīnyīn readings as given in the Xiàndài Hànyǔ Cídiǎn. 154 | -- 155 | -- You may want to use 'mandarinBestEffort' instead of this function. 156 | xhc1983 :: Char -> [([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])] 157 | xhc1983 c = M.findWithDefault [] c (kXHC1983 readings) 158 | 159 | 160 | data ReadingsMap = RMS { 161 | kCantonese :: !(M.Map Char [Jyutping.Phone]), 162 | kDefinition :: !(M.Map Char [CharDefinition]), 163 | kHangul :: !(M.Map Char [Hangul.Phone]), 164 | kHanyuPinlu :: !(M.Map Char [(Pinyin.Phone, OccurrenceCount)]), 165 | kHanyuPinyin :: !(M.Map Char [([HDZEntry], [Pinyin.Phone])]), 166 | kJapaneseKun :: !(M.Map Char [Text.Text]), -- Kun and On readings are in mixed 167 | kJapaneseOn :: !(M.Map Char [Text.Text]), -- romanization systems! Worthless... 
168 | kKorean :: !(M.Map Char [KoreanYale.Phone]), 169 | kMandarin :: !(M.Map Char (Pinyin.Phone, Pinyin.Phone)), 170 | kTang :: !(M.Map Char [(CommonTangCharacter, Text.Text)]), -- Who knows how this is romanized? 171 | kVietnamese :: !(M.Map Char [QuocNgu.Phone]), 172 | kXHC1983 :: !(M.Map Char [([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone])]) 173 | } deriving (Show) -- Useful for debugging in GHCi 174 | 175 | emptyReadingsMap :: ReadingsMap 176 | emptyReadingsMap = RMS M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty M.empty 177 | 178 | unionReadingsMap :: ReadingsMap -> ReadingsMap -> ReadingsMap 179 | unionReadingsMap (RMS a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12) (RMS b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12) 180 | = RMS (plus a1 b1) (plus a2 b2) (plus a3 b3) (plus a4 b4) (plus a5 b5) (plus a6 b6) 181 | (plus a7 b7) (plus a8 b8) (plus a9 b9) (plus a10 b10) (plus a11 b11) (plus a12 b12) 182 | where plus = M.unionWith (error "unionReadingsMap: impossible") -- There is at most one line for each (character, field name) combination 183 | 184 | {-# NOINLINE contents #-} -- Keep the unsafePerformIO file read from being duplicated by inlining 185 | contents :: TextL.Text 186 | contents = unsafePerformIO (readUTF8DataFile "data/Unihan/Unihan_Readings.txt") 187 | 188 | readings :: ReadingsMap 189 | readings = parseLazy fileP contents 190 | 191 | 192 | fileP :: Parser ReadingsMap 193 | fileP = fmap (foldl' unionReadingsMap emptyReadingsMap) (lineP `manyTill` endOfInput) -- Merge the per-line maps, left to right, until end of input 194 | 195 | lineP :: Parser ReadingsMap 196 | lineP = do { c <- charP <* skipSpace; dataP <- readingP c <* skipSpace; dataP <* skipTrueSpace <* lineTerminator } -- Data line: character, field name, then the payload parser chosen by the field name 197 | <|> char '#' *> manyTill anyChar lineTerminator *> pure emptyReadingsMap -- Comment line: contributes nothing 198 | <|> manyTill skipTrueSpace lineTerminator *> pure emptyReadingsMap -- Otherwise only skipTrueSpace input may precede the line terminator 199 | <?> "line" -- Names the whole alternation in parse-failure messages 200 | 201 | readingP :: Char -> Parser (Parser ReadingsMap) 202 | readingP c = string "kCantonese" *> pure (liftA (\x -> emptyReadingsMap { kCantonese = mk x }) (jyutpingP `sepBy1` skipTrueSpace)) 203 | <|> string 
"kDefinition" *> pure (liftA (\x -> emptyReadingsMap { kDefinition = mk x }) definitionsP) 204 | <|> string "kHangul" *> pure (liftA (\x -> emptyReadingsMap { kHangul = mk x }) (hangulP `sepBy1` skipTrueSpace)) 205 | <|> string "kHanyuPinlu" *> pure (liftA (\x -> emptyReadingsMap { kHanyuPinlu = mk x }) (hanyuPinluP `sepBy1` skipTrueSpace)) 206 | <|> string "kHanyuPinyin" *> pure (liftA (\x -> emptyReadingsMap { kHanyuPinyin = mk x }) (hanyuPinyinP `sepBy1` skipTrueSpace)) 207 | <|> string "kJapaneseKun" *> pure (liftA (\x -> emptyReadingsMap { kJapaneseKun = mk x }) (takeWhile1 isAsciiUpper `sepBy1` skipTrueSpace)) 208 | <|> string "kJapaneseOn" *> pure (liftA (\x -> emptyReadingsMap { kJapaneseOn = mk x }) (takeWhile1 isAsciiUpper `sepBy1` skipTrueSpace)) 209 | <|> string "kKorean" *> pure (liftA (\x -> emptyReadingsMap { kKorean = mk x }) (yaleP `sepBy1` skipTrueSpace)) 210 | <|> string "kMandarin" *> pure (liftA (\x -> emptyReadingsMap { kMandarin = mk x }) mandarinP) 211 | <|> string "kTang" *> pure (liftA (\x -> emptyReadingsMap { kTang = mk x }) (tangP `sepBy1` skipTrueSpace)) 212 | <|> string "kVietnamese" *> pure (liftA (\x -> emptyReadingsMap { kVietnamese = mk x }) (quocNguP `sepBy1` skipTrueSpace)) 213 | <|> string "kXHC1983" *> pure (liftA (\x -> emptyReadingsMap { kXHC1983 = mk x }) (xhc1983P `sepBy1` skipTrueSpace)) 214 | where mk x = M.singleton c x 215 | 216 | definitionsP :: Parser [CharDefinition] 217 | definitionsP = takeWhile1 (\c -> c /= '\r' && c /= '\n' && c /= ';') `sepBy1` (takeWhile1 (== ';') <* skipTrueSpace) -- Entry for U+4156 mistakenly includes a double ;; 218 | 219 | hangulP :: Parser Hangul.Phone 220 | hangulP = liftA Hangul.fromJamos (takeWhile1 (not . 
isSpace)) 221 | 222 | hanyuPinluP :: Parser (Pinyin.Phone, OccurrenceCount) 223 | hanyuPinluP = liftA2 (,) tonedPinyinP (char '(' *> decimal <* char ')') 224 | 225 | mandarinP :: Parser (Pinyin.Phone, Pinyin.Phone) 226 | mandarinP = liftA2 (\simp mb_trad -> (simp, fromMaybe simp mb_trad)) accentedPinyinP (optional (skipTrueSpace *> accentedPinyinP)) 227 | 228 | accentedPinyinP :: Parser Pinyin.Phone 229 | accentedPinyinP = liftA Pinyin.fromAccented (takeWhile1 (\c -> not (isSpace c) && c /= ',')) 230 | 231 | hanyuPinyinP :: Parser ([HDZEntry], [Pinyin.Phone]) 232 | hanyuPinyinP = liftA2 (,) (hdzEntryP `sepBy1` char ',') (char ':' *> (accentedPinyinP `sepBy1` char ',')) 233 | 234 | yaleP :: Parser KoreanYale.Phone 235 | yaleP = takeWhile1 isAsciiUpper 236 | 237 | tangP :: Parser (CommonTangCharacter, Text.Text) 238 | tangP = liftA2 (,) (canParse (char '*')) (takeWhile1 (not . isSpace)) 239 | 240 | quocNguP :: Parser QuocNgu.Phone 241 | quocNguP = takeWhile1 (not . isSpace) 242 | 243 | xhc1983P :: Parser ([(HDZEntry, IsHDZSubstitution)], [Pinyin.Phone]) 244 | xhc1983P = liftA2 (,) (locP `sepBy1` char ',') (char ':' *> (accentedPinyinP `sepBy1` char ',')) 245 | where locP = liftA2 (,) hdzEntryP (canParse (char '*')) 246 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The library code and data files are all distributed under their own licenses. 2 | 3 | 4 | === Code === 5 | 6 | Copyright (c) 2008, Maximilian Bolingbroke 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without modification, are permitted 10 | provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, this list of 13 | conditions and the following disclaimer. 
14 | * Redistributions in binary form must reproduce the above copyright notice, this list of 15 | conditions and the following disclaimer in the documentation and/or other materials 16 | provided with the distribution. 17 | * Neither the name of Maximilian Bolingbroke nor the names of other contributors may be used to 18 | endorse or promote products derived from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 21 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 22 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 26 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 27 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | === CC-CEDICT === 31 | 32 | This work is licensed under a Creative Commons Attribution-Share Alike 3.0 33 | License reproduced from http://creativecommons.org/licenses/by-sa/3.0/ below: 34 | 35 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE 36 | COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY 37 | COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS 38 | AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 39 | 40 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE 41 | TO BE BOUND BY THE TERMS OF THIS LICENSE. 
TO THE EXTENT THIS LICENSE MAY 42 | BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS 43 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND 44 | CONDITIONS. 45 | 46 | 1. Definitions 47 | 48 | a. "Adaptation" means a work based upon the Work, or upon the Work and 49 | other pre-existing works, such as a translation, adaptation, 50 | derivative work, arrangement of music or other alterations of a 51 | literary or artistic work, or phonogram or performance and includes 52 | cinematographic adaptations or any other form in which the Work may be 53 | recast, transformed, or adapted including in any form recognizably 54 | derived from the original, except that a work that constitutes a 55 | Collection will not be considered an Adaptation for the purpose of 56 | this License. For the avoidance of doubt, where the Work is a musical 57 | work, performance or phonogram, the synchronization of the Work in 58 | timed-relation with a moving image ("synching") will be considered an 59 | Adaptation for the purpose of this License. 60 | b. "Collection" means a collection of literary or artistic works, such as 61 | encyclopedias and anthologies, or performances, phonograms or 62 | broadcasts, or other works or subject matter other than works listed 63 | in Section 1(f) below, which, by reason of the selection and 64 | arrangement of their contents, constitute intellectual creations, in 65 | which the Work is included in its entirety in unmodified form along 66 | with one or more other contributions, each constituting separate and 67 | independent works in themselves, which together are assembled into a 68 | collective whole. A work that constitutes a Collection will not be 69 | considered an Adaptation (as defined below) for the purposes of this 70 | License. 71 | c. 
"Creative Commons Compatible License" means a license that is listed 72 | at http://creativecommons.org/compatiblelicenses that has been 73 | approved by Creative Commons as being essentially equivalent to this 74 | License, including, at a minimum, because that license: (i) contains 75 | terms that have the same purpose, meaning and effect as the License 76 | Elements of this License; and, (ii) explicitly permits the relicensing 77 | of adaptations of works made available under that license under this 78 | License or a Creative Commons jurisdiction license with the same 79 | License Elements as this License. 80 | d. "Distribute" means to make available to the public the original and 81 | copies of the Work or Adaptation, as appropriate, through sale or 82 | other transfer of ownership. 83 | e. "License Elements" means the following high-level license attributes 84 | as selected by Licensor and indicated in the title of this License: 85 | Attribution, ShareAlike. 86 | f. "Licensor" means the individual, individuals, entity or entities that 87 | offer(s) the Work under the terms of this License. 88 | g. "Original Author" means, in the case of a literary or artistic work, 89 | the individual, individuals, entity or entities who created the Work 90 | or if no individual or entity can be identified, the publisher; and in 91 | addition (i) in the case of a performance the actors, singers, 92 | musicians, dancers, and other persons who act, sing, deliver, declaim, 93 | play in, interpret or otherwise perform literary or artistic works or 94 | expressions of folklore; (ii) in the case of a phonogram the producer 95 | being the person or legal entity who first fixes the sounds of a 96 | performance or other sounds; and, (iii) in the case of broadcasts, the 97 | organization that transmits the broadcast. 98 | h. 
"Work" means the literary and/or artistic work offered under the terms 99 | of this License including without limitation any production in the 100 | literary, scientific and artistic domain, whatever may be the mode or 101 | form of its expression including digital form, such as a book, 102 | pamphlet and other writing; a lecture, address, sermon or other work 103 | of the same nature; a dramatic or dramatico-musical work; a 104 | choreographic work or entertainment in dumb show; a musical 105 | composition with or without words; a cinematographic work to which are 106 | assimilated works expressed by a process analogous to cinematography; 107 | a work of drawing, painting, architecture, sculpture, engraving or 108 | lithography; a photographic work to which are assimilated works 109 | expressed by a process analogous to photography; a work of applied 110 | art; an illustration, map, plan, sketch or three-dimensional work 111 | relative to geography, topography, architecture or science; a 112 | performance; a broadcast; a phonogram; a compilation of data to the 113 | extent it is protected as a copyrightable work; or a work performed by 114 | a variety or circus performer to the extent it is not otherwise 115 | considered a literary or artistic work. 116 | i. "You" means an individual or entity exercising rights under this 117 | License who has not previously violated the terms of this License with 118 | respect to the Work, or who has received express permission from the 119 | Licensor to exercise rights under this License despite a previous 120 | violation. 121 | j. 
"Publicly Perform" means to perform public recitations of the Work and 122 | to communicate to the public those public recitations, by any means or 123 | process, including by wire or wireless means or public digital 124 | performances; to make available to the public Works in such a way that 125 | members of the public may access these Works from a place and at a 126 | place individually chosen by them; to perform the Work to the public 127 | by any means or process and the communication to the public of the 128 | performances of the Work, including by public digital performance; to 129 | broadcast and rebroadcast the Work by any means including signs, 130 | sounds or images. 131 | k. "Reproduce" means to make copies of the Work by any means including 132 | without limitation by sound or visual recordings and the right of 133 | fixation and reproducing fixations of the Work, including storage of a 134 | protected performance or phonogram in digital form or other electronic 135 | medium. 136 | 137 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce, 138 | limit, or restrict any uses free from copyright or rights arising from 139 | limitations or exceptions that are provided for in connection with the 140 | copyright protection under copyright law or other applicable laws. 141 | 142 | 3. License Grant. Subject to the terms and conditions of this License, 143 | Licensor hereby grants You a worldwide, royalty-free, non-exclusive, 144 | perpetual (for the duration of the applicable copyright) license to 145 | exercise the rights in the Work as stated below: 146 | 147 | a. to Reproduce the Work, to incorporate the Work into one or more 148 | Collections, and to Reproduce the Work as incorporated in the 149 | Collections; 150 | b. 
to create and Reproduce Adaptations provided that any such Adaptation, 151 | including any translation in any medium, takes reasonable steps to 152 | clearly label, demarcate or otherwise identify that changes were made 153 | to the original Work. For example, a translation could be marked "The 154 | original work was translated from English to Spanish," or a 155 | modification could indicate "The original work has been modified."; 156 | c. to Distribute and Publicly Perform the Work including as incorporated 157 | in Collections; and, 158 | d. to Distribute and Publicly Perform Adaptations. 159 | e. For the avoidance of doubt: 160 | 161 | i. Non-waivable Compulsory License Schemes. In those jurisdictions in 162 | which the right to collect royalties through any statutory or 163 | compulsory licensing scheme cannot be waived, the Licensor 164 | reserves the exclusive right to collect such royalties for any 165 | exercise by You of the rights granted under this License; 166 | ii. Waivable Compulsory License Schemes. In those jurisdictions in 167 | which the right to collect royalties through any statutory or 168 | compulsory licensing scheme can be waived, the Licensor waives the 169 | exclusive right to collect such royalties for any exercise by You 170 | of the rights granted under this License; and, 171 | iii. Voluntary License Schemes. The Licensor waives the right to 172 | collect royalties, whether individually or, in the event that the 173 | Licensor is a member of a collecting society that administers 174 | voluntary licensing schemes, via that society, from any exercise 175 | by You of the rights granted under this License. 176 | 177 | The above rights may be exercised in all media and formats whether now 178 | known or hereafter devised. The above rights include the right to make 179 | such modifications as are technically necessary to exercise the rights in 180 | other media and formats. 
Subject to Section 8(f), all rights not expressly 181 | granted by Licensor are hereby reserved. 182 | 183 | 4. Restrictions. The license granted in Section 3 above is expressly made 184 | subject to and limited by the following restrictions: 185 | 186 | a. You may Distribute or Publicly Perform the Work only under the terms 187 | of this License. You must include a copy of, or the Uniform Resource 188 | Identifier (URI) for, this License with every copy of the Work You 189 | Distribute or Publicly Perform. You may not offer or impose any terms 190 | on the Work that restrict the terms of this License or the ability of 191 | the recipient of the Work to exercise the rights granted to that 192 | recipient under the terms of the License. You may not sublicense the 193 | Work. You must keep intact all notices that refer to this License and 194 | to the disclaimer of warranties with every copy of the Work You 195 | Distribute or Publicly Perform. When You Distribute or Publicly 196 | Perform the Work, You may not impose any effective technological 197 | measures on the Work that restrict the ability of a recipient of the 198 | Work from You to exercise the rights granted to that recipient under 199 | the terms of the License. This Section 4(a) applies to the Work as 200 | incorporated in a Collection, but this does not require the Collection 201 | apart from the Work itself to be made subject to the terms of this 202 | License. If You create a Collection, upon notice from any Licensor You 203 | must, to the extent practicable, remove from the Collection any credit 204 | as required by Section 4(c), as requested. If You create an 205 | Adaptation, upon notice from any Licensor You must, to the extent 206 | practicable, remove from the Adaptation any credit as required by 207 | Section 4(c), as requested. 208 | b. 
You may Distribute or Publicly Perform an Adaptation only under the 209 | terms of: (i) this License; (ii) a later version of this License with 210 | the same License Elements as this License; (iii) a Creative Commons 211 | jurisdiction license (either this or a later license version) that 212 | contains the same License Elements as this License (e.g., 213 | Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible 214 | License. If you license the Adaptation under one of the licenses 215 | mentioned in (iv), you must comply with the terms of that license. If 216 | you license the Adaptation under the terms of any of the licenses 217 | mentioned in (i), (ii) or (iii) (the "Applicable License"), you must 218 | comply with the terms of the Applicable License generally and the 219 | following provisions: (I) You must include a copy of, or the URI for, 220 | the Applicable License with every copy of each Adaptation You 221 | Distribute or Publicly Perform; (II) You may not offer or impose any 222 | terms on the Adaptation that restrict the terms of the Applicable 223 | License or the ability of the recipient of the Adaptation to exercise 224 | the rights granted to that recipient under the terms of the Applicable 225 | License; (III) You must keep intact all notices that refer to the 226 | Applicable License and to the disclaimer of warranties with every copy 227 | of the Work as included in the Adaptation You Distribute or Publicly 228 | Perform; (IV) when You Distribute or Publicly Perform the Adaptation, 229 | You may not impose any effective technological measures on the 230 | Adaptation that restrict the ability of a recipient of the Adaptation 231 | from You to exercise the rights granted to that recipient under the 232 | terms of the Applicable License. 
This Section 4(b) applies to the 233 | Adaptation as incorporated in a Collection, but this does not require 234 | the Collection apart from the Adaptation itself to be made subject to 235 | the terms of the Applicable License. 236 | c. If You Distribute, or Publicly Perform the Work or any Adaptations or 237 | Collections, You must, unless a request has been made pursuant to 238 | Section 4(a), keep intact all copyright notices for the Work and 239 | provide, reasonable to the medium or means You are utilizing: (i) the 240 | name of the Original Author (or pseudonym, if applicable) if supplied, 241 | and/or if the Original Author and/or Licensor designate another party 242 | or parties (e.g., a sponsor institute, publishing entity, journal) for 243 | attribution ("Attribution Parties") in Licensor's copyright notice, 244 | terms of service or by other reasonable means, the name of such party 245 | or parties; (ii) the title of the Work if supplied; (iii) to the 246 | extent reasonably practicable, the URI, if any, that Licensor 247 | specifies to be associated with the Work, unless such URI does not 248 | refer to the copyright notice or licensing information for the Work; 249 | and (iv) , consistent with Ssection 3(b), in the case of an 250 | Adaptation, a credit identifying the use of the Work in the Adaptation 251 | (e.g., "French translation of the Work by Original Author," or 252 | "Screenplay based on original Work by Original Author"). The credit 253 | required by this Section 4(c) may be implemented in any reasonable 254 | manner; provided, however, that in the case of a Adaptation or 255 | Collection, at a minimum such credit will appear, if a credit for all 256 | contributing authors of the Adaptation or Collection appears, then as 257 | part of these credits and in a manner at least as prominent as the 258 | credits for the other contributing authors. 
For the avoidance of 259 | doubt, You may only use the credit required by this Section for the 260 | purpose of attribution in the manner set out above and, by exercising 261 | Your rights under this License, You may not implicitly or explicitly 262 | assert or imply any connection with, sponsorship or endorsement by the 263 | Original Author, Licensor and/or Attribution Parties, as appropriate, 264 | of You or Your use of the Work, without the separate, express prior 265 | written permission of the Original Author, Licensor and/or Attribution 266 | Parties. 267 | d. Except as otherwise agreed in writing by the Licensor or as may be 268 | otherwise permitted by applicable law, if You Reproduce, Distribute or 269 | Publicly Perform the Work either by itself or as part of any 270 | Adaptations or Collections, You must not distort, mutilate, modify or 271 | take other derogatory action in relation to the Work which would be 272 | prejudicial to the Original Author's honor or reputation. Licensor 273 | agrees that in those jurisdictions (e.g. Japan), in which any exercise 274 | of the right granted in Section 3(b) of this License (the right to 275 | make Adaptations) would be deemed to be a distortion, mutilation, 276 | modification or other derogatory action prejudicial to the Original 277 | Author's honor and reputation, the Licensor will waive or not assert, 278 | as appropriate, this Section, to the fullest extent permitted by the 279 | applicable national law, to enable You to reasonably exercise Your 280 | right under Section 3(b) of this License (right to make Adaptations) 281 | but not otherwise. 282 | 283 | 5. 
Representations, Warranties and Disclaimer 284 | 285 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR 286 | OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY 287 | KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, 288 | INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, 289 | FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF 290 | LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, 291 | WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION 292 | OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 293 | 294 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE 295 | LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR 296 | ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES 297 | ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS 298 | BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 299 | 300 | 7. Termination 301 | 302 | a. This License and the rights granted hereunder will terminate 303 | automatically upon any breach by You of the terms of this License. 304 | Individuals or entities who have received Adaptations or Collections 305 | from You under this License, however, will not have their licenses 306 | terminated provided such individuals or entities remain in full 307 | compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will 308 | survive any termination of this License. 309 | b. Subject to the above terms and conditions, the license granted here is 310 | perpetual (for the duration of the applicable copyright in the Work). 
311 | Notwithstanding the above, Licensor reserves the right to release the 312 | Work under different license terms or to stop distributing the Work at 313 | any time; provided, however that any such election will not serve to 314 | withdraw this License (or any other license that has been, or is 315 | required to be, granted under the terms of this License), and this 316 | License will continue in full force and effect unless terminated as 317 | stated above. 318 | 319 | 8. Miscellaneous 320 | 321 | a. Each time You Distribute or Publicly Perform the Work or a Collection, 322 | the Licensor offers to the recipient a license to the Work on the same 323 | terms and conditions as the license granted to You under this License. 324 | b. Each time You Distribute or Publicly Perform an Adaptation, Licensor 325 | offers to the recipient a license to the original Work on the same 326 | terms and conditions as the license granted to You under this License. 327 | c. If any provision of this License is invalid or unenforceable under 328 | applicable law, it shall not affect the validity or enforceability of 329 | the remainder of the terms of this License, and without further action 330 | by the parties to this agreement, such provision shall be reformed to 331 | the minimum extent necessary to make such provision valid and 332 | enforceable. 333 | d. No term or provision of this License shall be deemed waived and no 334 | breach consented to unless such waiver or consent shall be in writing 335 | and signed by the party to be charged with such waiver or consent. 336 | e. This License constitutes the entire agreement between the parties with 337 | respect to the Work licensed here. There are no understandings, 338 | agreements or representations with respect to the Work not specified 339 | here. Licensor shall not be bound by any additional provisions that 340 | may appear in any communication from You. 
This License may not be 341 | modified without the mutual written agreement of the Licensor and You. 342 | f. The rights granted under, and the subject matter referenced, in this 343 | License were drafted utilizing the terminology of the Berne Convention 344 | for the Protection of Literary and Artistic Works (as amended on 345 | September 28, 1979), the Rome Convention of 1961, the WIPO Copyright 346 | Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 347 | and the Universal Copyright Convention (as revised on July 24, 1971). 348 | These rights and subject matter take effect in the relevant 349 | jurisdiction in which the License terms are sought to be enforced 350 | according to the corresponding provisions of the implementation of 351 | those treaty provisions in the applicable national law. If the 352 | standard suite of rights granted under applicable copyright law 353 | includes additional rights not granted under this License, such 354 | additional rights are deemed to be included in the License; this 355 | License is not intended to restrict the license of any rights under 356 | applicable law. 357 | 358 | 359 | === Unihan === 360 | 361 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 362 | 363 | Unicode Data Files include all data files under the directories http://www.unicode.org/Public/, 364 | http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/. Unicode Data Files do not 365 | include PDF online code charts under the directory http://www.unicode.org/Public/. Software includes 366 | any source code published in the Unicode Standard or under the directories 367 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, and 368 | http://www.unicode.org/cldr/data/. 369 | 370 | NOTICE TO USER: Carefully read the following legal agreement. 
BY DOWNLOADING, INSTALLING, COPYING OR 371 | OTHERWISE USING UNICODE INC.'S DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU 372 | UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. 373 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. 374 | 375 | COPYRIGHT AND PERMISSION NOTICE 376 | 377 | Copyright © 1991-2012 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in 378 | http://www.unicode.org/copyright.html. 379 | 380 | Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data 381 | files and any associated documentation (the "Data Files") or Unicode software and any associated 382 | documentation (the "Software") to deal in the Data Files or Software without restriction, including 383 | without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies 384 | of the Data Files or Software, and to permit persons to whom the Data Files or Software are 385 | furnished to do so, provided that (a) the above copyright notice(s) and this permission notice 386 | appear with all copies of the Data Files or Software, (b) both the above copyright notice(s) and 387 | this permission notice appear in associated documentation, and (c) there is clear notice in each 388 | modified Data File or in the Software as well as in the documentation associated with the Data 389 | File(s) or Software that the data or software has been modified. 390 | 391 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 392 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 393 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN 394 | THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY 395 | DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 396 | NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 397 | THE DATA FILES OR SOFTWARE. 398 | 399 | Except as contained in this notice, the name of a copyright holder shall not be used in advertising 400 | or otherwise to promote the sale, use or other dealings in these Data Files or Software without 401 | prior written authorization of the copyright holder. 402 | --------------------------------------------------------------------------------