{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
-- | This module deals with Chinese characters and Sino-Korean words written in
-- hanja.
module Text.Seonbi.Hanja
    ( -- * Korean mixed-script (國漢文混用) transformation
      HanjaPhoneticization (..)
    , def
    , phoneticizeHanja
      -- * Single character phoneticization
    , phoneticizeHanjaChar
      -- * Word phoneticization
    , HanjaDictionary
    , HanjaWordPhoneticizer
    , phoneticizeHanjaWord
    , phoneticizeHanjaWordWithInitialSoundLaw
    , withDictionary
      -- * Word rendering
    , HanjaWordRenderer
    , hangulOnly
    , hanjaInParentheses
    , hanjaInRuby
      -- * Initial sound law (頭音法則)
    , convertInitialSoundLaw
    , initialSoundLawTable
    , initialSoundLawTable'
    , revertInitialSoundLaw
    ) where

import Prelude hiding (lookup)

import Control.Applicative
import Control.Monad
import Data.Char
import Data.List hiding (lookup)
import Data.Maybe
import Data.Ord (comparing)

import Data.Attoparsec.Text
import Data.Default
import Data.Map.Strict
import Data.Set
import Data.Text hiding (concatMap)

import Text.Seonbi.Hangul
import Text.Seonbi.Html
import Text.Seonbi.Html.Lang
import Text.Seonbi.Html.Preservation
import Text.Seonbi.Html.TagStack (push)
import qualified Text.Seonbi.Trie as Trie
import Text.Seonbi.Unihan.KHangul

-- $setup
-- >>> import qualified Text.Show.Unicode
-- >>> :set -interactive-print=Text.Show.Unicode.uprint

-- | Settings to transform Sino-Korean words written in hanja into hangul
-- letters.
data HanjaPhoneticization = HanjaPhoneticization
    { -- | A function to phoneticize a hanja word.
      -- Use 'phoneticizeHanjaWordWithInitialSoundLaw' for South Korean
      -- orthography, or 'phoneticizeHanjaWord' for North Korean orthography.
      HanjaPhoneticization -> HanjaWordPhoneticizer
phoneticizer :: HanjaWordPhoneticizer
      -- | A function to render a hanja word.  See also 'HanjaWordRenderer'.
    , HanjaPhoneticization -> HanjaWordRenderer
wordRenderer :: HanjaWordRenderer
      -- | A function to render a hanja word which should be disambiguated.
      -- It's used instead of 'wordRenderer' when two or more words in
      -- a text have the same hangul reading but actually are dictinct
      -- each other in hanja characters, e.g., 小數\/素數 (소수).
    , HanjaPhoneticization -> HanjaWordRenderer
homophoneRenderer :: HanjaWordRenderer
      -- | Whether to insert some HTML comments that contain useful information
      -- for debugging into the result.  This does not affect the rendering
      -- of the result HTML, but only the HTML code.
    , HanjaPhoneticization -> Bool
debugComment :: Bool
    }

-- | A function to phoneticize a Sino-Korean (i.e., hanja) word (漢字語)
-- into hangul letters.
-- See also 'phoneticizeHanjaWord', 'phoneticizeHanjaWordWithInitialSoundLaw',
-- and 'withDictionary'.
type HanjaWordPhoneticizer
    = Text  -- ^ A Sino-Korean (i.e., hanja) word (漢字語) to phoneticize.
    -> Text -- ^ Hangul letters that phoneticize the given Sino-Korean word.

-- | A function to render a Sino-Korean (i.e., hanja) word (漢字語).
-- Choose one in 'hangulOnly', 'hanjaInParentheses', and 'hanjaInRuby'.
type HanjaWordRenderer
    = HtmlTagStack
    -- ^ Where rendered HTML entities get interleaved into.
    -> Text
    -- ^ A Sino-Korean (i.e., hanja) word (漢字語) to render.
    -> Text
    -- ^ Hangul letters that phoneticized the Sino-Korean word.
    -> [HtmlEntity]
    -- ^ Rendered HTML entities.

-- | Renders a word in hangul-only, no hanja at all (e.g., @안녕히@).
hangulOnly :: HanjaWordRenderer
hangulOnly :: HanjaWordRenderer
hangulOnly HtmlTagStack
stack Text
_ Text
hangul = [HtmlTagStack -> Text -> HtmlEntity
HtmlCdata HtmlTagStack
stack Text
hangul]

-- | Renders a word in hangul followed by hanja in parentheses
-- (e.g., @안녕(安寧)히@).
hanjaInParentheses :: HanjaWordRenderer
hanjaInParentheses :: HanjaWordRenderer
hanjaInParentheses HtmlTagStack
stack Text
hanja Text
hangul =
    [HtmlTagStack -> Text -> HtmlEntity
HtmlCdata HtmlTagStack
stack forall a b. (a -> b) -> a -> b
$ [Text] -> Text
Data.Text.concat [Text
hangul, Text
"(", Text
hanja, Text
")"]]

-- | Renders a word in @<ruby>@ tag (e.g.,
-- @\<ruby\>安寧\<rp\>(\<\/rp\>\<rt\>안녕\<\/rt\>\<rp\>)\<\/rp\>\<\/ruby\>히@).
--
-- Please read [Use Cases & Exploratory Approaches for Ruby
-- Markup](https://www.w3.org/TR/ruby-use-cases/) as well for more information.
hanjaInRuby :: HanjaWordRenderer
hanjaInRuby :: HanjaWordRenderer
hanjaInRuby HtmlTagStack
stack Text
hanja Text
hangul =
    [ HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
stack HtmlTag
Ruby Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlCdata HtmlTagStack
rubyStack Text
hanja
    , HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
rubyStack HtmlTag
RP Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlText (HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
RP HtmlTagStack
rubyStack) Text
"("
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
rubyStack HtmlTag
RP
    , HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
rubyStack HtmlTag
RT Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlCdata (HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
RT HtmlTagStack
rubyStack) Text
hangul
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
rubyStack HtmlTag
RT
    , HtmlTagStack -> HtmlTag -> Text -> HtmlEntity
HtmlStartTag HtmlTagStack
rubyStack HtmlTag
RP Text
""
    , HtmlTagStack -> Text -> HtmlEntity
HtmlText (HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
RP HtmlTagStack
rubyStack) Text
")"
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
rubyStack HtmlTag
RP
    , HtmlTagStack -> HtmlTag -> HtmlEntity
HtmlEndTag HtmlTagStack
stack HtmlTag
Ruby
    ]
  where
    rubyStack :: HtmlTagStack
    rubyStack :: HtmlTagStack
rubyStack = HtmlTag -> HtmlTagStack -> HtmlTagStack
push HtmlTag
Ruby HtmlTagStack
stack

instance Default HanjaPhoneticization where
    def :: HanjaPhoneticization
def = HanjaPhoneticization
        { phoneticizer :: HanjaWordPhoneticizer
phoneticizer = HanjaWordPhoneticizer
phoneticizeHanjaWordWithInitialSoundLaw
        , wordRenderer :: HanjaWordRenderer
wordRenderer = HanjaWordRenderer
hangulOnly
        , homophoneRenderer :: HanjaWordRenderer
homophoneRenderer = HanjaWordRenderer
hanjaInParentheses
        , debugComment :: Bool
debugComment = Bool
False
        }

-- | Transforms hanja words in the given HTML entities into corresponding
-- hangul words.
phoneticizeHanja
    :: HanjaPhoneticization
    -- ^ Configures the phoneticization details.
    -> [HtmlEntity]
    -- ^ HTML entities (that may contain some hanja words) to phoneticize
    -- all hanja words into corresponding hangul-only words.
    -> [HtmlEntity]
    -- ^ HTML entities that have no hanja words but hangul-only words instead.
phoneticizeHanja :: HanjaPhoneticization -> [HtmlEntity] -> [HtmlEntity]
phoneticizeHanja HanjaPhoneticization { HanjaWordPhoneticizer
phoneticizer :: HanjaWordPhoneticizer
phoneticizer :: HanjaPhoneticization -> HanjaWordPhoneticizer
phoneticizer
                                      , HanjaWordRenderer
wordRenderer :: HanjaWordRenderer
wordRenderer :: HanjaPhoneticization -> HanjaWordRenderer
wordRenderer
                                      , HanjaWordRenderer
homophoneRenderer :: HanjaWordRenderer
homophoneRenderer :: HanjaPhoneticization -> HanjaWordRenderer
homophoneRenderer
                                      , Bool
debugComment :: Bool
debugComment :: HanjaPhoneticization -> Bool
debugComment
                                      }
                 [HtmlEntity]
entities =
    (forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
`concatMap` [Either HtmlEntity (HtmlTagStack, Text, Text)]
normalized) forall a b. (a -> b) -> a -> b
$ \ case
        Left HtmlEntity
e' ->
            [HtmlEntity
e']
        Right (HtmlTagStack
stack, Text
hanja, Text
hangul) ->
            if forall a. Set a -> Int
Data.Set.size (forall k a. Ord k => a -> k -> Map k a -> a
findWithDefault [] Text
hangul Map Text (Set Text)
frequencyDict) forall a. Ord a => a -> a -> Bool
> Int
1
            then HanjaWordRenderer
homophoneRenderer' HtmlTagStack
stack Text
hanja Text
hangul
            else HanjaWordRenderer
wordRenderer' HtmlTagStack
stack Text
hanja Text
hangul
  where
    frequencyDict :: Map Text (Set Text)
    frequencyDict :: Map Text (Set Text)
frequencyDict = forall k a. Ord k => (a -> a -> a) -> [(k, a)] -> Map k a
Data.Map.Strict.fromListWith
        forall a. Ord a => Set a -> Set a -> Set a
Data.Set.union
        [(Text
hangul, [Text
hanja]) | Right (HtmlTagStack
_, Text
hanja, Text
hangul) <- [Either HtmlEntity (HtmlTagStack, Text, Text)]
normalized]
    normalized :: [Either HtmlEntity (HtmlTagStack, Text, Text)]
    normalized :: [Either HtmlEntity (HtmlTagStack, Text, Text)]
normalized = forall (t :: * -> *) a. Foldable t => t [a] -> [a]
Data.List.concat
        [ case Either HtmlEntity (HtmlTagStack, Text, Text)
e of
            Left HtmlEntity
_ ->
                [Either HtmlEntity (HtmlTagStack, Text, Text)
e]
            Right (HtmlTagStack
stack, Text
hanja, Text
hangul) ->
                let hanjaWords :: [Text]
hanjaWords = Text -> [Text]
splitByDigits Text
hanja
                    hangulWords :: [Text]
hangulWords = Text -> [Text]
splitByDigits Text
hangul
                    hanjaWordsLen :: Int
hanjaWordsLen = forall (t :: * -> *) a. Foldable t => t a -> Int
Prelude.length [Text]
hanjaWords
                    hangulWordsLen :: Int
hangulWordsLen = forall (t :: * -> *) a. Foldable t => t a -> Int
Prelude.length [Text]
hangulWords
                in
                    if Int
hanjaWordsLen forall a. Eq a => a -> a -> Bool
/= Int
hangulWordsLen
                    then [Either HtmlEntity (HtmlTagStack, Text, Text)
e]
                    else
                        [ if (Char -> Bool) -> Text -> Bool
Data.Text.any Char -> Bool
isDigit Text
hanj
                          then forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ HtmlTagStack -> Text -> HtmlEntity
HtmlText HtmlTagStack
stack Text
hanj
                          else forall a b. b -> Either a b
Right (HtmlTagStack
stack, Text
hanj, Text
hang)
                        | (Text
hanj, Text
hang) <- forall a b. [a] -> [b] -> [(a, b)]
Prelude.zip [Text]
hanjaWords [Text]
hangulWords
                        ]
        | Either HtmlEntity (HtmlTagStack, Text, Text)
e <- forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap LangHtmlEntity -> [Either HtmlEntity (HtmlTagStack, Text, Text)]
transform ([HtmlEntity] -> [LangHtmlEntity]
annotateWithLang [HtmlEntity]
entities)
        ]
    splitByDigits :: Text -> [Text]
    splitByDigits :: Text -> [Text]
splitByDigits = (Char -> Char -> Bool) -> Text -> [Text]
Data.Text.groupBy (\ Char
a Char
b -> Char -> Bool
isDigit Char
a forall a. Eq a => a -> a -> Bool
== Char -> Bool
isDigit Char
b)
    transform :: LangHtmlEntity
              -> [Either HtmlEntity (HtmlTagStack, Text, Text)]
    transform :: LangHtmlEntity -> [Either HtmlEntity (HtmlTagStack, Text, Text)]
transform LangHtmlEntity
                { lang :: LangHtmlEntity -> Maybe Text
lang = Maybe Text
lang
                , entity :: LangHtmlEntity -> HtmlEntity
entity = entity :: HtmlEntity
entity@HtmlText
                    { tagStack :: HtmlEntity -> HtmlTagStack
tagStack = HtmlTagStack
tagStack'
                    , rawText :: HtmlEntity -> Text
rawText = Text
rawText'
                    }
                }
      | HtmlTagStack -> Bool
isPreservedTagStack HtmlTagStack
tagStack' Bool -> Bool -> Bool
|| Maybe Text -> Bool
isNeverKorean Maybe Text
lang =
            [forall a b. a -> Either a b
Left HtmlEntity
entity]
      | Bool
otherwise =
            case Text -> Maybe [(Bool, Text)]
analyzeHanjaText Text
rawText' of
                Maybe [(Bool, Text)]
Nothing -> [forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ HtmlEntity
entity { rawText :: Text
rawText = Text
rawText' }]
                Just [(Bool, Text)]
pairs ->
                    [ if Bool
trueIfHanja
                      then forall a b. b -> Either a b
Right (HtmlTagStack
tagStack', Text
htmlText, HanjaWordPhoneticizer
phoneticizer Text
htmlText)
                      else forall a b. a -> Either a b
Left (HtmlEntity
entity { rawText :: Text
rawText = Text
htmlText })
                    -- Note that htmlText here can have HTML entities.
                    | (Bool
trueIfHanja, Text
htmlText) <- [(Bool, Text)]
pairs
                    ]
    transform LangHtmlEntity { HtmlEntity
entity :: HtmlEntity
entity :: LangHtmlEntity -> HtmlEntity
entity } =
        [forall a b. a -> Either a b
Left HtmlEntity
entity]
    -- FIXME: This should be public:
    debugRenderer :: HanjaWordRenderer -> HanjaWordRenderer
    debugRenderer :: HanjaWordRenderer -> HanjaWordRenderer
debugRenderer HanjaWordRenderer
render HtmlTagStack
stack Text
hanja Text
hangul =
        HtmlTagStack -> Text -> HtmlEntity
HtmlComment HtmlTagStack
stack (Text
" Hanja: " Text -> HanjaWordPhoneticizer
`append` Text
hanja)
            forall a. a -> [a] -> [a]
: HanjaWordRenderer
render HtmlTagStack
stack Text
hanja Text
hangul forall a. [a] -> [a] -> [a]
++ [HtmlTagStack -> Text -> HtmlEntity
HtmlComment HtmlTagStack
stack Text
" /Hanja "]
    wordRenderer' :: HanjaWordRenderer
    wordRenderer' :: HanjaWordRenderer
wordRenderer'
      | Bool
debugComment = HanjaWordRenderer -> HanjaWordRenderer
debugRenderer HanjaWordRenderer
wordRenderer
      | Bool
otherwise = HanjaWordRenderer
wordRenderer
    homophoneRenderer' :: HanjaWordRenderer
    homophoneRenderer' :: HanjaWordRenderer
homophoneRenderer'
      | Bool
debugComment = HanjaWordRenderer -> HanjaWordRenderer
debugRenderer HanjaWordRenderer
homophoneRenderer
      | Bool
otherwise = HanjaWordRenderer
homophoneRenderer

analyzeHanjaText :: Text -> Maybe [(Bool, Text)]
analyzeHanjaText :: Text -> Maybe [(Bool, Text)]
analyzeHanjaText Text
text' =
    case forall a. Parser a -> Text -> Either [Char] a
parseOnly (Parser Text [(Bool, Text)]
textParser forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall t. Chunk t => Parser t ()
endOfInput) Text
text' of
        Left [Char]
_ -> forall a. Maybe a
Nothing
        Right [(Bool, Text)]
pairs -> forall a. a -> Maybe a
Just
            [ (Bool
trueIfHanja, Text
text)
            | (Bool
trueIfHanja, Text
text) <- [(Bool, Text)]
pairs
            , Bool -> Bool
not (Text -> Bool
Data.Text.null Text
text)
            ]

-- | Reads a hanja word and returns a corresponding hangul word.
--
-- >>> :set -XOverloadedStrings
-- >>> phoneticizeHanjaWord "漢字"
-- "한자"
--
-- Note that it does not apply Initial Sound Law (頭音法則):
--
-- >>> phoneticizeHanjaWord  "來日"
-- "래일"
phoneticizeHanjaWord :: HanjaWordPhoneticizer
phoneticizeHanjaWord :: HanjaWordPhoneticizer
phoneticizeHanjaWord =
    (Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
phoneticizeHanjaChar

-- | It is like 'phoneticizeHanjaWord', but it also applies
-- Initial Sound Law (頭音法則).
--
-- >>> :set -XOverloadedStrings
-- >>> phoneticizeHanjaWordWithInitialSoundLaw  "來日"
-- "내일"
-- >>> phoneticizeHanjaWordWithInitialSoundLaw  "未來"
-- "미래"
phoneticizeHanjaWordWithInitialSoundLaw :: HanjaWordPhoneticizer
phoneticizeHanjaWordWithInitialSoundLaw :: HanjaWordPhoneticizer
phoneticizeHanjaWordWithInitialSoundLaw Text
word =
    case forall a. Parser a -> Text -> Either [Char] a
parseOnly (Parser Text Text
parser forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall t. Chunk t => Parser t ()
endOfInput) Text
word of
        Left [Char]
_ -> Text
word
        Right Text
"" -> Text
word
        Right Text
hangulWord -> Text
hangulWord
  where
    parser :: Parser Text
    parser :: Parser Text Text
parser = do
        [Text]
chars <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many'
            ( forall i a. Parser i a -> Parser i a
try Parser Text Text
yeolYul
            forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall i a. Parser i a -> Parser i a
try Parser Text Text
prefixedNumber
            forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall i a. Parser i a -> Parser i a
try Parser Text Text
hanNumber
            forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall i a. Parser i a -> Parser i a
try (Char -> Text
Data.Text.singleton forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticize forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Char
anyChar)
            )
        let hangulWord :: Text
hangulWord = [Text] -> Text
Data.Text.concat [Text]
chars
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ [Text] -> Text
Data.Text.concat
            [ (Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
convertInitialSoundLaw forall a b. (a -> b) -> a -> b
$ Int -> HanjaWordPhoneticizer
Data.Text.take Int
1 Text
hangulWord
            , Int -> HanjaWordPhoneticizer
Data.Text.drop Int
1 Text
hangulWord
            ]
    yeolYul :: Parser Text
    yeolYul :: Parser Text Text
yeolYul = do
        Char
former <- (Char -> Bool) -> Parser Char
satisfy forall a b. (a -> b) -> a -> b
$ \ Char
c ->
            Char
c Char -> Maybe Char -> Bool
`hasBatchim` forall a. a -> Maybe a
Just Char
'\x11ab' Bool -> Bool -> Bool
|| Char
c Char -> Maybe Char -> Bool
`hasBatchim` forall a. Maybe a
Nothing
        Char
later <- Char -> Parser Char
phone Char
'렬' forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Char
phone Char
'률'
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ [Char] -> Text
pack
            [ Char -> Char
phoneticize Char
former
            , Char -> Char
convert Char
later
            ]
    prefixedNumber :: Parser Text
    prefixedNumber :: Parser Text Text
prefixedNumber = do
        Char
prefix <- Char -> Parser Char
char Char
'第'
        Text
digits <- (Char -> Bool) -> Parser Text Text
takeWhile1 Char -> Bool
isHanDigit
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Char -> HanjaWordPhoneticizer
Data.Text.cons
            (Char -> Char
phoneticize Char
prefix)
            ((Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
convertDigit Text
digits)
    hanNumber :: Parser Text
    hanNumber :: Parser Text Text
hanNumber = do
        Char
first <- Parser Char
hanDigit
        Text
rest <- (Char -> Bool) -> Parser Text Text
takeWhile1 Char -> Bool
isHanDigit
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ (Char -> Char) -> HanjaWordPhoneticizer
Data.Text.map Char -> Char
convertDigit forall a b. (a -> b) -> a -> b
$ Char -> HanjaWordPhoneticizer
Data.Text.cons Char
first Text
rest
    hanDigit :: Parser Char
    hanDigit :: Parser Char
hanDigit = (Char -> Bool) -> Parser Char
satisfy Char -> Bool
isHanDigit
    phone :: Char -> Parser Char
    phone :: Char -> Parser Char
phone Char
hangul = (Char -> Bool) -> Parser Char
satisfy ((forall a. Eq a => a -> a -> Bool
== Char
hangul) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticize)
    convertDigit :: Char -> Char
    convertDigit :: Char -> Char
convertDigit = Char -> Char
convertInitialSoundLaw forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticizeDigit
    convert :: Char -> Char
    convert :: Char -> Char
convert = Char -> Char
convertInitialSoundLaw forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Char
phoneticize
    phoneticizeDigit :: Char -> Char
    phoneticizeDigit :: Char -> Char
phoneticizeDigit Char
'參' = Char
'삼'
    phoneticizeDigit Char
'叁' = Char
'삼'
    phoneticizeDigit Char
'参' = Char
'삼'
    phoneticizeDigit Char
'叄' = Char
'삼'
    phoneticizeDigit Char
'拾' = Char
'십'
    phoneticizeDigit Char
c = Char -> Char
phoneticize Char
c
    phoneticize :: Char -> Char
    phoneticize :: Char -> Char
phoneticize = Char -> Char
phoneticizeHanjaChar
    hasBatchim :: Char -> Maybe Char -> Bool
    hasBatchim :: Char -> Maybe Char -> Bool
hasBatchim Char
c Maybe Char
batchim =
        case Char -> Maybe JamoTriple
toJamoTriple (Char -> Char
phoneticize Char
c) of
            Just (Char
_, Char
_, Maybe Char
final) -> Maybe Char
final forall a. Eq a => a -> a -> Bool
== Maybe Char
batchim
            Maybe JamoTriple
_ -> Bool
False
    isHanDigit :: Char -> Bool
    isHanDigit :: Char -> Bool
isHanDigit = [Char] -> Char -> Bool
inClass forall a b. (a -> b) -> a -> b
$
        [Char]
"零一壹壱弌夁二貳贰弐弍貮三參叁参弎叄四肆䦉五伍六陸陆陸七柒漆八捌" forall a. [a] -> [a] -> [a]
++
        [Char]
"九玖十拾百佰陌千仟阡萬万億兆京垓秭穰溝澗"

-- | Represents a dictionary that has hanja keys and values of their
-- corresponding hangul readings, e.g., @[("敗北", "패배")]@.
type HanjaDictionary = Trie.Trie Text

-- | Reads a hanja word according to the given dictionary, or falls back to
-- the other phoneticizer if there is no such word in the dictionary.
--
-- It's basically replace one with one:
--
-- >>> :set -XOverloadedLists -XOverloadedStrings
-- >>> let phone = withDictionary [("自轉車", "자전거")] phoneticizeHanjaWord
-- >>> phone "自轉車"
-- "자전거"
--
-- But, if it faces any words or characters that are not registered in
-- the dictionary, it does the best to interpolate prefixes\/infixes\/suffixes
-- using the fallback phoneticizer:
--
-- >>> phone "自轉車道路"
-- "자전거도로"
-- >>> phone "二輪自轉車"
-- "이륜자전거"
withDictionary
    :: HanjaDictionary
    -- ^ Hangul readings of Sino-Korean words.
    -> HanjaWordPhoneticizer
    -- ^ A fallback phoneticize for unregistered words.
    -- E.g., 'phoneticizeHanjaWordWithInitialSoundLaw'.
    -> HanjaWordPhoneticizer
    -- ^ A combined phoneticizer.
withDictionary :: HanjaDictionary -> HanjaWordPhoneticizer -> HanjaWordPhoneticizer
withDictionary HanjaDictionary
_ HanjaWordPhoneticizer
_ Text
"" = Text
""
withDictionary HanjaDictionary
dic HanjaWordPhoneticizer
fallback Text
word =
    case [(Text, Text)]
matches of
        [] ->
            HanjaWordPhoneticizer
fallback Text
word
        (Text
replaced, Text
rest) : [(Text, Text)]
_ ->
            if Text -> Bool
Data.Text.null Text
rest
            then Text
replaced
            else Text
replaced Text -> HanjaWordPhoneticizer
`append` HanjaDictionary -> HanjaWordPhoneticizer -> HanjaWordPhoneticizer
withDictionary HanjaDictionary
dic HanjaWordPhoneticizer
fallback Text
rest
  where
    lookupDic :: Text -> Maybe Text
    lookupDic :: Text -> Maybe Text
lookupDic = (forall a. Text -> Trie a -> Maybe a
`Trie.lookup` HanjaDictionary
dic)
    tries :: [(Text, Text)]
    tries :: [(Text, Text)]
tries =
        [Int -> Text -> (Text, Text)
Data.Text.splitAt Int
pos Text
word | Int
pos <- [Int
0..Text -> Int
Data.Text.length Text
word]]
    patterns :: Text -> [Text]
    patterns :: Text -> [Text]
patterns Text
word' =
        Text
word' forall a. a -> [a] -> [a]
: case Text -> Maybe (Text, Char)
unsnoc Text
word' of
            Just (Text
next, Char
_) -> Text -> [Text]
patterns Text
next
            Maybe (Text, Char)
Nothing -> []
    matchTries :: [Maybe (Text, Text, Text)]
    matchTries :: [Maybe (Text, Text, Text)]
matchTries =
        (forall a b. (a -> b) -> [a] -> [b]
`Prelude.map` [(Text, Text)]
tries) forall a b. (a -> b) -> a -> b
$ \ (Text
unmatched, Text
wd) ->
            case [(Text
p, Text
m) | Text
p <- Text -> [Text]
patterns Text
wd, Text
m <- forall a. Maybe a -> [a]
maybeToList (Text -> Maybe Text
lookupDic Text
p)] of
                [] -> forall a. Maybe a
Nothing
                (Text, Text)
pair : [(Text, Text)]
_ -> forall a. a -> Maybe a
Just
                    ( Text
unmatched
                    , forall a b. (a, b) -> b
snd (Text, Text)
pair
                    , Int -> HanjaWordPhoneticizer
Data.Text.drop (Text -> Int
Data.Text.length forall a b. (a -> b) -> a -> b
$ forall a b. (a, b) -> a
fst (Text, Text)
pair) Text
wd
                    )
    matches :: [(Text, Text)]
    matches :: [(Text, Text)]
matches =
        [ (HanjaWordPhoneticizer
fallback Text
unmatched Text -> HanjaWordPhoneticizer
`append` Text
matched, Text
rest)
        | Just (Text
unmatched, Text
matched, Text
rest) <- [Maybe (Text, Text, Text)]
matchTries
        ]

-- | Reads a hanja character as a hangul character.
--
-- >>> phoneticizeHanjaChar '漢'
-- '한'
--
-- Note that it does not follow Initial Sound Law (頭音法則):
--
-- >>> phoneticizeHanjaChar '六'
-- '륙'
phoneticizeHanjaChar :: Char -> Char
phoneticizeHanjaChar :: Char -> Char
phoneticizeHanjaChar Char
c = forall a. a -> Maybe a -> a
fromMaybe Char
c forall a b. (a -> b) -> a -> b
$ do
    HanjaReadings
readings <- forall k a. Ord k => k -> Map k a -> Maybe a
lookup Char
c KHangulData
kHangulData
    let readings' :: [(Char, HanjaReadingCitation)]
readings' = forall k a. Map k a -> [(k, a)]
Data.Map.Strict.toList HanjaReadings
readings
    let (Char
sound, HanjaReadingCitation
_) = forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
minimumBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing forall a b. (a, b) -> b
snd) [(Char, HanjaReadingCitation)]
readings'
    let initialLawReverted :: Set Char
initialLawReverted = forall a. (a -> Bool) -> Set a -> Set a
Data.Set.filter
            (forall k a. Ord k => k -> Map k a -> Bool
`Data.Map.Strict.member` HanjaReadings
readings)
            (Char -> Set Char
revertInitialSoundLaw Char
sound)
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case forall a. Set a -> [a]
Data.Set.toList Set Char
initialLawReverted of
        [] -> Char
sound
        Char
reverted : [Char]
_ -> Char
reverted

withoutBatchim :: Char -> Maybe (Char, Maybe Char)
withoutBatchim :: Char -> Maybe (Char, Maybe Char)
withoutBatchim Char
hangul = do
    (Char
initial, Char
vowel, Maybe Char
final) <- Char -> Maybe JamoTriple
toJamoTriple Char
hangul
    Char
noBatchim <- JamoTriple -> Maybe Char
fromJamoTriple (Char
initial, Char
vowel, forall a. Maybe a
Nothing)
    forall (m :: * -> *) a. Monad m => a -> m a
return (Char
noBatchim, Maybe Char
final)

withBatchim :: Char -> Maybe Char -> Maybe Char
withBatchim :: Char -> Maybe Char -> Maybe Char
withBatchim Char
hangul Maybe Char
final = do
    (Char
initial, Char
vowel, Maybe Char
_) <- Char -> Maybe JamoTriple
toJamoTriple Char
hangul
    JamoTriple -> Maybe Char
fromJamoTriple (Char
initial, Char
vowel, Maybe Char
final)

-- | Converts a hangul character according to Initial Sound Law (頭音法則).
--
-- >>> convertInitialSoundLaw '념'
-- '염'
--
-- If an input is not a hangul syllable or a syllable is not applicable to
-- the law it returns the given input without change:
--
-- >>> convertInitialSoundLaw 'A'
-- 'A'
-- >>> convertInitialSoundLaw '가'
-- '가'
convertInitialSoundLaw :: Char -> Char
convertInitialSoundLaw :: Char -> Char
convertInitialSoundLaw Char
sound = forall a. a -> Maybe a -> a
fromMaybe Char
sound forall a b. (a -> b) -> a -> b
$ do
    (Char
pattern', Maybe Char
final) <- Char -> Maybe (Char, Maybe Char)
withoutBatchim Char
sound
    let converted :: Char
converted = forall k a. Ord k => a -> k -> Map k a -> a
findWithDefault Char
pattern' Char
pattern' Map Char Char
initialSoundLawTable
    Char -> Maybe Char -> Maybe Char
withBatchim Char
converted Maybe Char
final

-- | It's a kind of inverse function of 'convertInitialSoundLaw',
-- except it returns a set of candidates instead of a single canonical answer
-- because Initial Sound Law (頭音法則) is not a bijective function.
--
-- >>> revertInitialSoundLaw '예'
-- fromList "례"
-- >>> revertInitialSoundLaw '염'
-- fromList "념렴"
--
-- It returns an empty set if an input is not applicable to the law:
--
-- >>> revertInitialSoundLaw '가'
-- fromList ""
revertInitialSoundLaw :: Char -> Set Char
revertInitialSoundLaw :: Char -> Set Char
revertInitialSoundLaw Char
sound = forall a. a -> Maybe a -> a
fromMaybe forall a. Set a
Data.Set.empty forall a b. (a -> b) -> a -> b
$ do
    (Char
pattern', Maybe Char
final) <- Char -> Maybe (Char, Maybe Char)
withoutBatchim Char
sound
    let candidates :: [Char]
candidates = forall a. Set a -> [a]
Data.Set.toList forall a b. (a -> b) -> a -> b
$
            forall k a. Ord k => a -> k -> Map k a -> a
findWithDefault forall a. Set a
Data.Set.empty Char
pattern' Map Char (Set Char)
initialSoundLawTable'
    forall a. a -> Maybe a
Just forall a b. (a -> b) -> a -> b
$ forall a. Ord a => [a] -> Set a
Data.Set.fromList forall a b. (a -> b) -> a -> b
$ forall a. [Maybe a] -> [a]
catMaybes forall a b. (a -> b) -> a -> b
$ [Char]
candidates forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> (Char -> Maybe Char -> Maybe Char
`withBatchim` Maybe Char
final)
  where
    (<&>) :: Functor f => f a -> (a -> b) -> f b
    <&> :: forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
(<&>) = forall a b c. (a -> b -> c) -> b -> a -> c
flip forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap

textParser :: Parser [(Bool, Text)]
textParser :: Parser Text [(Bool, Text)]
textParser = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall (t :: * -> *) a. Foldable t => t [a] -> [a]
Data.List.concat forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ do
    -- We have 3 passes to optimize by utilizing takeWhile instead of many'
    [Char]
hanjaEntities <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall i a. Parser i a -> Parser i a
try forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Char
unnamedCharRef
        forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Char -> Bool
isHanjaOrDigit Char
c) (forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail [Char]
"not a hanja")
        forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    Text
hanjaCharsText <- (Char -> Bool) -> Parser Text Text
Data.Attoparsec.Text.takeWhile Char -> Bool
isHanjaOrDigit
    [Char]
hanjaChars <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall i a. Parser i a -> Parser i a
try forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Char
unnamedCharRef forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Char
anyChar
        forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Char -> Bool
isHanjaOrDigit Char
c) (forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail [Char]
"not a hanja")
        forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    -- Note that the parsed result can still have HTML entities; these
    -- are never touched.
    [Char]
entities <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall i a. Parser i a -> Parser i a
try forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Char
unnamedCharRef
        forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Char -> Bool
isHanjaOrDigit Char
c) (forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail [Char]
"a hanja")
        forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    Text
charsText <- (Char -> Bool) -> Parser Text Text
takeTill Char -> Bool
isHanjaOrDigit
    [Char]
chars <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall i a. Parser i a -> Parser i a
try forall a b. (a -> b) -> a -> b
$ do
        Char
c <- Parser Char
unnamedCharRef forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Char
anyChar
        forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Char -> Bool
isHanjaOrDigit Char
c) (forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail [Char]
"a hanja")
        forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
    let hanjaText :: Text
hanjaText = [Text] -> Text
Data.Text.concat
            [[Char] -> Text
pack [Char]
hanjaEntities, Text
hanjaCharsText, [Char] -> Text
pack [Char]
hanjaChars]
    let text' :: Text
text' = [Text] -> Text
Data.Text.concat [[Char] -> Text
pack [Char]
entities, Text
charsText, [Char] -> Text
pack [Char]
chars]
    forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Text -> Bool
Data.Text.null forall a b. (a -> b) -> a -> b
$ Text
hanjaText Text -> HanjaWordPhoneticizer
`append` Text
text') (forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail [Char]
"parsed nothing")
    forall (m :: * -> *) a. Monad m => a -> m a
return [(Bool
True, Text
hanjaText), (Bool
False, Text
text')]
  where
    isHanjaOrDigit :: Char -> Bool
    isHanjaOrDigit :: Char -> Bool
isHanjaOrDigit Char
c =
        Char -> Bool
isDigit Char
c Bool -> Bool -> Bool
|| Char -> Bool
isHanja Char
c
    isHanja :: Char -> Bool
    isHanja :: Char -> Bool
isHanja Char
c =
        -- Ideographic Description Character
        Char
'\x2f00'  forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2fff' Bool -> Bool -> Bool
||
        -- U+3007 IDEOGRAPHIC NUMBER ZERO (〇)
        Char
'\x3007'  forall a. Eq a => a -> a -> Bool
== Char
c Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension A
        Char
'\x3400'  forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x4dbf' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs
        Char
'\x4e00'  forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x9fcc' Bool -> Bool -> Bool
||
        -- CJK Compatibility Ideographs
        Char
'\xf900'  forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\xfaff' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension B
        Char
'\x20000' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2a6d6' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension C
        Char
'\x2a700' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2b734' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension D
        Char
'\x2b740' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2b81d' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension E
        Char
'\x2b820' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2cea1' Bool -> Bool -> Bool
||
        -- CJK Unified Ideographs Extension F
        Char
'\x2ceb0' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2ebe0' Bool -> Bool -> Bool
||
        -- CJK Compatibility Ideographs Supplement
        Char
'\x2f800' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2fa1f'
    unnamedCharRef :: Parser Char
    unnamedCharRef :: Parser Char
unnamedCharRef = do
        Char
_ <- Char -> Parser Char
char Char
'&'
        Char
_ <- Char -> Parser Char
char Char
'#'
        Bool
hex <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
option Bool
False ((Char -> Parser Char
char Char
'x' forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Char
char Char
'X') forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return Bool
True)
        Int
codepoint <- if Bool
hex then forall a. (Integral a, Bits a) => Parser a
hexadecimal else forall a. Integral a => Parser a
decimal
        Char
_ <- Char -> Parser Char
char Char
';'
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Int -> Char
chr Int
codepoint

-- | The Initial Sound Law (頭音法則) table according to South Korean
-- /Hangul Orthography/ (한글 맞춤법) Clause 5, Section 52, Chapter 6
-- (第6章52項5節).  Keys are an original Sino-Korean sound and values
-- are a converted sound according to the law.
initialSoundLawTable :: Map Char Char
initialSoundLawTable :: Map Char Char
initialSoundLawTable =
    [ (Char
'녀', Char
'여')
    , (Char
'뇨', Char
'요')
    , (Char
'뉴', Char
'유')
    , (Char
'니', Char
'이')
    , (Char
'랴', Char
'야')
    , (Char
'려', Char
'여')
    , (Char
'례', Char
'예')
    , (Char
'료', Char
'요')
    , (Char
'류', Char
'유')
    , (Char
'리', Char
'이')
    , (Char
'라', Char
'나')
    , (Char
'래', Char
'내')
    , (Char
'로', Char
'노')
    , (Char
'뢰', Char
'뇌')
    , (Char
'루', Char
'누')
    , (Char
'르', Char
'느')
    ]

-- | Contains the same contents to 'initialSoundLawTable' except that
-- keys and values are crossed: keys are a converted sound and values are
-- possible original sounds.
initialSoundLawTable' :: Map Char (Set Char)
initialSoundLawTable' :: Map Char (Set Char)
initialSoundLawTable' =
    forall k a b. (k -> a -> b -> b) -> b -> Map k a -> b
foldrWithKey Char -> Char -> Map Char (Set Char) -> Map Char (Set Char)
f forall k a. Map k a
Data.Map.Strict.empty Map Char Char
initialSoundLawTable
  where
    f :: Char -> Char -> Map Char (Set Char) -> Map Char (Set Char)
    f :: Char -> Char -> Map Char (Set Char) -> Map Char (Set Char)
f Char
original Char
converted =
        forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
insertWith forall a. Ord a => Set a -> Set a -> Set a
Data.Set.union Char
converted (forall a. a -> Set a
Data.Set.singleton Char
original)