{-# LANGUAGE OverloadedStrings #-}
module Text.Seonbi.Html.Lang
    ( LangHtmlEntity (..)
    , LanguageTag
    , annotateWithLang
    , extractLang
    , isKorean
    , isNeverKorean
    ) where

import Control.Applicative
import Data.Char (isSpace)
import Data.Maybe

import Data.Attoparsec.Text
import Data.Text

import Text.Seonbi.Html.Entity
import Text.Seonbi.Html.Tag (HtmlTag)

-- | Represents a language tag.  Although it is defined as an alias for 'Text',
-- it can be structured in the future.  Do not use its contents directly.
type LanguageTag = Text

-- | Extracts the language tag from the given raw HTML attributes if it has
-- @lang@ attribute.
--
-- >>> extractLang ""
-- Nothing
-- >>> extractLang "lang=en"
-- Just "en"
-- >>> extractLang "lang=\"ko-KR\""
-- Just "ko-kr"
-- >>> extractLang " lang='ko-Hang'"
-- Just "ko-hang"
extractLang
    :: HtmlRawAttrs
    -- ^ A raw HTML attributes to extract the language tag from.
    -> Maybe LanguageTag
    -- ^ A language tag extracted from the given raw HTML attributes.
    -- If the given raw HTML attributes does not have @lang@ attribute or
    -- its value is invalid, 'Nothing' is returned.
extractLang :: Text -> Maybe Text
extractLang Text
attrs =
    case forall a. Parser a -> Text -> Either String a
parseOnly Parser (Maybe Text)
parser' Text
attrs of
        Right (Just Text
lang') ->
            let lt :: Text
lt = Text -> Text
toLower forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
strip forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
normalizeEntities forall a b. (a -> b) -> a -> b
$ Text
lang'
            in if Text -> Bool
Data.Text.null Text
lt then forall a. Maybe a
Nothing else forall a. a -> Maybe a
Just Text
lt
        Either String (Maybe Text)
_ -> forall a. Maybe a
Nothing
  where
    parser' :: Parser (Maybe Text)
    parser' :: Parser (Maybe Text)
parser' = do
        Parser ()
skipSpace
        [Maybe Text]
attrs' <- Parser (Maybe Text)
langAttr forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`sepBy` Parser Char
space
        Parser ()
skipSpace
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. [a] -> Maybe a
listToMaybe forall a b. (a -> b) -> a -> b
$ forall a. [Maybe a] -> [a]
catMaybes [Maybe Text]
attrs'
    langAttr :: Parser (Maybe Text)
    langAttr :: Parser (Maybe Text)
langAttr = do
        (Bool
isLang, Bool
cont) <- Parser (Bool, Bool)
attrIsLang
        Text
value <- if Bool
cont then Parser Text
attrValue else forall (m :: * -> *) a. Monad m => a -> m a
return Text
""
        forall (m :: * -> *) a. Monad m => a -> m a
return (if Bool
isLang then forall a. a -> Maybe a
Just Text
value else forall a. Maybe a
Nothing)
    attrIsLang :: Parser (Bool, Bool)
    attrIsLang :: Parser (Bool, Bool)
attrIsLang = forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
        [ Text -> Parser Text
asciiCI Text
"lang=" forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall (m :: * -> *) a. Monad m => a -> m a
return (Bool
True, Bool
True)
        , do { Text
_ <- (Char -> Bool) -> Parser Text
takeWhile1 (forall a. Eq a => a -> a -> Bool
/= Char
'=')
             ; Maybe Char
eq <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional (Char -> Parser Char
char Char
'=')
             ; forall (m :: * -> *) a. Monad m => a -> m a
return (Bool
False, forall a. Maybe a -> Bool
isJust Maybe Char
eq)
             }
        ]
    attrValue :: Parser Text
    attrValue :: Parser Text
attrValue = forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
        [ do { (Char -> Bool) -> Parser ()
skip (forall a. Eq a => a -> a -> Bool
== Char
'"'); Text
v <- (Char -> Bool) -> Parser Text
takeTill (forall a. Eq a => a -> a -> Bool
== Char
'"'); (Char -> Bool) -> Parser ()
skip (forall a. Eq a => a -> a -> Bool
== Char
'"'); forall (m :: * -> *) a. Monad m => a -> m a
return Text
v }
        , do { (Char -> Bool) -> Parser ()
skip (forall a. Eq a => a -> a -> Bool
== Char
'\'')
             ; Text
v <- (Char -> Bool) -> Parser Text
takeTill (forall a. Eq a => a -> a -> Bool
== Char
'\'')
             ; (Char -> Bool) -> Parser ()
skip (forall a. Eq a => a -> a -> Bool
== Char
'\''); forall (m :: * -> *) a. Monad m => a -> m a
return Text
v
             }
        , (Char -> Bool) -> Parser Text
takeWhile1 (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace)
        ]
    normalizeEntities :: Text -> Text
    normalizeEntities :: Text -> Text
normalizeEntities
        = Text -> Text -> Text -> Text
Data.Text.replace Text
"&hyphen;" Text
"-"
        forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text -> Text -> Text
Data.Text.replace Text
"&dash;" Text
"-"
        forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text -> Text -> Text
Data.Text.replace Text
"&#8208;" Text
"-"
        forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text -> Text -> Text
Data.Text.replace Text
"&#x2010;" Text
"-"
        forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text -> Text -> Text
Data.Text.replace Text
"&#X2010;" Text
"-"

-- | Annotates 'HtmlEntity' with the 'lang' tag extracted from it or its
-- ancestors.
data LangHtmlEntity = LangHtmlEntity
    { -- | The @lang@ tag extracted from the HTML 'entity' or its ancestors.
      LangHtmlEntity -> Maybe Text
lang :: Maybe LanguageTag
      -- | The annotated HTML 'entity'.
    , LangHtmlEntity -> HtmlEntity
entity :: HtmlEntity
    } deriving (Int -> LangHtmlEntity -> ShowS
[LangHtmlEntity] -> ShowS
LangHtmlEntity -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [LangHtmlEntity] -> ShowS
$cshowList :: [LangHtmlEntity] -> ShowS
show :: LangHtmlEntity -> String
$cshow :: LangHtmlEntity -> String
showsPrec :: Int -> LangHtmlEntity -> ShowS
$cshowsPrec :: Int -> LangHtmlEntity -> ShowS
Show, LangHtmlEntity -> LangHtmlEntity -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: LangHtmlEntity -> LangHtmlEntity -> Bool
$c/= :: LangHtmlEntity -> LangHtmlEntity -> Bool
== :: LangHtmlEntity -> LangHtmlEntity -> Bool
$c== :: LangHtmlEntity -> LangHtmlEntity -> Bool
Eq)

-- | Annotates the given HTML entities with the language tag extracted from
-- their @lang@ attributes.  If a parent entity has @lang@ attribute, its
-- all descendants are annotated with the same language tag.
annotateWithLang :: [HtmlEntity] -> [LangHtmlEntity]
annotateWithLang :: [HtmlEntity] -> [LangHtmlEntity]
annotateWithLang =
    [(HtmlTag, Maybe Text)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate []
  where
    annotate :: [(HtmlTag, Maybe LanguageTag)]
             -> [HtmlEntity]
             -> [LangHtmlEntity]
    annotate :: [(HtmlTag, Maybe Text)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe Text)]
_ [] = []
    annotate [(HtmlTag, Maybe Text)]
stack (x :: HtmlEntity
x@HtmlStartTag { tag :: HtmlEntity -> HtmlTag
tag = HtmlTag
tag', rawAttributes :: HtmlEntity -> Text
rawAttributes = Text
attrs } : [HtmlEntity]
xs) =
        Maybe Text -> HtmlEntity -> LangHtmlEntity
LangHtmlEntity Maybe Text
thisLang HtmlEntity
x forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe Text)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe Text)]
nextStack [HtmlEntity]
xs
      where
        parentLang :: Maybe LanguageTag
        parentLang :: Maybe Text
parentLang = case [(HtmlTag, Maybe Text)]
stack of
            (HtmlTag
_, Maybe Text
l):[(HtmlTag, Maybe Text)]
_ -> Maybe Text
l
            [(HtmlTag, Maybe Text)]
_ -> forall a. Maybe a
Nothing
        thisLang :: Maybe LanguageTag
        thisLang :: Maybe Text
thisLang = Text -> Maybe Text
extractLang Text
attrs forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Maybe Text
parentLang
        nextStack :: [(HtmlTag, Maybe LanguageTag)]
        nextStack :: [(HtmlTag, Maybe Text)]
nextStack = (HtmlTag
tag', Maybe Text
thisLang) forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe Text)]
stack
    annotate [(HtmlTag, Maybe Text)]
stack (x :: HtmlEntity
x@HtmlEndTag { tag :: HtmlEntity -> HtmlTag
tag = HtmlTag
tag' } : [HtmlEntity]
xs) =
        Maybe Text -> HtmlEntity -> LangHtmlEntity
LangHtmlEntity Maybe Text
thisLang HtmlEntity
x forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe Text)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe Text)]
nextStack [HtmlEntity]
xs
      where
        ([(HtmlTag, Maybe Text)]
nextStack, Maybe Text
thisLang) = case [(HtmlTag, Maybe Text)]
stack of
            [] -> ([], forall a. Maybe a
Nothing)
            s :: [(HtmlTag, Maybe Text)]
s@((HtmlTag
t, Maybe Text
lang'):[(HtmlTag, Maybe Text)]
ys) ->
                (if HtmlTag
t forall a. Eq a => a -> a -> Bool
== HtmlTag
tag' then [(HtmlTag, Maybe Text)]
ys else [(HtmlTag, Maybe Text)]
s, Maybe Text
lang')
    annotate [(HtmlTag, Maybe Text)]
stack (HtmlEntity
x : [HtmlEntity]
xs) =
        Maybe Text -> HtmlEntity -> LangHtmlEntity
LangHtmlEntity Maybe Text
parentLang HtmlEntity
x forall a. a -> [a] -> [a]
: [(HtmlTag, Maybe Text)] -> [HtmlEntity] -> [LangHtmlEntity]
annotate [(HtmlTag, Maybe Text)]
stack [HtmlEntity]
xs
      where
        parentLang :: Maybe LanguageTag
        parentLang :: Maybe Text
parentLang = case [(HtmlTag, Maybe Text)]
stack of
            (HtmlTag
_, Maybe Text
l):[(HtmlTag, Maybe Text)]
_ -> Maybe Text
l
            [(HtmlTag, Maybe Text)]
_ -> forall a. Maybe a
Nothing

-- | Determines whether the given language tag refers to any kind of Korean.
--
-- >>> isKorean "ko"
-- True
-- >>> isKorean "ko-KR"
-- True
-- >>> isKorean "kor-Hang"
-- True
-- >>> isKorean "en"
-- False
-- >>> isKorean "en-KR"
-- False
isKorean :: LanguageTag -> Bool
isKorean :: Text -> Bool
isKorean Text
lang' =
    Text
l forall a. Eq a => a -> a -> Bool
== Text
"ko" Bool -> Bool -> Bool
|| Text
l forall a. Eq a => a -> a -> Bool
== Text
"kor" Bool -> Bool -> Bool
||
    Text
"ko-" Text -> Text -> Bool
`isPrefixOf` Text
l Bool -> Bool -> Bool
||
    Text
"kor-" Text -> Text -> Bool
`isPrefixOf` Text
l
  where
    l :: Text
    l :: Text
l = Text -> Text
toLower Text
lang'

-- | Determines whether the given language tag undoubtedly does not refer
-- to any kind of Korean.
--
-- >>> isNeverKorean $ Just "ko"
-- False
-- >>> isNeverKorean $ Just "ko-KR"
-- False
-- >>> isNeverKorean Nothing
-- False
-- >>> isNeverKorean $ Just "en"
-- True
isNeverKorean :: Maybe LanguageTag -> Bool
isNeverKorean :: Maybe Text -> Bool
isNeverKorean Maybe Text
Nothing = Bool
False
isNeverKorean (Just Text
lang') = Bool -> Bool
not (Text -> Bool
isKorean Text
lang')