├── README.md ├── Setup.hs ├── data └── stopwords.txt ├── html └── fer3.css ├── fer3-catalogue.cabal ├── LICENSE └── src ├── CSV.hs ├── sim-lists.hs ├── SimLists.hs ├── Data └── EdgeLabeledGraph.hs ├── Patch.hs ├── WorkGroups.hs ├── Catalogue.hs └── fer3.hs /README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | after 5 | again 6 | against 7 | all 8 | am 9 | an 10 | and 11 | any 12 | are 13 | aren't 14 | as 15 | at 16 | be 17 | because 18 | been 19 | before 20 | being 21 | below 22 | between 23 | both 24 | but 25 | by 26 | can't 27 | cannot 28 | could 29 | couldn't 30 | did 31 | didn't 32 | do 33 | does 34 | doesn't 35 | doing 36 | don't 37 | down 38 | during 39 | each 40 | few 41 | for 42 | from 43 | further 44 | had 45 | hadn't 46 | has 47 | hasn't 48 | have 49 | haven't 50 | having 51 | he 52 | he'd 53 | he'll 54 | he's 55 | her 56 | here 57 | here's 58 | hers 59 | herself 60 | him 61 | himself 62 | his 63 | how 64 | how's 65 | i 66 | i'd 67 | i'll 68 | i'm 69 | i've 70 | if 71 | in 72 | into 73 | is 74 | isn't 75 | it 76 | it's 77 | its 78 | itself 79 | let's 80 | me 81 | more 82 | most 83 | mustn't 84 | my 85 | myself 86 | no 87 | nor 88 | not 89 | of 90 | off 91 | on 92 | once 93 | only 94 | or 95 | other 96 | ought 97 | our 98 | ours 99 | -------------------------------------------------------------------------------- /html/fer3.css: -------------------------------------------------------------------------------- 1 | table { 2 | border-collapse: collapse; 3 | width: 100%; 4 | } 5 | 6 | table, td, th { 7 | border: 0px solid black; 8 | border-bottom: 1px dotted gray; 9 | border-top: 1px dotted gray; 10 | } 11 | 12 | th { 13 | background-color: gray; 14 | color: white; 15 | } 16 | 17 | td, th { 18 | padding: 2px; 19 | padding-right: 8px; 20 | padding-left: 8px; 21 | text-align: left; 22 | } 23 | 24 | h2 { 25 | color: gray; 26 | } 27 | 28 | .KA-font { 29 | font-weight: bold 30 | } 31 | 32 | .KU-font { 33 | font-weight: bold 34 | } 35 | 36 | .KT-font { 37 | font-weight: normal; 38 | } 39 | 40 | .sim-low-font { 41 | color: inherit; 42 | } 43 | 44 | .sim-med-font { 45 | color: orange; 46 | } 47 | 48 | .sim-high-font { 49 | color: red; 50 | } 51 | 52 | body { 53 | font-family: "Arial"; 54 | } 55 | 56 | a:link, a:visited, a:active { 57 | text-decoration: none; 58 | color: inherit; 59 | } 60 | 61 | a:hover { 62 | background-color: #DDDDDD; 63 | } 64 | -------------------------------------------------------------------------------- /fer3-catalogue.cabal: -------------------------------------------------------------------------------- 1 | -- Initial fer3-knowledge-catalogue.cabal generated by cabal init. For 2 | -- further documentation, see http://haskell.org/cabal/users-guide/ 3 | 4 | name: fer3-catalogue 5 | version: 0.1.0.0 6 | synopsis: FER3 Knowledge Catalogue Processing 7 | -- description: 8 | homepage: http://github.com/jsnajder/fer3-catalogue 9 | license: BSD3 10 | license-file: LICENSE 11 | author: Jan Snajder 12 | maintainer: jan.snajder@fer.hr 13 | -- copyright: 14 | -- category: 15 | build-type: Simple 16 | extra-source-files: README.md 17 | cabal-version: >=1.10 18 | 19 | library 20 | exposed-modules: CSV, Catalogue, SimLists, WorkGroups 21 | -- other-modules: 22 | -- other-extensions: 23 | build-depends: base >=4.6 && <4.7, 24 | parsec ==3.1.*, 25 | filepath ==1.3.*, 26 | directory ==1.2.*, 27 | text ==0.11.*, 28 | containers ==0.5.*, 29 | split ==0.2.*, 30 | list-partition ==0.1.*, 31 | fgl ==5.5.*, 32 | fgl-visualize ==0.1.*, 33 | counts ==0.1.* 34 | hs-source-dirs: src 35 | default-language: Haskell2010 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Jan Snajder 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Jan Snajder nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /src/CSV.hs: -------------------------------------------------------------------------------- 1 | module CSV 2 | ( Field 3 | , Row 4 | , CSV -- todo: rename to "table" 5 | , (!!!) 6 | , getField 7 | , getFieldMaybe 8 | , ParseError 9 | , readCSV 10 | , showCSV 11 | , printCSV 12 | , csvToForest ) where 13 | 14 | import Control.Applicative ((<$>)) 15 | import Data.Char (isSpace) 16 | import Data.List (intercalate) 17 | import Data.Maybe 18 | import Data.Tree 19 | import Text.Parsec 20 | import Text.Parsec.String 21 | 22 | type Field = String 23 | type Row = [Field] 24 | type CSV = [Row] 25 | 26 | ------------------------------------------------------------------------------ 27 | -- Reading a CSV 28 | 29 | readCSV :: String -> Either ParseError [[Field]] 30 | readCSV s = map pruneRow <$> parse parseCSV "" s 31 | 32 | parseCSV :: Parser [[Field]] 33 | parseCSV = parseRow `sepEndBy` eol 34 | 35 | pruneRow :: [Field] -> [Field] 36 | pruneRow = reverse . dropWhile null . reverse . map pruneCell 37 | 38 | pruneCell :: Field -> Field 39 | pruneCell s | all isSpace s = "" 40 | | otherwise = s 41 | 42 | parseRow :: Parser [Field] 43 | parseRow = parseField `sepEndBy` char ',' 44 | 45 | parseField :: Parser Field 46 | parseField = quotedString <|> many (noneOf ",\n\r") 47 | 48 | quotedString :: Parser String 49 | quotedString = 50 | do char '"' 51 | content <- many quotedChar 52 | char '"' > "quote at end of cell" 53 | return content 54 | 55 | quotedChar :: Parser Char 56 | quotedChar = noneOf "\"" <|> try (string "\"\"" >> return '"') 57 | 58 | eol :: Parser String 59 | eol = try (string "\n\r") 60 | <|> try (string "\r\n") 61 | <|> string "\n" 62 | <|> string "\r" 63 | > "eol" 64 | 65 | ------------------------------------------------------------------------------ 66 | -- Showing a CSV 67 | 68 | showCSV :: [[Field]] -> String 69 | showCSV = unlines . map (intercalate "," . map csvQuote) 70 | 71 | printCSV :: CSV -> IO () 72 | printCSV = putStr . showCSV 73 | 74 | csvQuote :: String -> String 75 | csvQuote s | needsQuotes = "\"" ++ escape s ++ "\"" 76 | | otherwise = s 77 | where needsQuotes = any (==',') s 78 | escape [] = [] 79 | escape ('"':xs) = "\"\"" ++ escape xs 80 | escape (x:xs) = x : escape xs 81 | 82 | ------------------------------------------------------------------------------ 83 | -- Reading into a CSV 84 | 85 | toForest :: [(Int,a)] -> Forest a 86 | toForest [] = [] 87 | toForest ((lx,x):xs) = let (ys,zs) = break ((==lx) . fst) xs 88 | in Node x (toForest ys) : toForest zs 89 | 90 | csvToForest :: [[Field]] -> Forest [Field] 91 | csvToForest = toForest . map (\r -> (indentLevel r,r)) 92 | 93 | indentLevel :: [Field] -> Int 94 | indentLevel = length . takeWhile null 95 | 96 | ------------------------------------------------------------------------------ 97 | -- Accessing CSV fields 98 | 99 | (!!!) :: [[a]] -> Int -> Maybe [a] 100 | [] !!! _ = Nothing 101 | ([]:_) !!! 0 = Nothing 102 | (x:_) !!! 0 = Just x 103 | (_:xs) !!! n = xs !!! (n - 1) 104 | 105 | getField :: CSV -> Int -> Int -> Maybe Field 106 | getField ys y x = (ys !!! y) >>= (!!! x) 107 | 108 | getFieldMaybe :: CSV -> Int -> Int -> Field -> Field 109 | getFieldMaybe ys y x f = fromMaybe f $ getField ys y x 110 | 111 | -------------------------------------------------------------------------------- /src/sim-lists.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TupleSections, TypeSynonymInstances, FlexibleInstances, MultiParamTypeClasses #-} 2 | 3 | module SimLists where 4 | 5 | import Catalogue 6 | import Control.Applicative ((<$>), liftA2) 7 | import Control.Monad 8 | import CSV (readCSV, Field) 9 | import Data.Char 10 | import qualified Data.EdgeLabeledGraph as G 11 | import Data.Function 12 | import Data.List 13 | import Data.List.Partition (eqClassesGen) 14 | import Data.List.Split (splitWhen) 15 | import Data.Maybe 16 | import qualified Data.Map as M 17 | import qualified Data.Text as T 18 | import System.Directory 19 | import System.Environment 20 | import System.FilePath 21 | import Text.Parsec hiding (label,labels) 22 | import Text.Parsec.String 23 | import Text.Printf 24 | 25 | data SimLabel = X | R | O | E deriving (Eq,Show,Read,Ord) 26 | type ItemKey = T.Text 27 | type SimGraph = G.Graph ItemKey ItemId SimLabel 28 | type SimGraphPaired = G.Graph ItemKey ItemId (SimLabel, SimLabel) 29 | 30 | instance G.Vertex ItemId T.Text where 31 | index = T.pack . showItemId 32 | 33 | dir = "/home/jan/fer3/fer3-catalogue/data/catalogue/v0.1/sim-lists" 34 | 35 | diagreements = do 36 | fs <- csvFiles dir 37 | gs <- loadSimLists fs 38 | return $ disagreementGraph gs 39 | 40 | overlaps = do 41 | fs <- csvFiles dir 42 | gs <- loadSimLists fs 43 | return . overlapGraph . pairGraph $ G.unions gs 44 | 45 | overlapUnits g = 46 | eqClassesGen (\v -> outEdges' v g) (G.vertices g) 47 | 48 | csvFiles d = 49 | map (d >) . filter (".sim.csv" `isSuffixOf`) <$> getDirectoryContents d 50 | 51 | loadSimLists :: [FilePath] -> IO [SimGraph] 52 | loadSimLists fs = do 53 | forM fs $ \f -> do 54 | putStrLn f 55 | Right g <- readSimGraphCSV <$> readFile f 56 | g `seq` return g 57 | 58 | disagreementGraph :: [SimGraph] -> SimGraphPaired 59 | disagreementGraph = G.filterEdges dis . pairGraph . G.unions 60 | where dis _ _ (l1,l2) = l1 /= l2 && 61 | (not ((l1 == E && l2 == O) || (l1 == O && l2 == E))) 62 | {- 63 | partitionByCat :: SimGraphPaired -> [(String,SimGraphPaired)] 64 | partitionByCat g = 65 | [ (edgeOutCat e, G.fromEdgeList es) | 66 | es@(e:_) <- groupBy ((==) `on` edgeOutCat) $ G.toEdgeList g ] 67 | where edgeOutCat (v1,_,_) = catCode v1 68 | -} 69 | 70 | pairGraph :: 71 | (G.Vertex v k, Eq v, Ord k, Eq l) => G.Graph k v l -> G.Graph k v (l,l) 72 | pairGraph g = G.fromEdgeList $ do 73 | v1 <- G.vertices g 74 | (l1,v2) <- G.outEdges v1 g 75 | (l2,v1') <- G.outEdges v2 g 76 | guard $ v1==v1' 77 | return (v1,(l1,l2),v2) 78 | 79 | readSimGraphCSV :: String -> Either ParseError SimGraph 80 | readSimGraphCSV s = G.fromAdjacencyList <$> readSimListCSV s 81 | 82 | readSimListCSV 83 | :: String -> Either ParseError [(ItemId, [(SimLabel, ItemId)])] 84 | readSimListCSV s = 85 | mapMaybe readItemList . filter (not . null) . splitWhen null <$> readCSV s 86 | 87 | readItemList :: [[Field]] -> Maybe (ItemId, [(SimLabel,ItemId)]) 88 | readItemList ((itemId:_):items) 89 | | null items' = Nothing 90 | | otherwise = (,items') <$> readItemId' itemId 91 | where readItem (label:_:itemId:_) = 92 | liftA2 (,) (readSimLabel label) (readItemId' itemId) 93 | items' = mapMaybe readItem items 94 | 95 | readItemId' = readItemId . init . tail 96 | 97 | readSimLabel :: String -> Maybe SimLabel 98 | readSimLabel = 99 | disambigLabel . map (read . (:[])) . filter (`elem` "XROE") . map toUpper 100 | 101 | disambigLabel :: [SimLabel] -> Maybe SimLabel 102 | disambigLabel [] = Nothing 103 | disambigLabel [l] = Just l 104 | disambigLabel ls | E `elem` ls = Just E 105 | | O `elem` ls = Just O 106 | | R `elem` ls = Just R 107 | | otherwise = Just X 108 | 109 | ------------------------------------------------------------------------------ 110 | 111 | type OverlapGraph = G.Graph ItemKey ItemId () 112 | 113 | overlapGraph :: SimGraphPaired -> OverlapGraph 114 | overlapGraph = G.fromEdgeList . mapMaybe f . G.toEdgeList 115 | where f (v1,(l1,l2),v2) 116 | | l1 `elem` [O,E] || l2 `elem` [O,E] = Just (v1,(),v2) 117 | | otherwise = Nothing 118 | --TODO: enforce hierachical constraints! 119 | 120 | -------------------------------------------------------------------------------- /src/SimLists.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TupleSections, TypeSynonymInstances, FlexibleInstances, MultiParamTypeClasses #-} 2 | 3 | module SimLists where 4 | 5 | import Catalogue 6 | import Control.Applicative ((<$>), liftA2) 7 | import Control.Monad 8 | import CSV (readCSV, showCSV, Field, CSV) 9 | import Data.Char 10 | import qualified Data.EdgeLabeledGraph as G 11 | import Data.Function 12 | import Data.List 13 | import Data.List.Partition (eqClassesGenOut) 14 | import Data.List.Split (splitWhen) 15 | import Data.Maybe 16 | import qualified Data.Map as M 17 | import qualified Data.Text as T 18 | import System.Directory 19 | import System.Environment 20 | import System.FilePath 21 | import Text.Parsec hiding (label,labels) 22 | import Text.Parsec.String 23 | import Text.Printf 24 | import Debug.Trace 25 | 26 | data SimLabel = X | R | O | E deriving (Eq,Show,Read,Ord) 27 | type ItemKey = String 28 | type SimGraph = G.Graph ItemKey ItemId SimLabel 29 | type SimGraphPaired = G.Graph ItemKey ItemId (SimLabel, SimLabel) 30 | 31 | instance G.Vertex ItemId ItemKey where 32 | index = showItemId 33 | 34 | dir = "/home/jan/fer3/fer3-catalogue/data/catalogue/v0.2/" 35 | simDir = dir > "sim-lists-corr" 36 | catFile = dir > "catalogue/FER3-v0.2.csv" 37 | outDir = dir > "sim-lists-disagree/csv" 38 | 39 | csvFiles d = 40 | map (d >) . filter (".csv" `isSuffixOf`) <$> getDirectoryContents d 41 | 42 | loadSimLists :: [FilePath] -> IO SimGraph 43 | loadSimLists fs = G.unions <$> do 44 | forM fs $ \f -> do 45 | putStrLn f 46 | Right g <- readSimGraphCSV <$> readFile f 47 | g `seq` return g 48 | 49 | generateDisagreementLists :: IO () 50 | generateDisagreementLists = do 51 | Right c <- loadCatalogue catFile 52 | fs <- csvFiles simDir 53 | g <- filterSpurious <$> loadSimLists fs 54 | let d = disagreementGraph g 55 | forM_ (splitSimGraph d) $ \(cat,d) -> do 56 | let cvs = csvSimList d c 57 | fn = printf "FER3-%s.sim.disagree.csv" cat 58 | writeFile (outDir > fn) $ showCSV cvs 59 | 60 | -- filters out some spurious labelings 61 | filterSpurious :: SimGraph -> SimGraph 62 | filterSpurious = G.filterEdges (\v1 v2 _ -> not $ spuriousMatch v1 v2) 63 | 64 | spuriousMatch x1 x2 = 65 | catCode x1 `elem` ["CS","CE","SE"] && 66 | catCode x2 `elem` ["CS","CE","SE"] && 67 | catCode x1 /= catCode x2 && 68 | areaCode x1 == areaCode x2 && 69 | unitId x1 /= unitId x2 70 | 71 | -- add CS-CE-SE edges 72 | addCompEdges :: Catalogue -> SimGraph -> SimGraph 73 | addCompEdges c g = g `G.union` G.fromEdgeList es 74 | where kus = filter ((`elem` ["CS","CE","SE"]) . catCode) . 75 | map itemId $ knowledgeUnits c 76 | es = [(x,E,y) | x <- kus, y <- filter (matches x) kus ] 77 | matches x y = catCode y `elem` ["CS","CE","SE"] && catCode x /= catCode y && 78 | areaCode x == areaCode y && unitId x == unitId y && 79 | isNothing (topicId y) 80 | 81 | disagreementGraph :: SimGraph -> SimGraphPaired 82 | disagreementGraph = G.filterEdges dis . pairGraph2 83 | where dis _ _ (l1,l2) = l1 /= l2 && 84 | (not ((l1 == E && l2 == O) || (l1 == O && l2 == E))) 85 | -- (l1 == X && l2 == R) || (l1 == R && l2 == X))) 86 | 87 | csvSimList :: SimGraphPaired -> Catalogue -> CSV 88 | csvSimList sg c = concatMap (csvItemList sg c) . G.vertices $ catAreas c 89 | 90 | csvItemList :: SimGraphPaired -> Catalogue -> Item -> CSV 91 | csvItemList sg c x 92 | | null xs = [] 93 | | otherwise = 94 | [showItemId $ itemId x, itemLabel x, [], [], [], showEditors x] : 95 | map csvLabeledItem xs ++ [[]] 96 | where xs = simItems sg c x 97 | csvLabeledItem (l1,l2,x) = 98 | [show l1, show l2, showItemId $ itemId x, itemLabel x, showEditors x] 99 | showEditors = intercalate ", " . itemEditors 100 | 101 | splitSimGraph :: SimGraphPaired -> [(String, SimGraphPaired)] 102 | splitSimGraph = 103 | map (\es@(e:_) -> (outCatCode e, G.fromEdgeList es)) . 104 | groupBy ((==) `on` outCatCode) . 105 | sortBy (compare `on` outCatCode) . 106 | G.toEdgeList 107 | where outCatCode (x,_,_) = catCode x 108 | 109 | simItems :: 110 | SimGraphPaired -> Catalogue -> Item -> [(SimLabel, SimLabel, Item)] 111 | simItems sg cat x = 112 | mapMaybe (\((l1,l2),id) -> (l1,l2,) <$> getItem cat id) $ 113 | G.outEdges (itemId x) sg 114 | 115 | pairGraph :: 116 | (G.Vertex v k, Eq v, Ord k, Eq l) => G.Graph k v l -> G.Graph k v (l,l) 117 | pairGraph g = G.fromEdgeList $ do 118 | v1 <- G.vertices g 119 | (l1,v2) <- G.outEdges v1 g 120 | (l2,v1') <- G.outEdges v2 g 121 | guard $ v1==v1' 122 | return (v1,(l1,l2),v2) 123 | 124 | -- if you don't find a complementary edge, insert one with the same label 125 | pairGraph2 :: 126 | (G.Vertex v k, Eq v, Ord k, Eq l) => G.Graph k v l -> G.Graph k v (l,l) 127 | pairGraph2 g = G.fromEdgeList $ do 128 | v1 <- G.vertices g 129 | (l1,v2) <- G.outEdges v1 g 130 | let Just (l2,v1') = find (\(_,v1') -> v1'==v1) (G.outEdges v1 g ++ [(l1,v1)]) 131 | return (v1,(l1,l2),v2) 132 | 133 | readSimGraphCSV :: String -> Either ParseError SimGraph 134 | readSimGraphCSV s = G.fromAdjacencyList <$> readSimListCSV s 135 | 136 | readSimListCSV 137 | :: String -> Either ParseError [(ItemId, [(SimLabel, ItemId)])] 138 | readSimListCSV s = 139 | mapMaybe readItemList . filter (not . null) . splitWhen null <$> readCSV s 140 | 141 | readItemList :: [[Field]] -> Maybe (ItemId, [(SimLabel,ItemId)]) 142 | readItemList ((itemId:_):items) 143 | | null items' = Nothing 144 | | otherwise = (,items') <$> readItemId' itemId 145 | where readItem (label:_:itemId:_) = 146 | liftA2 (,) (readSimLabel label) (readItemId' itemId) 147 | items' = mapMaybe readItem items 148 | 149 | readItemId' s | head s == '[' = readItemId . init $ tail s 150 | | otherwise = readItemId s 151 | 152 | 153 | readSimLabel :: String -> Maybe SimLabel 154 | readSimLabel = 155 | disambigLabel . map (read . (:[])) . filter (`elem` "XROE") . map toUpper 156 | 157 | disambigLabel :: [SimLabel] -> Maybe SimLabel 158 | disambigLabel [] = Nothing 159 | disambigLabel [l] = Just l 160 | disambigLabel ls | E `elem` ls = Just E 161 | | O `elem` ls = Just O 162 | | R `elem` ls = Just R 163 | | otherwise = Just X 164 | 165 | -------------------------------------------------------------------------------- /src/Data/EdgeLabeledGraph.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses, TypeSynonymInstances, FlexibleInstances #-} 2 | 3 | module Data.EdgeLabeledGraph 4 | ( Vertex (..) 5 | , Graph 6 | , empty 7 | , addVertex 8 | , removeVertex 9 | , removeVertex' 10 | , removeVertices 11 | , removeVertices' 12 | , addEdge 13 | , removeEdge 14 | , removeEdge' 15 | , outEdges 16 | , outEdges' 17 | , inEdges' 18 | , vertex 19 | , vertices 20 | , indices 21 | , findIndex 22 | , updateKeys 23 | , toAdjacencyList' 24 | , toEdgeList 25 | , toEdgeList' 26 | , fromEdgeList 27 | , fromAdjacencyList 28 | , filterEdges 29 | , modifyEdges 30 | , union 31 | , unions 32 | , fromTree 33 | , toTree 34 | , combineEdges 35 | , findVertex 36 | , modifyVertex ) where 37 | 38 | import Data.List hiding (union) 39 | import qualified Data.Map as M 40 | import Data.Maybe 41 | import Data.Tree 42 | 43 | -- edge-labeled graph 44 | 45 | data Graph k v l = Graph 46 | { vertexMap :: M.Map k v 47 | , adjMap :: M.Map k [(l,k)] } deriving (Eq,Ord,Show,Read) 48 | 49 | empty :: Graph k v l 50 | empty = Graph M.empty M.empty 51 | 52 | addVertex :: (Vertex v k, Ord k) => v -> Graph k v l -> Graph k v l 53 | addVertex v g = g { vertexMap = M.insert (index v) v (vertexMap g) } 54 | 55 | removeVertex :: (Vertex v k, Ord k) => v -> Graph k v l -> Graph k v l 56 | removeVertex = removeVertex' . index 57 | 58 | removeVertex' :: (Vertex v k, Ord k) => k -> Graph k v l -> Graph k v l 59 | removeVertex' k g = g 60 | { vertexMap = M.delete k $ vertexMap g 61 | , adjMap = M.map (filter ((/=k) . snd)) . M.delete k $ adjMap g } 62 | 63 | removeVertices' :: (Vertex v k, Ord k) => [k] -> Graph k v l -> Graph k v l 64 | removeVertices' ks g = foldl' (flip removeVertex') g ks 65 | 66 | removeVertices :: (Vertex v k, Ord k) => [v] -> Graph k v l -> Graph k v l 67 | removeVertices vs g = foldl' (flip removeVertex) g vs 68 | 69 | addEdge :: (Ord k, Eq l, Vertex v k) => 70 | v -> v -> l -> Graph k v l -> Graph k v l 71 | addEdge v1 v2 l g = g 72 | { vertexMap = M.insert k2 v2 . M.insert k1 v1 $ vertexMap g 73 | , adjMap = M.insertWith (\x y -> nub $ y ++ x) k1 [(l,k2)] (adjMap g) } 74 | where k1 = index v1 75 | k2 = index v2 76 | 77 | removeEdge :: (Ord k, Eq l, Vertex v k) => v -> v -> l -> Graph k v l -> Graph k v l 78 | removeEdge v1 v2 l = removeEdge' (index v1) (index v2) l 79 | 80 | removeEdge' :: (Ord k, Eq l, Vertex v k) => k -> k -> l -> Graph k v l -> Graph k v l 81 | removeEdge' k1 k2 l g = g 82 | { adjMap = M.adjust (delete (l,k2)) k1 (adjMap g) } 83 | 84 | -- NB: this will also remove singleton vertices 85 | -- TODO: fix this 86 | filterEdges :: (Ord k, Eq l, Vertex v k) => 87 | (v -> v -> l -> Bool) -> Graph k v l -> Graph k v l 88 | filterEdges p = fromEdgeList . filter (\(v1,l,v2) -> p v1 v2 l) . toEdgeList 89 | 90 | modifyEdges :: (Ord k, Eq l, Vertex v k) => 91 | ((v,l,v) -> Maybe (v,l,v)) -> Graph k v l -> Graph k v l 92 | modifyEdges f = fromEdgeList . mapMaybe f . toEdgeList 93 | 94 | modifyVertex :: (Vertex v k, Ord k, Eq l) => 95 | v -> (v -> v) -> Graph k v l -> Graph k v l 96 | modifyVertex v f g = g { vertexMap = M.adjust f (index v) (vertexMap g) } 97 | 98 | filterVertices = undefined 99 | 100 | vertex :: Ord k => k -> Graph k v l -> Maybe v 101 | vertex k = M.lookup k . vertexMap 102 | 103 | outEdges' :: (Vertex v k, Ord k) => k -> Graph k v l -> [(l,k)] 104 | outEdges' k g = concat . maybeToList $ M.lookup k (adjMap g) 105 | 106 | outEdges :: (Vertex v k, Ord k) => v -> Graph k v l -> [(l,v)] 107 | outEdges v g = do 108 | (l,k) <- outEdges' (index v) g 109 | let Just v' = vertex k g 110 | return (l,v') 111 | 112 | outDegree' :: (Vertex v k, Ord k) => k -> Graph k v l -> Int 113 | outDegree' k = length . outEdges' k 114 | 115 | inEdges' :: (Vertex v k, Ord k) => k -> Graph k v l -> [(l,k)] 116 | inEdges' k g = [ (l,k1) | (k1,l,k2) <- toEdgeList' g, k2==k ] 117 | 118 | vertices :: Graph k v l -> [v] 119 | vertices = M.elems . vertexMap 120 | 121 | indices :: Graph k v l -> [k] 122 | indices = M.keys . vertexMap 123 | 124 | findVertex :: (v -> Bool) -> Graph k v l -> [v] 125 | findVertex p = filter p . vertices 126 | 127 | toAdjacencyList' :: Ord k => Graph k v l -> [(k,[(l,k)])] 128 | toAdjacencyList' = M.toList . adjMap 129 | 130 | toEdgeList' :: Ord k => Graph k v l -> [(k,l,k)] 131 | toEdgeList' g = do 132 | (k1,xs) <- toAdjacencyList' g 133 | (l,k2) <- xs 134 | return (k1,l,k2) 135 | 136 | toEdgeList :: Ord k => Graph k v l -> [(v,l,v)] 137 | toEdgeList g = do 138 | (k1,l,k2) <- toEdgeList' g 139 | let Just v1 = vertex k1 g 140 | Just v2 = vertex k2 g 141 | return (v1,l,v2) 142 | 143 | fromEdgeList :: (Vertex v k, Ord k, Eq l) => [(v,l,v)] -> Graph k v l 144 | fromEdgeList = foldl' (\g (v1,l,v2) -> addEdge v1 v2 l g) empty 145 | 146 | fromAdjacencyList :: (Vertex v k, Ord k, Eq l) => [(v,[(l,v)])] -> Graph k v l 147 | fromAdjacencyList xs = 148 | fromEdgeList [(v1,l,v2) | (v1,lvs) <- xs, (l,v2) <- lvs ] 149 | 150 | updateKeys :: (Ord k, Eq l) => (k -> k) -> Graph k v l -> Graph k v l 151 | updateKeys f g = Graph 152 | { vertexMap = M.mapKeys f $ vertexMap g 153 | , adjMap = M.mapKeys f (M.map (nub . map (\(l,k2) -> (l,f k2))) $ adjMap g) } 154 | 155 | union :: (Vertex v k, Ord k, Eq l) => Graph k v l -> Graph k v l -> Graph k v l 156 | union g1 g2 = foldl' (\g (v1,l,v2) -> addEdge v1 v2 l g) g1 (toEdgeList g2) 157 | 158 | unions :: (Vertex v k, Ord k, Eq l) => [Graph k v l] -> Graph k v l 159 | unions [] = empty 160 | unions gs = foldl1 union gs 161 | 162 | fromTree :: (Vertex v k, Ord k, Eq l) => (Int -> l) -> Tree v -> Graph k v l 163 | fromTree f = fromEdgeList . map (\(v1,d,v2) -> (v1,f d,v2)) . treeEdges 164 | 165 | treeEdges :: Tree v -> [(v,Int,v)] 166 | treeEdges = edges 0 167 | where edges d (Node l ns) = map (\n -> (l,d,rootLabel n)) ns ++ 168 | concatMap (edges $ d+1) ns 169 | 170 | -- constructs a tree rooted in v, traversing edges for which label evaluates to True 171 | -- If the graph has dicycles, the tree will be infinite 172 | toTree :: (Vertex v k, Ord k) => v -> (l -> Bool) -> Graph k v l -> Tree v 173 | toTree v p g = Node v [toTree v p g | (l,v) <- outEdges v g, p l ] 174 | 175 | combineEdges :: (Vertex v1 k, Vertex v2 k, Eq v2, Ord k, Eq l1, Eq l2) => 176 | Graph k v1 l1 -> Graph k v2 l2 -> Graph k v1 (l1,l2) 177 | combineEdges g1 g2 = fromEdgeList . concatMap f $ toEdgeList g1 178 | where f (v1,l1,v2) = [ (v1,(l1,l2),v2) | 179 | (l2,k2) <- outEdges' (index v1) g2, index v2 == k2 ] 180 | 181 | class Vertex v k where 182 | index :: v -> k 183 | 184 | instance Vertex Int Int where 185 | index = id 186 | 187 | instance Vertex String String where 188 | index = id 189 | 190 | type IntGraph = Graph Int Int Int 191 | 192 | g = fromEdgeList [(1,12,2),(2,12,1),(2,23,3),(2,24,4)] :: IntGraph 193 | 194 | -- add: to/from functional graph transformations 195 | -- zapravo, ne, napravi novu verziju ovoga, tako da bude LIGHTWEIGHT omotač 196 | -- oko FG-a. Trebaš pohranjivati node map (to već postoji, ali možda napravi 197 | -- svoje) i trebaš pohranjivati key to node map 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /src/Patch.hs: -------------------------------------------------------------------------------- 1 | 2 | module Patch where 3 | 4 | import Catalogue 5 | import Control.Applicative 6 | import Control.Monad 7 | import CSV 8 | import qualified Data.EdgeLabeledGraph as G -- TMP! 9 | import Data.List 10 | import Data.Maybe 11 | import qualified Data.Set as S 12 | import System.Directory 13 | import System.Environment 14 | import System.FilePath 15 | import Data.Tree 16 | 17 | import Debug.Trace 18 | 19 | type Patch = (Catalogue, [(ItemId,Op)]) 20 | 21 | data Op = ReplaceBy [ItemId] | Delete | Resolve | Modify | Add 22 | deriving (Eq,Ord,Show,Read) 23 | 24 | readPatch :: String -> Either ParseError Patch 25 | readPatch s = mk . readItemForest . dropWhile isHeader <$> readCSV s 26 | where mk ts = ( emptyCat { catAreas = G.unions $ 27 | map (G.fromTree (const SubItem) . fixTreeTopicIds . fmap fst) ts } 28 | , mapMaybe readOp . filter ((==KU) . itemType . fst) $ concatMap flatten ts ) 29 | isHeader [] = True 30 | isHeader (x:_) = isNothing $ readItemId' x 31 | 32 | readOp :: (Item,[Field]) -> Maybe (ItemId,Op) 33 | readOp (x, zs) = case zs !!! 1 of 34 | Nothing -> Nothing 35 | Just "DEL" -> Just (x', Delete) 36 | Just "ADD" -> Just (x', Add) 37 | Just xs -> Just $ (x', let ys = readItemIds xs 38 | in if ys==[itemId x] then Resolve else ReplaceBy ys) 39 | where x' = itemId x 40 | 41 | loadPatch :: FilePath -> IO (Either ParseError Patch) 42 | loadPatch f = readPatch <$> readFile f 43 | 44 | data Diff = Diff 45 | { removed :: [ItemId] 46 | , added :: [ItemId] 47 | , modified :: [ItemId] 48 | , resolved :: [ItemId] 49 | , replaced :: [(ItemId,ItemId)] } deriving (Eq,Ord,Show) 50 | 51 | patchDiff :: Catalogue -> Patch -> Diff 52 | patchDiff cat (cmp,ops) = Diff 53 | { removed = removed 54 | , added = added 55 | , modified = modified 56 | , resolved = resolved 57 | , replaced = replaced } 58 | where catUnits = knowledgeUnits cat 59 | cmpUnits = knowledgeUnits cmp 60 | catUnits' = map itemId catUnits 61 | cmpUnits' = map itemId cmpUnits 62 | removed = [x | (x,Delete) <- ops, x `elem` catUnits'] 63 | add = [x | (x,Add) <- ops] 64 | replaced = [(x,y) | (x,ReplaceBy ys) <- ops, x `elem` catUnits', y <- ys] 65 | replaceTo = nub $ map snd replaced 66 | resolved = [x | (x,Resolve) <- ops] 67 | -- add a unit if (1) pointed to or (2) not changed, but new 68 | added = (nub $ replaceTo ++ add) \\ catUnits' 69 | modified = filter (not . fromJust . identicalItems cat cmp) $ 70 | ((cmpUnits' `intersect` catUnits') \\ removed) \\ added 71 | 72 | identicalItems :: Catalogue -> Catalogue -> ItemId -> Maybe Bool 73 | identicalItems c1 c2 x = liftA2 (==) (getItemTree c1 x) (getItemTree c2 x) 74 | -- where f = fmap (\x -> x { itemRemark = Nothing, 75 | -- itemEditors = if itemType x == KT then [] 76 | -- else itemEditors x }) 77 | 78 | patch :: Catalogue -> Patch -> (Catalogue, Diff) 79 | patch c p@(cmp,_) = (c6, d) 80 | where d = patchDiff c p 81 | c2 = removeItems' c $ removed d 82 | adds = map fixTreeTopicIds . treesWithItems cmp $ added d ++ modified d 83 | c3 = foldl' addItemTree c2 adds 84 | overlaps = overlapLinks c3 85 | resolved' = resolved d ++ map fst (replaced d) 86 | units = map itemId $ knowledgeUnits cmp 87 | overlaps2 = filter (\(x1,x2,_) -> 88 | (x1 `elem` resolved' && x2 `elem` units) || 89 | (x1 `elem` units && x2 `elem` resolved')) overlaps 90 | c4 = foldl' removeLink c3 overlaps2 91 | c5 = combineEditorsAndOverlaps c4 p 92 | c6 = foldl' (\c (x1,x2) -> fromMaybe c (replaceItem' c x1 x2)) c5 $ replaced d 93 | 94 | overlapLinks :: Catalogue -> [(ItemId,ItemId,Link)] 95 | overlapLinks = filter (\(_,_,l) -> l==Overlaps) . links' 96 | 97 | -- for all ReplaceBy targets, take ReplaceBy source editors 98 | -- for all ReokaceBy targets, take ReplaceBy 99 | -- (remark: inefficient, because diff is computed again) 100 | combineEditorsAndOverlaps :: Catalogue -> Patch -> Catalogue 101 | combineEditorsAndOverlaps c p@(cmp,_) = c3 102 | where Diff rem add mod ret rep = patchDiff c p 103 | rep' = mapMaybe (\(x1,x2) -> 104 | liftA2 (,) (getItem c x1) (getItem cmp x2)) rep 105 | addEditors x1 x2 = x2 { 106 | itemEditors = sort . nub $ itemEditors x1 ++ itemEditors x2 } 107 | links' = substitute $ overlapLinks c 108 | c2 = foldl' (\c (x1,x2) -> modifyItem c (itemId x2) (addEditors x1)) c rep' 109 | c3 = fromJust $ addLinks c2 links' 110 | substitute xs = [(x1',x2',l) | (x1,x2,l) <- xs, 111 | let x1' = fromMaybe x1 $ lookup x1 rep, 112 | let x2' = fromMaybe x2 $ lookup x2 rep, 113 | x1'/=x2', 114 | x1 `elem` map fst rep || x2 `elem` map fst rep ] 115 | 116 | -- patch is bogus if it operates on a (non-added) unit that does not exist in the 117 | -- catalogue or maps to a unit that does not exist in the component nor in the 118 | -- catalogue 119 | bogusPatch :: Catalogue -> Patch -> Bool 120 | bogusPatch c p@(cmp,ops) = 121 | (opItems `intersect` catUnits /= opItems) || 122 | (replaceTo `intersect` (catUnits ++ cmpUnits) /= replaceTo) 123 | where catUnits = map itemId $ knowledgeUnits c 124 | cmpUnits = map itemId $ knowledgeUnits cmp 125 | opItems = map fst ops \\ added 126 | replaceTo = nub $ [y | (_,ReplaceBy ys) <- ops, y <- ys] 127 | added = [x | (x,Add) <- ops] 128 | 129 | patchAndLog :: Catalogue -> FilePath -> IO (Catalogue,CSV) 130 | patchAndLog c f = do 131 | Right p <- loadPatch f 132 | let (c2,d) = patch c p 133 | Diff rem add mod res rep = d 134 | xs = [[patchFile,showItemId x,"Removed"] | x <- rem] ++ 135 | [[patchFile,showItemId x,"Added"] | x <- add] ++ 136 | [[patchFile,showItemId x,"Modified"] | x <- mod] ++ 137 | [[patchFile,showItemId x,"Declared as non-overlapping (within this component)"] | x <- res] ++ 138 | [[patchFile,showItemId x,"Replaced by " ++ showItemId y] | (x,y) <- rep] 139 | return (c2,zipWith (\x xs -> show x : xs) [1..] xs) 140 | where patchFile = takeFileName f 141 | 142 | dir = "/home/jan/fer3/fer3-catalogue/data/catalogue/v0.3" 143 | catFile = dir > "FER3-KC-v0.3.0.csv" 144 | dirInbox = dir > "components-csv-resolved" 145 | dirOk = dirInbox > "ok" 146 | dirBogus = dirInbox > "bogus" 147 | newCatFile = "FER3-KC-v0.4.0.csv" 148 | dirOut = dir > "resolved" 149 | 150 | sortPatches = do 151 | Right c <- loadCatalogue catFile 152 | fs <- filter (".csv" `isSuffixOf`) <$> getDirectoryContents dirInbox 153 | forM_ fs $ \f -> do 154 | putStr $ f ++ ": " 155 | Right p <- loadPatch $ dirInbox > f 156 | if bogusPatch c p then do 157 | putStrLn "BOGUS" 158 | renameFile (dirInbox > f) (dirBogus > f) 159 | else do 160 | putStrLn "OK" 161 | renameFile (dirInbox > f) (dirOk > f) 162 | 163 | applyPatches = do 164 | Right c <- loadCatalogue catFile 165 | fs <- map (dirOk >) . filter (".csv" `isSuffixOf`) <$> getDirectoryContents dirOk 166 | (cNew,log) <- foldM (\(c,csv) f -> do 167 | (c2,csv2) <- patchAndLog c f 168 | putStrLn $ f ++ show (length csv2) 169 | return (c2,csv++csv2)) (c,[]) fs 170 | saveCatalogue (dirOut > newCatFile) (removeTopicEditors $ addInfoRemark cNew) 171 | writeFile (dirOut > "log.csv") (showCSV log) 172 | 173 | -------------------------------------------------------------------------------- /src/WorkGroups.hs: -------------------------------------------------------------------------------- 1 | 2 | module WorkGroups where 3 | 4 | import Catalogue 5 | import Control.Applicative 6 | import Control.Monad hiding (ap) 7 | import CSV 8 | import qualified Data.Counts as C 9 | import Data.Function 10 | import Data.List 11 | import qualified Data.EdgeLabeledGraph as G 12 | import Data.Graph.Inductive 13 | import qualified Data.Map as M 14 | import Data.Maybe 15 | import Data.List.Partition (eqClasses) 16 | import SimLists hiding (catFile, dir) 17 | import qualified SplitGraph as SG 18 | import System.FilePath 19 | import Text.Printf 20 | 21 | type OverlapGraph = Gr ItemId () 22 | 23 | mkOverlapGraph :: SimGraphPaired -> OverlapGraph 24 | mkOverlapGraph g = undir $ mkGraph (zip [0..] nodes) edges 25 | where nodes = filter isUnit . nub . map lift $ G.vertices g 26 | edges = do 27 | (v1,(l1,l2),v2) <- G.toEdgeList g 28 | let (v1',v2') = (lift v1, lift v2) 29 | guard $ isUnit v1' && isUnit v2' 30 | guard $ l1 `elem` [O,E] || l2 `elem` [O,E] 31 | let Just i1 = M.lookup v1' ix 32 | let Just i2 = M.lookup v2' ix 33 | return (i1,i2,()) 34 | ix = M.fromList $ zip nodes [0..] 35 | isUnit x = (isJust . unitId $ x) && (topicId x == Nothing) 36 | lift x = x { topicId = Nothing } 37 | 38 | overlapGraph :: Catalogue -> OverlapGraph 39 | overlapGraph c = 40 | undir $ mkGraph (zip [0..] nodes) 41 | [(ix x1,ix x2,()) | (x1,x2,Overlaps) <- links' c ] 42 | where nodes = map itemId $ knowledgeUnits c 43 | ixMap = M.fromList $ zip nodes [0..] 44 | ix x = fromJust $ M.lookup x ixMap 45 | 46 | g = mkGraph (zip [1..5] "12345") 47 | [(1,2,"12"),(2,3,"23"),(1,3,"13"),(3,4,"34"),(3,5,"35"),(4,5,"45")] :: Gr Char String 48 | 49 | g2 = mkGraph (zip [1..6] "123456") 50 | [(1,2,"12"),(2,3,"23"),(1,3,"13"),(6,4,"34"),(6,5,"35"),(4,5,"45")] :: Gr Char String 51 | 52 | g3 = mkGraph (zip [1..6] "123456") 53 | [(1,2,"12"),(2,3,"23"),(1,3,"13"),(3,4,"34"),(3,5,"35"),(4,5,"45"),(4,6,"46")] :: Gr Char String 54 | 55 | {- 56 | sccGraphs :: Graph gr => gr a b => [gr a b] 57 | sccGraphs g = [ delNodes ys g | xs <- scc g, let ys = ns \\ xs ] 58 | where ns = nodes g 59 | -} 60 | 61 | loadOverlapGraph = do 62 | Right c <- loadCatalogue catFile 63 | fs <- csvFiles simDir 64 | g <- addCompEdges c . filterSpurious <$> loadSimLists fs 65 | let og = mkOverlapGraph . pairGraph2 $ g 66 | return $ og 67 | 68 | addOverlapLinks :: OverlapGraph -> Catalogue -> Catalogue 69 | addOverlapLinks og c = c { catAreas = g' } 70 | where g = catAreas c 71 | es = mapMaybe oEdge $ edges og 72 | g' = g `G.union` G.fromEdgeList es 73 | oEdge (x, y) = case (lab og x >>= getItem c, lab og y >>= getItem c) of 74 | (Just x,Just y) -> Just (x,Overlaps,y) 75 | _ -> Nothing 76 | 77 | type OverlapComponent = (Int, [(ItemId, [Int])]) 78 | 79 | maxComponentSize = 1000 80 | 81 | overlapComponents :: OverlapGraph -> [OverlapComponent] 82 | overlapComponents og = 83 | [ (i, map (mkUnit i) $ nodes oc) | (i,oc) <- ocs ] 84 | where ocs = zip [1..] . concatMap (SG.splitToSize maxComponentSize) . 85 | filter ((>1) . noNodes) $ SG.gComponents og 86 | gi x = map fst $ filter ((x `gelem`) . snd) ocs 87 | mkUnit i x = (fromJust $ lab og x, 88 | filter (/=i) $ gi x) 89 | 90 | analyseOverlaps = do 91 | Right c <- loadCatalogue $ dir > catFile 92 | let og = overlapGraph c 93 | let oc = filter ((>1) . noNodes) $ SG.gComponents og 94 | n = sum $ map noNodes oc 95 | xs = C.fromList $ map noNodes oc 96 | oc' = concatMap (SG.splitToSize maxComponentSize) oc 97 | n' = sum $ map noNodes oc' 98 | xs' = C.fromList $ map noNodes oc' 99 | pl = C.fromList $ map longestShortestPath oc' 100 | a = length . nub $ concatMap nodes oc' \\ concatMap nodes oc 101 | zs = C.fromList . map snd . C.counts . C.fromList $ concatMap nodes oc' 102 | putStr . unlines $ 103 | [ printf "%d non-singleton components" (length oc) 104 | , printf "%d nodes in non-singleton components" n 105 | , printf "components size histogram: %s" (show $ C.counts xs) 106 | , printf "size-%d-splitted components: %d" maxComponentSize (length oc') 107 | , printf "components size histogram: %s" (show $ C.counts xs') 108 | , printf "longest shortest paths histogram: %s" (show $ C.counts pl) 109 | , printf "articulation points split: %d" a 110 | , printf "articulation points multiplicity histogram: %s" (show $ C.counts zs) ] 111 | 112 | longestShortestPath :: Gr a b -> Int 113 | longestShortestPath g = maximum $ map (longest g) (nodes g) 114 | where longest g x = maximum . map (\(LP p) -> length p) . spTree x $ emap (const 1) g 115 | 116 | {- 117 | mkGroups :: Catalogue -> [OverlapComponent] -> [([OverlapComponent], [ItemEditor])] 118 | mkGroups cat oc = 119 | [ (map fst xs, e) | xs@((x,e):_) <- eqClasses ((==) `on` snd) xss ] 120 | where xss = map (\x -> (x, componentEditors cat x)) oc 121 | -} 122 | 123 | mkGroups :: Catalogue -> [OverlapComponent] -> [[OverlapComponent]] 124 | mkGroups cat oc = 125 | [ map fst xs | xs@((x,e):_) <- eqClasses ((==) `on` snd) xss ] 126 | where xss = map (\x -> (x, componentCats cat x)) oc 127 | 128 | componentCats :: Catalogue -> OverlapComponent -> [String] 129 | componentCats c (_,zs) = 130 | nub . sort $ map (\(x,_) -> catCode x) zs 131 | 132 | componentEditors :: Catalogue -> OverlapComponent -> [ItemEditor] 133 | componentEditors cat (_,zs) = 134 | nub . sort $ concatMap (\(x,_) -> let i = getItem cat x 135 | in if isNothing i then error ("cannot find " ++ show x) 136 | else itemEditors . fromJust $ getItem cat x) zs 137 | 138 | {- 139 | csvOverlapComponent :: Catalogue -> OverlapComponent -> CSV 140 | csvOverlapComponent c oc@(i,zs) = 141 | [printf "COMPONENT %d" i] : [printf "EDITORS: %s" editors] : 142 | [printf "This component has %d KUs" $ length zs] : [w] : [] : 143 | concatMap csvUnit zs 144 | where csvUnit (x,o,z) = 145 | let r1 = intercalate ", " $ map showItemId o 146 | r2 = printf "SHARED WITH COMPONENTS: %s / Overlaps with KUs: %s" 147 | (intercalate ", " $ map show z) 148 | in (++[[]]) . addRemark r2 . addRemark r1 . fromJust $ csvKnowledgeUnit c x 149 | editors = intercalate ", " $ componentEditors c oc 150 | hasRepeated = any (\(_,_,xs) -> not $ null xs) zs 151 | w = if hasRepeated then "WARNING: This component shares some KUs with other components (see remarks in column J)" else "" 152 | -} 153 | 154 | csvOverlapComponent :: Catalogue -> OverlapComponent -> CSV 155 | csvOverlapComponent c oc@(i,zs) = 156 | [printf "COMPONENT %d (contains %d KUs)" i (length zs)] : 157 | [printf "SUBCATS (%d): %s" (length as) (intercalate ", " as)] : 158 | [printf "EDITORS (%d): %s" (length editors) (intercalate ", " editors)] : [w] : [] : 159 | ["KA Id","KA Name","KU Id","KU Name","KT Id","KT Name","Comment","Editors","Overlaps"] : [] : 160 | csvCatalogueSubset c2 units 161 | where units = map (\(x,_) -> x) zs 162 | editors = componentEditors c oc 163 | as = componentCats c oc 164 | hasRepeated = any (\(_,xs) -> not $ null xs) zs 165 | w = if hasRepeated then "WARNING: This component shares some KUs with other components (see remarks in column G)" else "" 166 | r z = if null z then "" else 167 | printf "SHARED WITH COMPONENTS: %s" (intercalate ", " $ map show z) 168 | c2 = foldl' (\c (x,z) -> addItemRemark c x (r z)) c zs 169 | 170 | dir = "/home/jan/fer3/fer3-catalogue/data/catalogue/v0.3/" 171 | catFile = "FER3-KC-v0.3.0.csv" 172 | componentsFile = dir > "components/FER3-v0.3-components.txt" 173 | componentsCsvDir = dir > "components-csv" 174 | 175 | main = do 176 | Right c <- loadCatalogue $ dir > catFile 177 | ocs <- loadComponents 178 | forM_ ocs $ \oc@(i,_) -> do 179 | let csv = csvOverlapComponent c oc 180 | writeFile (componentsCsvDir > printf "c%03d.csv" i) (showCSV csv) 181 | 182 | loadComponents :: IO [OverlapComponent] 183 | loadComponents = read <$> readFile componentsFile 184 | 185 | generateComponents = do 186 | Right c <- loadCatalogue $ dir > catFile 187 | let og = overlapGraph c 188 | ocs = overlapComponents og 189 | writeFile (dir > componentsFile) $ show ocs 190 | 191 | generateIndex = do 192 | ocs <- loadComponents 193 | Right c <- loadCatalogue $ dir > catFile 194 | let h = ["Component", "Filename", "Component size", 195 | "Component subcats", "Component editors", "Component units" ] 196 | h1 = "Editor" : h 197 | h2 = "Subcats" : h 198 | h3 = "KU" : h 199 | xs = sort $ [ [e,show j, printf "c%03d.csv" j, 200 | show $ length kus, 201 | intercalate ", " as,intercalate ", " es, 202 | intercalate ", " $ map (showItemId . fst) kus ] | 203 | oc@(j,kus) <- ocs, 204 | let as = componentCats c oc, 205 | let es = componentEditors c oc, 206 | e <- es ] 207 | ys = sort $ [ [a,show j, printf "c%03d.csv" j, 208 | show $ length kus, 209 | intercalate ", " as,intercalate ", " es, 210 | intercalate ", " $ map (showItemId . fst) kus ] | 211 | oc@(j,kus) <- ocs, 212 | let as = componentCats c oc, 213 | let es = componentEditors c oc, 214 | a <- as ] 215 | zs = sort $ [ [ku,show j, printf "c%03d.csv" j, 216 | show $ length kus, 217 | intercalate ", " as,intercalate ", " es, 218 | intercalate ", " $ map (showItemId . fst) kus ] | 219 | oc@(j,kus) <- ocs, 220 | let as = componentCats c oc, 221 | let es = componentEditors c oc, 222 | ku <- map (showItemId . fst) kus ] 223 | writeFile (componentsCsvDir > "index-editor.csv") $ showCSV (h1:xs) 224 | writeFile (componentsCsvDir > "index-subcat.csv") $ showCSV (h2:ys) 225 | writeFile (componentsCsvDir > "index-ku.csv") $ showCSV (h3:zs) 226 | -------------------------------------------------------------------------------- /src/Catalogue.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TupleSections, TypeSynonymInstances, FlexibleInstances, MultiParamTypeClasses #-} 2 | 3 | module Catalogue 4 | ( Catalogue (..) -- <== TODO: don't expose the graph structure!!! 5 | , emptyCat 6 | , Item (..) 7 | , ItemType (..) 8 | , itemType' 9 | , ItemId (..) 10 | , Link (..) -- <== tmp 11 | , ItemEditor 12 | , loadCatalogue 13 | , readCatalogue 14 | , readItem 15 | , readItemId 16 | , readItemId' 17 | , showItemId 18 | , readItemForest 19 | , splitCatalogue 20 | , knowledgeItems 21 | , knowledgeAreas 22 | , knowledgeUnits 23 | , getItem 24 | , getItemTree 25 | , getItemArea 26 | , csvCatalogue 27 | , csvCatalogueSubset 28 | , saveCatalogue 29 | , readItemIds 30 | , fixItemIds 31 | , fixTreeTopicIds 32 | , fixTreeItemIds 33 | , itemLinks 34 | , itemLinks' 35 | , addItemRemark 36 | , modifyItem 37 | , removeItem 38 | , removeItem' 39 | , removeItems 40 | , removeItems' 41 | , findItem 42 | , getSubitems' 43 | , getSubitems 44 | , getAllSubitems 45 | , getAllSubitems' 46 | , getParentItem 47 | , parentId 48 | , addItem 49 | , addItems 50 | , pruneItems 51 | , redirectLinks 52 | , redirectLinks' 53 | , replaceItem 54 | , replaceItem' 55 | , addItemTree 56 | , removeLink 57 | , addLink 58 | , addLinks 59 | , links' 60 | , removeTopicEditors 61 | , addInfoRemark 62 | , treesWithItems ) where 63 | 64 | import Control.Applicative ((<$>),liftA2) 65 | import Control.Monad 66 | import CSV 67 | import qualified Data.EdgeLabeledGraph as G -- TMP! 68 | import Data.Graph.Inductive 69 | import Data.List 70 | import Data.Maybe 71 | import qualified Data.Map as M 72 | import Data.Function 73 | import qualified Data.Text as T 74 | import Data.List.Split (splitOneOf) 75 | import Data.Tree 76 | import Debug.Trace 77 | import Text.Parsec 78 | import Text.Parsec.String 79 | import Text.Printf 80 | 81 | -- TODO: Abstract Catalogue interface (it shouldn't be exposed as a graph, 82 | -- i.e., hide some constructors and provide operators to get items) 83 | -- TODO: better error handling when reading in the catalogue 84 | -- TODO: switch foo' and foo semantics (primed versions take values, non-primed 85 | -- versions take ids, which is the default) 86 | -- TODO: Rename: CataologueTree -> ItemTree 87 | 88 | ------------------------------------------------------------------------------ 89 | -- Catalogue data structures 90 | ------------------------------------------------------------------------------ 91 | 92 | data ItemType = CAT | KA | KU | KT deriving (Eq,Show,Enum,Read,Ord) 93 | 94 | type ItemEditor = String 95 | 96 | data ItemId = Null | ItemId 97 | { catCode :: String 98 | , areaCode :: String 99 | , unitId :: Maybe Int 100 | , topicId :: Maybe Int } deriving (Eq,Ord,Read,Show) 101 | 102 | {- 103 | instance Show ItemId where 104 | show x = "readItemId \"" ++ showItemId x ++ "\"" 105 | -} 106 | 107 | data Item = Item 108 | { itemId :: ItemId 109 | , itemType :: ItemType 110 | , itemLabel :: String 111 | , itemVersion :: Maybe String 112 | , itemEditors :: [ItemEditor] 113 | , itemRemark :: Maybe String } deriving (Eq,Show,Read,Ord) 114 | 115 | itemType' :: ItemId -> ItemType 116 | itemType' x = case (unitId x, topicId x) of 117 | (Just _, Just _ ) -> KT 118 | (Just _, Nothing) -> KU 119 | _ -> KA 120 | 121 | {- 122 | rootNode :: Item 123 | rootNode = Item 124 | { itemId = ItemId { catCode = "FER3" 125 | , areaCode = "", unitId = Nothing, topicId = Nothing } 126 | , itemType = CAT 127 | , itemLabel = "" 128 | , itemVersion = Nothing 129 | , itemEditors = [] 130 | , itemRemark = Nothing } 131 | -} 132 | 133 | data Link 134 | = SubItem 135 | | Overlaps 136 | | Related 137 | | Prereq 138 | | Alias deriving (Eq,Ord,Show,Read) 139 | 140 | type ItemIndex = T.Text 141 | 142 | instance G.Vertex Item T.Text where 143 | index = T.pack . showItemId . itemId 144 | 145 | data Catalogue = Cat 146 | { catAreas :: G.Graph ItemIndex Item Link 147 | , catId :: String 148 | , catName :: String 149 | , catVersion :: Maybe String 150 | , catDate :: Maybe String 151 | , catEditors :: [String] 152 | , catRemarks :: Maybe String } deriving (Eq,Ord,Show,Read) 153 | 154 | emptyCat :: Catalogue 155 | emptyCat = Cat G.empty "" "" Nothing Nothing [] Nothing 156 | 157 | showItemId :: ItemId -> String 158 | showItemId (ItemId c a (Just u) (Just t)) = printf "%s-%s%02d%02d" c a u t 159 | showItemId (ItemId c a (Just u) Nothing) = printf "%s-%s%02d" c a u 160 | showItemId (ItemId c a Nothing Nothing) = printf "%s-%s" c a 161 | 162 | ------------------------------------------------------------------------------ 163 | -- Catalogue reading from a CSV file 164 | ------------------------------------------------------------------------------ 165 | 166 | readItemId' :: String -> Maybe ItemId 167 | readItemId' s = case parse parseItemId "" s of 168 | Right id -> Just id 169 | _ -> {-trace ("CANNOT READ: " ++ show s) $-} Nothing -- error $ "Cannot read ItemId " ++ show s 170 | 171 | readItemId :: String -> ItemId 172 | readItemId s = case parse parseItemId "" s of 173 | Right id -> id 174 | _ -> error $ "Cannot read ItemId " ++ show s 175 | 176 | parseItemId :: Parser ItemId 177 | parseItemId = do 178 | c <- many letter 179 | char '-' 180 | a <- many letter 181 | u <- optionMaybe $ count 2 digit 182 | t <- optionMaybe $ count 2 digit 183 | return $ ItemId c a (read <$> u) (read <$> t) 184 | 185 | loadCatalogue :: FilePath -> IO (Either ParseError Catalogue) 186 | loadCatalogue f = readCatalogue <$> readFile f 187 | 188 | readCatalogue :: String -> Either ParseError Catalogue 189 | readCatalogue s = (\(c,ls) -> fromJust $ addLinks c ls) . readCat <$> readCSV s 190 | 191 | readCat :: CSV -> (Catalogue,[(ItemId,ItemId,Link)]) 192 | readCat xs = (,links) $ Cat 193 | { catAreas = G.unions $ 194 | map (G.fromTree (const SubItem) . fixTreeTopicIds) forest' 195 | , catId = getFieldMaybe xs 1 1 "" 196 | , catName = getFieldMaybe xs 2 1 "" 197 | , catVersion = getField xs 3 1 198 | , catDate = getField xs 4 1 199 | , catEditors = concat . maybeToList $ ( readEditors <$> getField xs 5 1 ) 200 | , catRemarks = getField xs 6 1 } 201 | where p [] = True 202 | p (x:_) = isNothing $ readItemId' x 203 | forest = readItemForest $ dropWhile p xs 204 | forest' = map (fmap fst) forest 205 | links = mkLinks 0 Overlaps . concatMap flatten $ forest 206 | 207 | mkLinks :: Int -> Link -> [(Item,[Field])] -> [(ItemId,ItemId,Link)] 208 | mkLinks i l = concatMap (\(x1,zs) -> 209 | [(itemId x1,x2,l) | x2 <- fromMaybe [] $ (readItemIds <$> zs !!! i)] ) 210 | 211 | readItemIds :: Field -> [ItemId] 212 | readItemIds = map (readItemId . unwords . words) . splitOneOf ",;." 213 | 214 | readItemForest :: CSV -> [Tree (Item,[Field])] 215 | readItemForest = 216 | map (levelMap readItem) . csvToForest . filter (not . null) 217 | 218 | -- Reads item and the remaining columns 219 | readItem :: Int -> [Field] -> (Item,[Field]) 220 | readItem level xs = {-trace (show xs) $ -} case level of 221 | 0 -> (readFields KA [0,1,6,7] xs, rest) 222 | 1 -> (readFields KU [2,3,6,7] xs, rest) 223 | 2 -> (readFields KT [4,5,6,7] xs, rest) 224 | where rest = drop 8 xs 225 | 226 | readFields :: ItemType -> [Int] -> [Field] -> Item 227 | readFields t ix xs = Item 228 | { itemType = t 229 | , itemId = readItemId $ xs !! (ix !! 0) 230 | , itemLabel = xs !! (ix !! 1) 231 | , itemVersion = Nothing 232 | , itemRemark = xs !!! (ix !! 2) 233 | , itemEditors = readEditors . fromMaybe [] $ xs !!! (ix !! 3) } 234 | 235 | readEditors :: String -> [ItemEditor] 236 | readEditors = filter (not . null) . map (unwords . words) . splitOneOf ",;" 237 | 238 | levelMap :: (Int -> a -> b) -> Tree a -> Tree b 239 | levelMap f = lmap 0 240 | where lmap l (Node x ns) = Node (f l x) (map (lmap $ l+1) ns) 241 | 242 | ------------------------------------------------------------------------------ 243 | -- Catalogue access/modification 244 | ------------------------------------------------------------------------------ 245 | 246 | getItem :: Catalogue -> ItemId -> Maybe Item 247 | getItem c x = G.vertex (T.pack $ showItemId x) (catAreas c) 248 | 249 | addLink :: Catalogue -> (ItemId,ItemId,Link) -> Maybe Catalogue 250 | addLink cat (x1,x2,l) = case (getItem cat x1, getItem cat x2) of 251 | (Just x1, Just x2) -> Just $ cat { catAreas = G.addEdge x1 x2 l (catAreas cat) } 252 | _ -> error $ "Cannot add link " ++ show (x1,x2,l) --Nothing 253 | 254 | removeLink :: Catalogue -> (ItemId,ItemId,Link) -> Catalogue 255 | removeLink c (x1,x2,l) = case (getItem c x1, getItem c x2) of 256 | (Just x1, Just x2) -> c { catAreas = G.removeEdge x1 x2 l $ catAreas c } 257 | _ -> c 258 | 259 | addLinks :: Catalogue -> [(ItemId,ItemId,Link)] -> Maybe Catalogue 260 | addLinks cat = foldM addLink cat 261 | 262 | findItem :: Catalogue -> (Item -> Bool) -> Maybe Item 263 | findItem cat p = find p $ knowledgeItems cat 264 | 265 | getSubitems :: Catalogue -> Item -> [Item] 266 | getSubitems cat x = 267 | mapMaybe (getItem cat . snd) . filter ((==SubItem) . fst) $ itemLinks cat x 268 | 269 | getSubitems' :: Catalogue -> ItemId -> Maybe [Item] 270 | getSubitems' cat x = getSubitems cat <$> getItem cat x 271 | 272 | getAllSubitems :: Catalogue -> Item -> [Item] 273 | getAllSubitems cat x | null xs = xs 274 | | otherwise = nub $ xs ++ concatMap (getSubitems cat) xs 275 | where xs = getSubitems cat x 276 | 277 | getAllSubitems' :: Catalogue -> ItemId -> Maybe [Item] 278 | getAllSubitems' cat x = getAllSubitems cat <$> getItem cat x 279 | 280 | -- dual of: childId 281 | parentId :: ItemId -> ItemId 282 | parentId x = case (unitId x,topicId x) of 283 | (Just _, Just _ ) -> x { topicId = Nothing } 284 | _ -> x { unitId = Nothing } 285 | 286 | -- gets parent item by id 287 | getParentItem' :: Catalogue -> ItemId -> Maybe Item 288 | getParentItem' c = getItem c . parentId 289 | 290 | -- gets parent item by id 291 | getParentItem :: Catalogue -> Item -> Maybe Item 292 | getParentItem c = getParentItem' c . itemId 293 | 294 | modifyItem :: Catalogue -> ItemId -> (Item -> Item) -> Catalogue 295 | modifyItem c x f = case getItem c x of 296 | Just x' -> c { catAreas = G.modifyVertex x' f $ catAreas c } 297 | Nothing -> c 298 | 299 | -- removes an item and all its subitems as well all links pointing to 300 | -- it and the subitems 301 | -- (removing dangling edges is taken care of by 'removeVertex') 302 | removeItem :: Catalogue -> Item -> Catalogue 303 | removeItem c x = c { catAreas = G.removeVertices (x:xs) (catAreas c) } 304 | where xs = getAllSubitems c x 305 | 306 | removeItem' :: Catalogue -> ItemId -> Catalogue 307 | removeItem' c x = fromMaybe c $ removeItem c <$> getItem c x 308 | 309 | removeItems :: Catalogue -> [Item] -> Catalogue 310 | removeItems c = foldl' removeItem c 311 | 312 | removeItems' :: Catalogue -> [ItemId] -> Catalogue 313 | removeItems' c = foldl' removeItem' c 314 | 315 | -- adds an item and links it to its parent by ItemId 316 | -- returns Nothing if the parent doesn't exist 317 | -- NB: Knowlede areas have no parents and hence can always be added! 318 | -- NB: Will replace existing items! 319 | addItem :: Catalogue -> Item -> Maybe Catalogue 320 | addItem c x 321 | | itemType x == KA = Just $ c { catAreas = G.addVertex x (catAreas c) } 322 | | otherwise = 323 | case getParentItem c x of 324 | Just p -> Just $ c { catAreas = G.addEdge p x SubItem (catAreas c) } 325 | Nothing -> Nothing 326 | 327 | addItems :: Catalogue -> [Item] -> Maybe Catalogue 328 | addItems c = foldM addItem c 329 | 330 | -- remove unpopulated areas an units 331 | pruneItems :: Catalogue -> Catalogue 332 | pruneItems c = removeItems c2 ka 333 | where ku = filter (null . getSubitems c) $ knowledgeUnits c 334 | c2 = removeItems c ku 335 | ka = filter (null . getSubitems c2) $ knowledgeAreas c2 336 | 337 | -- redirects incoming links to a new node 338 | -- Subitem links are NOT redirected 339 | redirectLinks :: Catalogue -> Item -> Item -> Catalogue 340 | redirectLinks c x1 x2 = c { catAreas = G.modifyEdges f (catAreas c) } 341 | where f e@(_,SubItem,_) = Just e 342 | f e@(x,l,y) | y==x1 = Just (x,l,x2) 343 | | otherwise = Just e 344 | 345 | redirectLinks' :: Catalogue -> ItemId -> ItemId -> Maybe Catalogue 346 | redirectLinks' c x1 x2 = 347 | liftA2 (redirectLinks c) (getItem c x1) (getItem c x2) 348 | 349 | -- replaces one item by the other: item x1 is deleted and all links 350 | -- (except SubItem links) that were pointing to x1 get redirected to x2 351 | replaceItem :: Catalogue -> Item -> Item -> Catalogue 352 | replaceItem c x1 x2 = removeItem (redirectLinks c x1 x2) x1 353 | 354 | replaceItem' :: Catalogue -> ItemId -> ItemId -> Maybe Catalogue 355 | replaceItem' c x1 x2 = do 356 | c2 <- redirectLinks' c x1 x2 357 | return $ removeItem' c2 x1 358 | 359 | -- splits catalogue according to areas 360 | splitCatalogue :: Catalogue -> [Catalogue] 361 | splitCatalogue = 362 | map (\es@(e:_) -> emptyCat { catAreas = G.fromEdgeList es, 363 | catId = outCatCode e }) . 364 | groupBy ((==) `on` outCatCode) . 365 | sortBy (compare `on` outCatCode) . 366 | filter inCat . 367 | G.toEdgeList . catAreas 368 | where outCatCode (v,_,_) = catCode $ itemId v 369 | inCat (v1,_,v2) = catCode (itemId v1) == catCode (itemId v2) 370 | 371 | catCodes :: Catalogue -> [String] 372 | catCodes = nub . sort . map (catCode . itemId) . G.vertices . catAreas 373 | 374 | knowledgeItems :: Catalogue -> [Item] 375 | knowledgeItems = G.vertices . catAreas 376 | 377 | knowledgeAreas :: Catalogue -> [Item] 378 | knowledgeAreas = filter ((==KA) . itemType) . knowledgeItems 379 | 380 | knowledgeUnits :: Catalogue -> [Item] 381 | knowledgeUnits = filter ((==KU) . itemType) . knowledgeItems 382 | 383 | knowledgeTopics :: Catalogue -> [Item] 384 | knowledgeTopics = filter ((==KT) . itemType) . knowledgeItems 385 | 386 | links' :: Catalogue -> [(ItemId,ItemId,Link)] 387 | links' = map (\(x1,l,x2) -> (itemId x1,itemId x2,l)) . G.toEdgeList . catAreas 388 | 389 | -- TODO: generalize so that it works with topics and areas... 390 | addArea :: Catalogue -> ItemTree -> ItemTree 391 | addArea c n@(Node x ns) = case getItemArea c (itemId x) of 392 | Nothing -> n 393 | Just x' -> Node x' [n] 394 | 395 | getItemArea :: Catalogue -> ItemId -> Maybe Item 396 | getItemArea c x = getItem c $ x { unitId = Nothing, topicId = Nothing } 397 | 398 | -- TODO: change to (ItemId,Link) 399 | itemLinks :: Catalogue -> Item -> [(Link,ItemId)] 400 | itemLinks c x = map (\(l,x) -> (l,itemId x)) . G.outEdges x $ catAreas c 401 | 402 | itemLinks' :: Catalogue -> ItemId -> Maybe [(Link,ItemId)] 403 | itemLinks' c x = itemLinks c <$> getItem c x 404 | 405 | -- changes item ids, inclusive in links 406 | changeItemIds :: Catalogue -> [(ItemId ,ItemId)] -> Catalogue 407 | changeItemIds c xs = c 408 | { catAreas = G.fromEdgeList . map (\(x1,l,x2) -> (f x1,l,f x2)) . 409 | G.toEdgeList $ catAreas c } 410 | where f x = let x' = itemId x in x { itemId = fromMaybe x' $ lookup x' xs } 411 | 412 | itemIdCorrections :: Catalogue -> [(ItemId,ItemId)] 413 | itemIdCorrections c = concatMap (fix0 . fmap itemId) $ catalogueForest c 414 | where fix0 (Node n ns) = concat $ zipWith (fix n) ns [1..] 415 | fix y (Node x ns) i 416 | | x == childId y i = concat $ zipWith (fix x) ns [1..] 417 | | otherwise = (x,childId y i) : (concat $ zipWith (fix x) ns [1..]) 418 | 419 | fixItemIds :: Catalogue -> Catalogue 420 | fixItemIds cat = changeItemIds cat $ itemIdCorrections cat 421 | 422 | childId :: ItemId -> Int -> ItemId 423 | childId x i = case (unitId x,topicId x) of 424 | (Nothing, Nothing) -> x { unitId = Just i } 425 | (_, Nothing) -> x { topicId = Just i } 426 | 427 | -- make faster 428 | nonuniqueIds :: Catalogue -> [ItemId] 429 | nonuniqueIds c = xs \\ nub xs 430 | where xs = map itemId $ knowledgeItems c 431 | 432 | symmetricizeOverlaps :: Catalogue -> Catalogue 433 | symmetricizeOverlaps c = 434 | c { catAreas = G.fromEdgeList . concatMap f $ G.toEdgeList (catAreas c) } 435 | where f l@(x1,Overlaps,x2) = [l,(x2,Overlaps,x1)] 436 | f l = [l] 437 | 438 | ------------------------------------------------------------------------------ 439 | -- Catalogue--ItemTree conversions 440 | ------------------------------------------------------------------------------ 441 | 442 | type ItemTree = Tree Item 443 | 444 | {- 445 | rootConnect :: [ItemTree] -> ItemTree 446 | rootConnect ns = Node rootNode ns 447 | -} 448 | 449 | catalogueFromForest :: [ItemTree] -> Catalogue 450 | catalogueFromForest forest = 451 | emptyCat { catAreas = G.unions $ map (G.fromTree (const SubItem)) forest } 452 | 453 | catalogueForest :: Catalogue -> [ItemTree] 454 | catalogueForest cat = 455 | --sortBy (compare `on` (itemId . rootLabel)) . 456 | map (\x -> G.toTree x (==SubItem) g) $ knowledgeAreas cat 457 | where g = catAreas cat 458 | 459 | -- get item tree rooted in item x 460 | getItemTree :: Catalogue -> ItemId -> Maybe ItemTree 461 | getItemTree c x = (\x -> G.toTree x (==SubItem) (catAreas c)) <$> getItem c x 462 | 463 | -- gets tree that holds items 464 | treesWithItems :: Catalogue -> [ItemId] -> [ItemTree] 465 | treesWithItems c xs = 466 | map (filterTree2 (\x -> itemId x `elem` xs)) ts 467 | where ts = mapMaybe (getItemTree c) . sort . nub $ 468 | map (\x -> x {unitId = Nothing, topicId = Nothing}) xs 469 | 470 | addItemTree :: Catalogue -> ItemTree -> Catalogue 471 | addItemTree c t = 472 | c { catAreas = G.union (catAreas c) $ G.fromTree (const SubItem) t } 473 | 474 | -- fixes itemids but discards all links but subitem links (suitable for components) 475 | fixTreeItemIds :: ItemTree -> ItemTree 476 | fixTreeItemIds = fix0 477 | where fix0 (Node x ns) = Node x $ zipWith (fix x) ns [1..] 478 | fix y (Node x ns) i 479 | | itemId x == childId (itemId y) i = 480 | Node x $ zipWith (fix x) ns [1..] 481 | | otherwise = 482 | Node (x { itemId = childId (itemId y) i }) $ zipWith (fix x) ns [1..] 483 | 484 | fixTreeTopicIds :: ItemTree -> ItemTree 485 | fixTreeTopicIds = fix0 486 | where fix0 (Node x ns) = Node x $ zipWith (fix x) ns [1..] 487 | fix y (Node x ns) i 488 | | itemType x == KT && itemId x /= childId (itemId y) i = 489 | Node (x { itemId = childId (itemId y) i }) $ zipWith (fix x) ns [1..] 490 | | otherwise = 491 | Node x $ zipWith (fix x) ns [1..] 492 | 493 | removeTopicEditors :: Catalogue -> Catalogue 494 | removeTopicEditors c = 495 | foldl' (\c x -> modifyItem c (itemId x) f) c (knowledgeTopics c) 496 | where f x = x { itemEditors = [] } 497 | 498 | ------------------------------------------------------------------------------ 499 | -- Catalogue output to CSV 500 | ------------------------------------------------------------------------------ 501 | 502 | csvCatalogueSubset :: Catalogue -> [ItemId] -> CSV 503 | csvCatalogueSubset c = concatMap (csvItemTree c) . treesWithItems c 504 | 505 | csvItemTree :: Catalogue -> ItemTree -> CSV 506 | csvItemTree c (Node x ns) = csvItem c x : concatMap (csvItemTree c) ns 507 | 508 | csvItem :: Catalogue -> Item -> Row 509 | csvItem c x 510 | | itemType x == KA = 511 | [showItemId $ itemId x,itemLabel x,"","","","", 512 | fromMaybe "" $ itemRemark x,showItemEditors x,overlaps] 513 | | itemType x == KU = 514 | ["","",showItemId $ itemId x,itemLabel x,"","", 515 | fromMaybe "" $ itemRemark x,showItemEditors x,overlaps] 516 | | itemType x == KT = 517 | ["","","","",showItemId $ itemId x,itemLabel x, 518 | fromMaybe "" $ itemRemark x,showItemEditors x,overlaps] 519 | where overlaps = intercalate ", " $ 520 | map (showItemId . snd) . filter ((==Overlaps) . fst) $ itemLinks c x 521 | 522 | showItemEditors :: Item -> String 523 | showItemEditors = intercalate ", " . itemEditors 524 | 525 | header = 526 | [ "KA Id","KA Name","KU Id","KU Name","KT Id","KT Name","Remark" 527 | , "Editors","Overlaps" ] 528 | 529 | csvCatalogue :: Catalogue -> CSV 530 | csvCatalogue c = 531 | ["FER3 Knowledge Catalogue"] : 532 | ["Catalogue", catName c] : 533 | ["Catalogue ID", catId c] : 534 | ["Version", fromMaybe "" $ catVersion c] : 535 | ["Date", fromMaybe "" $ catDate c] : 536 | ["Editors", intercalate ", " $ catEditors c] : 537 | ["Remark", fromMaybe "" $ catRemarks c] : [] : 538 | header : [] : 539 | (concatMap (csvItemTree c) $ catalogueForest c) 540 | 541 | saveCatalogue :: FilePath -> Catalogue -> IO () 542 | saveCatalogue f = writeFile f . showCSV . csvCatalogue 543 | 544 | addInfoRemark :: Catalogue -> Catalogue 545 | addInfoRemark c = 546 | c { catRemarks = Just $ printf "%d KAs, %d KUs, %d KTs" 547 | (length $ knowledgeAreas c) 548 | (length $ knowledgeUnits c) 549 | (length $ knowledgeTopics c) } 550 | 551 | addItemRemark :: Catalogue -> ItemId -> String -> Catalogue 552 | addItemRemark c x r = modifyItem c x addRemark 553 | where addRemark x = x 554 | { itemRemark = case itemRemark x of 555 | Nothing -> Just r 556 | Just r' -> Just $ r ++ "; " ++ r' } 557 | 558 | filterTree :: Eq a => (a -> Bool) -> Tree a -> Tree a 559 | filterTree p (Node x ns) = 560 | Node x . map (filterTree p) $ filter (any p . flatten) ns 561 | 562 | -- includes hanging substrees 563 | filterTree2 :: Eq a => (a -> Bool) -> Tree a -> Tree a 564 | filterTree2 p n@(Node x ns) 565 | | p x = n 566 | | otherwise = Node x . map (filterTree2 p) $ filter (any p . flatten) ns 567 | 568 | 569 | ------------------------------------------------------------------------------ 570 | -- TMP 571 | 572 | {- 573 | main = do 574 | Right (c,ls) <- loadCatalogueComponent "../data/catalogue/v0.2/components-resolved-csv/g020-c022.res.csv" 575 | printCSV $ csvCatalogue c 576 | print $ map (\(x1,x2,l) -> (showItemId x1, showItemId x2, l)) ls 577 | -} 578 | 579 | main2 = do 580 | Right c <- loadCatalogue "../data/catalogue/v0.2/catalogue/FER3-v0.2.3.csv" 581 | printCSV $ csvCatalogue c 582 | 583 | correct = do 584 | Right c <- loadCatalogue "../data/catalogue/v0.2/catalogue/FER3-v0.2.3.csv" 585 | let xs = itemIdCorrections c 586 | saveCatalogue "FER3-v0.2.3.csv" $ changeItemIds c xs 587 | 588 | removeOld = do 589 | Right c <- loadCatalogue "../data/catalogue/v0.3/FER3-v0.2.4.csv" 590 | return . removeTopicEditors . pruneItems $ removeItems' c (map readItemId xs) 591 | --todo: fix tree topic IDS 592 | 593 | sym = do 594 | Right c <- loadCatalogue "../data/catalogue/v0.3/FER3-v0.2.5.csv" 595 | return $ symmetricizeOverlaps c 596 | 597 | xs = ["CS-SE","SE-SE","CE-SE","TI-PS08","TI-PS10","TI-PS12","TI-PS13","TI-PS14","WT-CT","TI-CN","TI-DA","WT-CS","WT-OC","SE-AL","CE-PRF","CS-PL","SE-PL","TI-DN03","TI-DN04","TI-DN05","TI-PS01","TI-PS02","TI-PS03","TI-PS04","TI-PS05","TI-PS06","TI-PS07","TI-PS09","CS-AL","CE-SDF","CS-SDF","SE-SDF","CE-DBS","CS-IM","SE-IM","TI-DK","SE-INS","CE-PBD","CS-PBD","SE-PBD","CS-CN","CS-SF"] 598 | 599 | main3 = do 600 | Right c <- loadCatalogue "../data/catalogue/v0.3/FER3-KC-v0.2.9.csv" 601 | let c2 = removeTopicEditors . addInfoRemark $ symmetricizeOverlaps c 602 | saveCatalogue "../data/catalogue/v0.3/FER3-KC-v0.3.0.csv" c2 603 | 604 | 605 | {- 606 | 607 | Right c <- loadCatalogue "../data/catalogue/v0.2/catalogue/FER3-v0.2.2.csv" 608 | printCSV $ csvCatalogue c 609 | 610 | Right c <- loadCatalogue "../data/catalogue/v0.2/components-resolved-csv/g020-c022.res.csv" 611 | printCSV $ csvCatalogue c 612 | 613 | let t = catalogueForest c 614 | putStr $ showCSV $ concatMap (csvItemTree c) t 615 | 616 | let Just t = getItemTree c (fromJust $ readItemId "CS-IS") 617 | putStr $ showCSV $ csvItemTree c t 618 | 619 | -} 620 | 621 | -------------------------------------------------------------------------------- /src/fer3.hs: -------------------------------------------------------------------------------- 1 | -- 2 | -- FER3 Knowledge Catalogue Processing 3 | -- (c) 2014 Jan Snajder, FER 4 | -- 5 | ------------------------------------------------------------------------------ 6 | 7 | import Control.Applicative ((<$>)) 8 | import Control.Monad 9 | import Data.Function 10 | import Data.List.Split (splitOn,splitOneOf) 11 | import Data.Char 12 | import qualified Data.Foldable 13 | import Data.List 14 | import Data.Map (Map) 15 | import qualified Data.Map as M 16 | import Data.Maybe 17 | import Data.Ord hiding (compare) 18 | import Data.Set (Set) 19 | import qualified Data.Set as S 20 | import Data.Tree 21 | import Data.Traversable hiding (forM) 22 | import qualified NLP.Stemmer as Stemmer 23 | import Prelude hiding (sequence,compare) 24 | import System.FilePath 25 | import System.IO 26 | import Text.Parsec hiding (label,labels) 27 | import Text.Parsec.String 28 | import Text.Printf 29 | 30 | ------------------------------------------------------------------------------ 31 | 32 | type Catalogue = Tree Item 33 | 34 | data ItemType = CAT | KA | KU | KT deriving (Eq,Show,Enum,Read) 35 | 36 | data Item = Item 37 | { itemType :: ItemType 38 | , itemId :: String 39 | , itemLabel :: String 40 | , itemPath :: [String] 41 | , itemVersion :: Maybe String 42 | , itemEditors :: [String] 43 | , itemRemark :: Maybe String } deriving (Eq,Show,Read) 44 | 45 | ------------------------------------------------------------------------------ 46 | -- Basic querying 47 | ------------------------------------------------------------------------------ 48 | 49 | catStats :: Catalogue -> (Int,Int,Int) 50 | catStats c = (ka,ku,kt) 51 | where [ka,ku,kt] = map (\t -> length $ catItems t c) [KA,KU,KT] 52 | 53 | catItems :: ItemType -> Catalogue -> [Item] 54 | catItems t = filter ((==t) . itemType) . flatten 55 | 56 | catLabels :: Catalogue -> [String] 57 | catLabels = map itemLabel . flatten 58 | 59 | catEditors :: Catalogue -> [String] 60 | catEditors = 61 | sortBy (comparing (f . words)) . nub . concatMap itemEditors . flatten 62 | where f (_:w:ws) = w:ws 63 | f s = s 64 | 65 | -- Return subcatalogues edited by a given editor (excluding their 66 | -- subcatalogues) 67 | catEditedBy :: String -> Catalogue -> [Catalogue] 68 | catEditedBy e c@(Node x ns) 69 | | e `elem` itemEditors x = [c] 70 | | otherwise = concatMap (catEditedBy e) ns 71 | 72 | ------------------------------------------------------------------------------ 73 | -- CSV input 74 | ------------------------------------------------------------------------------ 75 | 76 | stringCell :: Parser String 77 | stringCell = do 78 | s <- quotedCell <|> many1 (noneOf ",\n\r") 79 | char ',' <|> lookAhead newline 80 | return s 81 | 82 | quotedCell :: Parser String 83 | quotedCell = 84 | do char '"' 85 | content <- many quotedChar 86 | char '"' > "quote at end of cell" 87 | return content 88 | 89 | quotedChar :: Parser Char 90 | quotedChar = noneOf "\"" <|> try (string "\"\"" >> return '"') 91 | 92 | readCatalogue :: String -> Catalogue 93 | readCatalogue s = case parse catalogue "(unknown)" s of 94 | Right c -> c 95 | Left e -> error $ show e 96 | 97 | nextLine :: Parser () 98 | nextLine = anyChar `manyTill` newline >> return () 99 | 100 | catalogue :: Parser Catalogue 101 | catalogue = do 102 | count 2 nextLine 103 | label <- string "Catalogue:,," >> stringCell 104 | nextLine 105 | id <- string "Catalogue ID:,," >> stringCell 106 | nextLine 107 | version <- string "Version:,," >> stringCell 108 | count 2 nextLine 109 | editor <- string "Editor(s):,," >> stringCell 110 | count 4 nextLine 111 | xs <- many (item 1) 112 | eof 113 | return $ Node (Item CAT id label [label] (Just version) 114 | (stringSequence editor) Nothing) xs 115 | 116 | stringSequence :: String -> [String] 117 | stringSequence [] = [] 118 | stringSequence s = map (unwords . words) $ splitOneOf ",;" s 119 | 120 | idLabel :: Parser (String,String) 121 | idLabel = do 122 | id <- stringCell > "item identifier" 123 | label <- try stringCell <|> (char ',' >> return "") > "item label" 124 | return (id,label) 125 | 126 | remarkEditor :: Int -> Parser (Maybe String,[String]) 127 | remarkEditor n = option (Nothing,[]) $ do 128 | count n (char ',') 129 | remark <- (Just <$> stringCell) <|> (char ',' >> return Nothing) 130 | editor <- option [] stringCell 131 | return (remark,stringSequence editor) 132 | 133 | item :: Int -> Parser (Tree Item) 134 | item level = do 135 | count ((level-1)*2) (char ',') 136 | (id,label) <- idLabel 137 | (remark,editor) <- remarkEditor (4 - (level-1)*2) 138 | nextLine 139 | optional $ try emptyRow 140 | xs <- if level==3 then return [] else many . try . item $ level + 1 141 | return $ Node (Item (toEnum level) id label [] Nothing editor remark) xs 142 | 143 | emptyRow :: Parser () 144 | emptyRow = manyTill (char ',') newline >> return () 145 | 146 | ------------------------------------------------------------------------------ 147 | -- CSV output 148 | ------------------------------------------------------------------------------ 149 | 150 | csvCatalogue :: Catalogue -> String 151 | csvCatalogue c@(Node x ns) = 152 | "FER3 Knowledge Catalogue,,,,,,,\n" ++ 153 | printf "Catalogue:,,%s,,,,,\nCatalogue ID:,,%s,,,,,\nVersion:,,%s,,,,,\nDate:,\n" 154 | (csvQuote $ itemLabel x) (itemId x) (fromMaybe "NA" $ itemVersion x) ++ 155 | printf "Editor(s):,,%s,,,,,\nComment:,,%s\n" 156 | (csvQuote . showSequence $ itemEditors x) 157 | (csvQuote . fromMaybe "" $ itemRemark x) ++ 158 | ",,,,,,,\n" ++ 159 | "KA ID,KA Name,KU ID,KU Name,KT ID,KT Name,Comment,Editor(s)\n" ++ 160 | concatMap (csvItem 0) ns 161 | 162 | csvItem :: Int -> Catalogue -> String 163 | csvItem n c@(Node x ns) = 164 | replicate (n*2) ',' ++ 165 | csvQuote (itemId x) ++ "," ++ 166 | csvQuote (itemLabel x) ++ "," ++ 167 | replicate (4 - n*2) ',' ++ 168 | csvQuote (fromMaybe "" $ itemRemark x) ++ "," ++ 169 | csvQuote (showSequence $ itemEditors x) ++ "\n" ++ 170 | concatMap (csvItem (n+1)) ns 171 | 172 | csvQuote :: String -> String 173 | csvQuote s | needsQuotes = "\"" ++ escape s ++ "\"" 174 | | otherwise = s 175 | where needsQuotes = any (==',') s 176 | escape [] = [] 177 | escape ('"':xs) = "\"\"" ++ escape xs 178 | escape (x:xs) = x : escape xs 179 | 180 | ------------------------------------------------------------------------------ 181 | -- Fix catalogue 182 | ------------------------------------------------------------------------------ 183 | 184 | fixIds :: Catalogue -> Catalogue 185 | fixIds (Node x ns) = Node x $ 186 | zipWith f (map (itemId . rootLabel) ns) ns 187 | where f id (Node x xs) = Node (x { itemId = id}) 188 | (zipWith f (ids (itemId x)) xs) 189 | ids p = [ printf "%s%02d" p i | i <- [(1::Int)..]] 190 | 191 | prefixRootId :: Catalogue -> Catalogue 192 | prefixRootId (Node x xs) = Node x (map (fmap f) xs) 193 | where f y = y { itemId = itemId x ++ "-" ++ itemId y } 194 | 195 | fixEditors :: Catalogue -> Catalogue 196 | fixEditors c@(Node x ns) = fix [] (Node x' ns) 197 | where fix e (Node x ns) = 198 | let e' = if null (itemEditors x) then e else itemEditors x 199 | in Node (x { itemEditors = e' } ) $ map (fix e') ns 200 | x' | itemType x == CAT = 201 | x { itemEditors = map (++ " (CE)") $ itemEditors x } 202 | | otherwise = x 203 | 204 | fixPaths :: Catalogue -> Catalogue 205 | fixPaths c = fix [] c 206 | where fix z (Node x ns) = 207 | let z' = z ++ [printf "[%s] %s" (itemId x) (itemLabel x)] 208 | in Node (x { itemPath = z' }) $ map (fix z') ns 209 | 210 | fixVersion :: Catalogue -> Catalogue 211 | fixVersion c@(Node x ns) = fix (itemVersion x) c 212 | where fix v (Node x ns) = Node (x { itemVersion = v }) $ map (fix v) ns 213 | 214 | fixLabels :: Catalogue -> Catalogue 215 | fixLabels = fmap (\x -> x {itemLabel = fixLabel (itemLabel x)}) 216 | 217 | -- Remove superflous spaces, capitalize sublabels, remove trailing puncuation 218 | fixLabel :: String -> String 219 | fixLabel [] = [] 220 | fixLabel s = 221 | rtp . intercalate "; " . map upper . splitOn "; " . unwords $ words s 222 | where upper (c:cs) = toUpper c : cs 223 | rtp = reverse . dropWhile isPunct . reverse 224 | isPunct c = c=='.' || c==';' 225 | 226 | fixCatalogue :: Catalogue -> Catalogue 227 | fixCatalogue = fixPaths . fixEditors . fixLabels . prefixRootId . fixIds 228 | 229 | ------------------------------------------------------------------------------ 230 | -- Semantic representations 231 | ------------------------------------------------------------------------------ 232 | 233 | type SemCatalogue a = Tree (Item,a) 234 | 235 | class SemRep a where 236 | compare :: a -> a -> Double 237 | compose :: a -> a -> a 238 | 239 | semCatalogue :: SemRep a => (String -> a) -> Catalogue -> SemCatalogue a 240 | semCatalogue f = fmap (\x -> (x,f $ itemLabel x)) 241 | 242 | ------------------------------------------------------------------------------ 243 | -- Simple BOW representation 244 | ------------------------------------------------------------------------------ 245 | 246 | normalize :: String -> String 247 | normalize = Stemmer.stem Stemmer.English . lower 248 | where lower s | any isLower s = map toLower s 249 | | otherwise = s 250 | 251 | tokenize :: String -> [String] 252 | tokenize = 253 | filter (not . null) . concatMap slashSplit . splitOneOf " .,:;’'`\"()[]“”" 254 | where notAcronym = any isLower 255 | slashSplit s = if notAcronym s then splitOn "/" s else [s] 256 | 257 | newtype Bow = Bow (Set String) deriving (Eq,Show) 258 | 259 | instance SemRep Bow where 260 | compare (Bow xs) (Bow ys) = dice xs ys 261 | compose (Bow xs) (Bow ys) = Bow $ S.union xs ys 262 | 263 | dice :: (Ord a, Eq a) => Set a -> Set a -> Double 264 | dice xs ys = 265 | (realToFrac $ 2 * S.size zs) / (realToFrac $ S.size xs + S.size ys) 266 | where zs = S.intersection xs ys 267 | 268 | bow :: [String] -> String -> Bow 269 | bow sw = Bow . S.map normalize . S.filter f . S.fromList . tokenize 270 | where f s = length s > 2 && s `notElem` sw 271 | 272 | semCatalogueBow :: [String] -> Catalogue -> SemCatalogue Bow 273 | semCatalogueBow sw = semCatalogue (bow sw) 274 | 275 | ------------------------------------------------------------------------------ 276 | -- TfIdf BOW representation 277 | ------------------------------------------------------------------------------ 278 | 279 | type Vector a = Map a Double 280 | newtype TfIdfBow = TfIdfBow (Vector String) deriving (Eq,Show) 281 | 282 | type IdfMap = Map String Double 283 | 284 | idfMap :: Catalogue -> IdfMap 285 | idfMap c = M.fromList . map (\t -> (t, idf n ds t)) $ catTerms c 286 | where n = length ls 287 | ls = catLabels c 288 | ds = map (map normalize . tokenize) ls 289 | 290 | idf :: Int -> [[String]] -> String -> Double 291 | idf n ds t | df==0 = 0 292 | | otherwise = log $ (realToFrac n) / df 293 | where df = realToFrac . length $ filter (t `elem`) ds 294 | 295 | catTerms :: Catalogue -> [String] 296 | catTerms = concatMap (map normalize . tokenize) . catLabels 297 | 298 | tfIdfBow :: [String] -> IdfMap -> String -> TfIdfBow 299 | tfIdfBow sw m s = 300 | TfIdfBow . M.fromList $ map (\t -> (t, tf t * idf t)) ts 301 | where ts = map normalize . filter f $ tokenize s 302 | tf t = realToFrac . length $ filter (==t) ts 303 | idf t = M.findWithDefault 0 t m 304 | f t = length t > 2 && t `notElem` sw 305 | 306 | instance SemRep TfIdfBow where 307 | compare (TfIdfBow x) (TfIdfBow y) = cosine x y 308 | compose (TfIdfBow x) (TfIdfBow y) = TfIdfBow $ M.unionWith (+) x y 309 | 310 | cosine :: Ord a => Vector a -> Vector a -> Double 311 | cosine x y = dot x y / (sqrt (dot x x) * sqrt (dot y y)) 312 | where dot x y = sum . map snd . M.toList $ M.intersectionWith (*) x y 313 | 314 | semCatalogueTfIdf :: [String] -> Catalogue -> SemCatalogue TfIdfBow 315 | semCatalogueTfIdf sw c = semCatalogue (tfIdfBow sw (idfMap c)) c 316 | 317 | ------------------------------------------------------------------------------ 318 | -- Similarity computation 319 | ------------------------------------------------------------------------------ 320 | 321 | type CatalogueSim a = SemCatalogue a -> SemCatalogue a -> Double 322 | 323 | sim1 :: SemRep a => CatalogueSim a 324 | sim1 (Node (_,r1) _) (Node (_,r2) _) = compare r1 r2 325 | 326 | sim2 :: SemRep a => CatalogueSim a 327 | sim2 c1 c2 = compare r1 r2 328 | where r1 = foldl1 compose . map snd $ flatten c1 329 | r2 = foldl1 compose . map snd $ flatten c2 330 | 331 | sim3 :: (SemRep a, Eq a) => CatalogueSim a 332 | sim3 c1@(Node (x1,r1) _) c2@(Node (x2,r2) _) 333 | | ((itemType x1 == KT && itemType x2 /= KT) || 334 | (itemType x1 /= KT && itemType x2 == KT)) && (not $ sameArea c1' c2') 335 | = 1 - sqrt (1 - compare r1 r2) 336 | | itemType x1 `elem` [KA,KU] && itemType x2 `elem` [KA,KU] && (not $ sameArea c1' c2') 337 | = compare r1' r2' 338 | | otherwise = 0 339 | where 340 | r1' = foldl1 compose . map snd $ flatten c1 341 | r2' = foldl1 compose . map snd $ flatten c2 342 | c1' = fmap fst c1 343 | c2' = fmap fst c2 344 | 345 | samePath :: Catalogue -> Catalogue -> Bool 346 | samePath c1 c2 = c1 `isSubtree` c2 || c2 `isSubtree` c1 347 | 348 | -- tmp, because there are no backpointers! 349 | sameArea :: Catalogue -> Catalogue -> Bool 350 | sameArea c1 c2 = a1 `isPrefixOf` a2 || a2 `isPrefixOf` a1 351 | where a1 = areaId $ rootLabel c1 352 | a2 = areaId $ rootLabel c2 353 | --sameArea c c1 c2 = any (\a -> c1 `isSubtree` a && c2 `isSubtree` a) areas 354 | -- where areas = subForest c 355 | 356 | areaId :: Item -> String 357 | areaId = takeWhile (not . isDigit) . itemId 358 | 359 | minSim = 0.33 360 | 361 | catCompareOn :: SemRep a => 362 | CatalogueSim a -> SemCatalogue a -> [SemCatalogue a] -> [(Double,Item)] 363 | catCompareOn f x = 364 | sortBy (flip $ comparing fst) . filter ((>=minSim) . fst) . 365 | map (\y -> (f x y, fst $ rootLabel y)) 366 | 367 | -- catCompare f x c: compares x against all subcatalogues of c, 368 | -- excluding x, all supercatalogues of x, and all subcatalogues of x 369 | catCompare :: (SemRep a, Eq a) => 370 | CatalogueSim a -> SemCatalogue a -> SemCatalogue a -> [(Double,Item)] 371 | catCompare f x = catCompareOn f x . subtrees 372 | --where sc = filter (\y -> not (y `isSubtree` x || x `isSubtree` y)) $ subtrees c 373 | 374 | subtrees :: Tree a -> [Tree a] 375 | subtrees n@(Node _ ns) = n : concatMap subtrees ns 376 | --subtrees n@(Node _ ns) = concatMap subtrees ns 377 | 378 | -- subtree test (not invariant to subforest order!) 379 | isSubtree :: Eq a => Tree a -> Tree a -> Bool 380 | isSubtree x@(Node _ xs) y@(Node _ ys) = x==y || any (isSubtree x) ys 381 | 382 | subtreesTree :: Tree a -> Tree (Tree a) 383 | subtreesTree n@(Node _ ns) = Node n (map subtreesTree ns) 384 | 385 | type SimRank = [(Double,Item)] 386 | 387 | type SimCatalogue = Tree (Item, SimRank) 388 | 389 | simCatalogue :: (SemRep a, Eq a) => 390 | CatalogueSim a -> SemCatalogue a -> SimCatalogue 391 | simCatalogue s c = 392 | fmap (\n -> (fst $ rootLabel n, catCompare s n c)) $ subtreesTree c 393 | 394 | ------------------------------------------------------------------------------ 395 | -- HTML output 396 | ------------------------------------------------------------------------------ 397 | 398 | htmlPage :: String -> String 399 | htmlPage s = 400 | "\n
\n" ++ 401 | "" ++ 402 | "Version: %s (Editors...)
"
410 | (fromMaybe "NA" $ itemVersion x) ++
411 | printf "%d areas, %d units, %d topics
| Knowledge Area | Knowledge Unit | Knowledge Topic |
|---|
Version: %s (Editors...)
"
467 | (fromMaybe "NA" $ itemVersion x) ++
468 | printf "%d areas, %d topics, %d units
| Knowledge Area | Knowledge Unit | Knowledge Topic | SimIndex |
|---|
Editor(s): %s
Remark: %s
| Knowledge Area/Unit/Topic | Editor(s) | " ++ 502 | concatMap htmlSimItem r ++ "
|---|
Version: %s (Catalogue...)
"
516 | (fromMaybe "NA" $ itemVersion x) ++
517 | printf "%d editors
| Knowledge Area/Unit/Topic | " ++ 526 | concatMap (\x -> printf "
|---|