[Haskell-beginners] first open source haskell project and a mystery to boot
Alia
alia_khouri at yahoo.com
Wed Oct 12 20:59:30 CEST 2011
Hi folks,
Given that I received such excellent help from this newsgroup recently, I wanted to share my first
open-source haskell project available here: https://github.com/aliakhouri/newsagent
It's a simple command line feed (atom, rss) retriever and analyzer in the early stages of development,
using the excellent feed / tagsoup libs to download and analyze feeds from the net.
The real intention is to use it as a platform to learn about information retrieval and machine learning
techniques in haskell.
To this end, I was searching for classification algorithms and I was on the lookout for a nice
clear implementation in haskell of canonical decision tree based classification algorithms.
My first discovery was an old DecisionTree package on hackage but it's poorly documented
and has no examples of usage. So I kept searching...
Then I found an hpaste page (http://hpaste.org/steps/11355) which looked at lot more
promising, but it also has no example or documentation. In fact, it's an island of code without
any references (I don't know who the author is) and nobody has ever referred to it by url or by
blog post). It's a mystery to me.
In any case, I've tried to create a working example but I'm stuck because you can't mix
strings and numbers in a list, and I can't decide whether that's when the author gave up,
or whether I've missed the point. Like I said it's a mystery.
I would appreciate if anyone could shed some light on this whimsical problem.
In case you are wondering why this is relevant to the beginner's forum. Well...
firstly, I am a beginner, and, er... the code is short enough to serve pedagogical purposes (-;
AK
<ID3.hs>
-- | This module is a generic implementation
-- of the ID3 decision tree algorithm.
--
-- A choice node on a ``continuous'' attribute is
-- handled by splitting the population in two via the mean attribute value.
module ID3 where
import Data.Ord
import Data.List
import qualified Data.Map as Map
data DecisionTree item outcome = Choice (item -> DecisionTree item outcome)
| Leaf outcome
data Attribute item = Discrete (item -> Integer)
| Continuous (item -> Double)
runDecisionTree :: DecisionTree item outcome -> item -> outcome
runDecisionTree (Leaf outcome) _ = outcome
runDecisionTree (Choice f) item = runDecisionTree (f item) item
id3 :: (Ord outcome) => [Attribute item] -> [(item, outcome)] -> DecisionTree item outcome
-- When there are no unused attributes left, select the most common outcome.
id3 [] xs = Leaf $ fst $ head $ sortBy (comparing (negate.snd)) $ histogram (map snd xs)
-- When all the items have the same outcome, pick that outcome
id3 attrs xs | allEqual (map snd xs) = Leaf $ snd $ head xs
-- Otherwise pick the attribute with minimum entropy
| otherwise =
let (bestAttr:moreAttrs) = sortBy (comparing (informationGain xs)) attrs in
case bestAttr of
Discrete attr ->
let attrTreeMap = Map.fromList attrTrees
allAttrValues = nub $ map (attr . fst) xs
subtree v = id3 moreAttrs (filter (\(x,_) -> v /= attr x) xs)
attrTrees = [(v, subtree v) | v <- allAttrValues]
in Choice $ \item -> case Map.lookup (attr item) attrTreeMap of
Just subtree -> subtree
Nothing -> error "id3: encountered a discrete attribute value that wasn't in the training set"
Continuous attr ->
let meanv = mean (map (attr.fst) xs)
ltTree = id3 moreAttrs (filter (\(x,_) -> attr x < meanv) xs)
gtTree = id3 moreAttrs (filter (\(x,_) -> attr x >= meanv) xs)
in Choice $ \item -> if attr item < meanv
then ltTree
else gtTree
informationGain :: Ord outcome => [(item, outcome)] -> Attribute item -> Double
informationGain xs (Discrete attr) =
currentEntropy - sum (map term allAttributeValues)
where
currentEntropy = entropy (map snd xs)
term a = probabilityOf (==a) * entropy (outcomesFor (==a))
probabilityOf f = fromIntegral (length (outcomesFor f)) / fromIntegral (length xs)
outcomesFor f = map snd $ filter (f . attr . fst) xs
allAttributeValues = nub $ map (attr . fst) xs
informationGain xs (Continuous attr) =
currentEntropy - term (< meanv) - term (>= meanv)
where
currentEntropy = entropy (map snd xs)
term f = probabilityOf f * entropy (outcomesFor f)
probabilityOf f = fromIntegral (length (outcomesFor f)) / fromIntegral (length xs)
outcomesFor f = map snd $ filter (f . attr . fst) xs
meanv = mean (map (attr.fst) xs)
entropy :: Ord a => [a] -> Double
entropy xs = sum $ map (\(_,n) -> term (fromIntegral n)) $ histogram xs
where term 0 = 0
term n = - (n / num) * log (n / num) / log 2
num = fromIntegral (length xs)
histogram :: Ord a => [a] -> [(a, Int)]
histogram = buildHistogram Map.empty
where buildHistogram map [] = Map.assocs map
buildHistogram map (x:xs) = buildHistogram (Map.insertWith (+) x 1 map) xs
-- Simple "utility" functions
allEqual :: Eq a => [a] -> Bool
allEqual = and . mapAdjacent (==)
mapAdjacent :: (a -> a -> b) -> [a] -> [b]
mapAdjacent f xs = zipWith f xs (tail xs)
mean :: (Real a, Fractional n) => [a] -> n
mean xs = realToFrac (sum xs) / realToFrac (length xs)
--------------------------------------------------------------------
-- Testing Area
--------------------------------------------------------------------
outlook s
| s == "sunny" = 1
| s == "overcast" = 2
| s == "rain" = 3
temp :: (Real a, Fractional n) => a -> n
temp i = (realToFrac i) / (realToFrac 100)
humidity :: (Real a, Fractional n) => a -> n
humidity i = (realToFrac i) / (realToFrac 100)
windy x
| x == False = 0
| x == True = 1
-- attributes
a1 = Discrete outlook
a2 = Continuous temp
a3 = Continuous humidity
a4 = Discrete windy
outlookData = ["sunny","sunny","overcast","rain","rain","rain","overcast","sunny","sunny","rain","sunny","overcast","overcast","rain"]
tempData = [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71]
humidityData = [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80]
windyData = [False, True, False, False, False, True, True, False, False, False, True, True, False, True]
outcomes = [0,0,1,1,1,0,1,0,1,1,1,1,1,0]
d1 = zip outlookData outcomes
d2 = zip tempData outcomes
d3 = zip humidityData outcomes
d4 = zip windyData outcomes
t1 = id3 [a1] d1
t2 = id3 [a2] d2
t3 = id3 [a3] d3
t4 = id3 [a4] d4
--t5 = id3 [a1,a2,a3,a4] [d1,d2,d3,d4]
-- doesn't work because you can't mix strings and numbers in a list
--
----------------
</ID3.hs>
More information about the Beginners
mailing list