1
1
mirror of https://github.com/github/semantic.git synced 2024-12-24 23:42:31 +03:00

Compute feature vectors from Bags of p,q-grams.

This commit is contained in:
Rob Rix 2016-06-21 18:50:06 -04:00
parent 226be486c8
commit 686f94d53e

View File

@ -1,9 +1,11 @@
module Data.Gram where
import Data.DList
import Control.Monad.Random
import Data.DList as DList
import Data.Hashable
import Data.Vector as Vector
import Prologue
import Test.QuickCheck.Random
-- | A gram over labels, holding two label lists: a 'stem' and a 'base'.
-- NOTE(review): presumably the stem/base of a p,q-gram (ancestor labels and
-- sibling labels respectively) — confirm against the code that builds grams.
data Gram label = Gram { stem :: [label], base :: [label] }
@ -13,6 +15,14 @@ serialize gram = stem gram <> base gram
-- | A bag of elements, represented as a difference list ('DList').
type Bag = DList
-- | Compute a @d@-dimensional feature vector for a bag of grams.
--
-- Each gram's 'hash' seeds a QuickCheck generator ('mkQCGen') from which @d@
-- random components are drawn; that vector is scaled by its magnitude, and the
-- scaled vectors of all grams in the bag are summed component-wise. Because
-- the generator is seeded by the hash, grams with equal hashes always
-- contribute the same vector. An empty bag yields a vector of @d@ zeroes.
featureVector :: Hashable label => Bag (Gram label) -> Int -> Vector Rational
featureVector bag d = sumVectors $ unitDVector . hash <$> bag
  where
    -- Draw @d@ random components from a generator seeded with the gram's hash.
    -- The parameter is named @seed@ so it does not shadow the 'hash' method.
    unitDVector seed = normalize . (`evalRand` mkQCGen seed) $ Vector.replicateM d getRandom
    -- Scale a vector by its magnitude. The magnitude is computed once and
    -- shared; a zero magnitude leaves the vector untouched instead of raising
    -- a divide-by-zero error on 'Rational'.
    normalize vec
      | len == 0 = vec
      | otherwise = fmap (/ len) vec
      where len = magnitude vec
    -- Euclidean norm. The square root is taken in 'Double', so the result is
    -- only an approximation of the exact norm.
    magnitude vec = toRational (sqrtDouble (fromRational (Vector.sum (fmap (^^ (2 :: Integer)) vec))))
    -- Component-wise sum of every vector in the bag, starting from zeroes.
    sumVectors = DList.foldr (Vector.zipWith (+)) (Vector.replicate d 0)
-- | A gram hashes as its serialized label sequence (see 'serialize').
instance Hashable label => Hashable (Gram label) where
  -- Thread the caller's salt through to the serialized labels. Discarding it
  -- would make every salted hash of a gram identical to its unsalted hash,
  -- which defeats salting and loses information when hashes are chained
  -- (e.g. when a gram is hashed as part of a tuple or list).
  hashWithSalt salt = hashWithSalt salt . serialize
  -- The unsalted hash is kept exactly as before: the hash of the serialized
  -- labels.
  hash = hash . serialize