1
1
mirror of https://github.com/github/semantic.git synced 2024-12-25 16:02:43 +03:00

Compute feature vectors from Bags of p,q-grams.

This commit is contained in:
Rob Rix 2016-06-21 18:50:06 -04:00
parent 226be486c8
commit 686f94d53e

View File

@ -1,9 +1,11 @@
module Data.Gram where module Data.Gram where
import Data.DList
import Control.Monad.Random import Control.Monad.Random
import Data.DList as DList
import Data.Hashable import Data.Hashable
import Data.Vector as Vector
import Prologue import Prologue
import Test.QuickCheck.Random
data Gram label = Gram { stem :: [label], base :: [label] } data Gram label = Gram { stem :: [label], base :: [label] }
@ -13,6 +15,14 @@ serialize gram = stem gram <> base gram
-- | A bag of values, represented as a difference list ('DList').
type Bag = DList
-- | Compute a @d@-dimensional feature vector for a bag of grams.
--
-- Every gram in the bag is hashed, and that hash seeds a QuickCheck generator
-- from which @d@ random components are drawn. The components are divided by
-- their magnitude, and the resulting per-gram vectors are added together
-- component-wise, starting from a vector of @d@ zeroes.
featureVector :: Hashable label => Bag (Gram label) -> Int -> Vector Rational
featureVector bag d = sumVectors (fmap (unitVectorFor . hash) bag)
  where
    -- The generator is seeded with the gram's hash, so the vector produced
    -- here is a pure function of that hash and of @d@.
    unitVectorFor seed =
      normalize (evalRand (Prologue.sequence (Vector.replicate d getRandom)) (mkQCGen seed))

    -- Scale every component by the reciprocal of the vector's magnitude.
    normalize components = fmap (/ magnitude components) components

    -- Square root of the sum of squared components. The root is taken with
    -- 'sqrtDouble', so the sum is converted out of and back into 'Rational'
    -- around that call.
    magnitude components =
      toRational . sqrtDouble . fromRational . Vector.sum $ fmap (^^ (2 :: Integer)) components

    -- Component-wise sum of all vectors in the bag.
    sumVectors vectors = DList.foldr (Vector.zipWith (+)) (Vector.replicate d 0) vectors
-- | A gram is hashed through its serialized form (see 'serialize').
instance Hashable label => Hashable (Gram label) where
  -- Thread the salt through to the serialized labels instead of discarding
  -- it, so that a caller supplying a salt actually gets a salted hash.
  hashWithSalt salt = hashWithSalt salt . serialize
  -- Unsalted hash of the serialized labels.
  hash = hash . serialize