mirror of
https://github.com/github/semantic.git
synced 2024-12-26 00:12:29 +03:00
Compute feature vectors from Bags of p,q-grams.
This commit is contained in:
parent
226be486c8
commit
686f94d53e
@ -1,9 +1,11 @@
|
|||||||
module Data.Gram where
|
module Data.Gram where
|
||||||
|
|
||||||
import Data.DList
|
|
||||||
import Control.Monad.Random
|
import Control.Monad.Random
|
||||||
|
import Data.DList as DList
|
||||||
import Data.Hashable
|
import Data.Hashable
|
||||||
|
import Data.Vector as Vector
|
||||||
import Prologue
|
import Prologue
|
||||||
|
import Test.QuickCheck.Random
|
||||||
|
|
||||||
-- | A gram over @label@s, made of two label lists: a 'stem' and a 'base'.
-- NOTE(review): presumably the p,q-gram mentioned in the commit description — confirm.
data Gram label = Gram { stem :: [label], base :: [label] }
|
data Gram label = Gram { stem :: [label], base :: [label] }
|
||||||
|
|
||||||
@ -13,6 +15,14 @@ serialize gram = stem gram <> base gram
|
|||||||
|
|
||||||
-- | Alias for 'DList'; the collection type in which grams are passed to 'featureVector'.
type Bag = DList
|
type Bag = DList
|
||||||
|
|
||||||
|
|
||||||
|
-- | Compute a @d@-dimensional feature vector for a bag of grams.
--
-- Every gram is hashed, and the hash seeds a random generator from which @d@
-- components are drawn. The resulting vector is scaled by its Euclidean
-- length, and the scaled vectors of all grams are added component-wise.
-- An empty bag yields the zero vector of length @d@.
featureVector :: Hashable label => Bag (Gram label) -> Int -> Vector Rational
featureVector bag d = DList.foldr addComponents zeroVector (scaledVectorFor . hash <$> bag)
  where
    -- Starting point of the sum: d zeroes.
    zeroVector = Vector.replicate d 0
    -- Component-wise addition of two vectors.
    addComponents = Vector.zipWith (+)
    -- The vector drawn from the generator seeded with @seed@, divided by its length.
    scaledVectorFor seed =
      let components = evalRand (Prologue.sequence (Vector.replicate d getRandom)) (mkQCGen seed)
          sumOfSquares = Vector.sum (fmap (^^ (2 :: Integer)) components)
          -- The square root is taken in Double and converted back, so the
          -- length is only as precise as a Double allows.
          len = toRational (sqrtDouble (fromRational sumOfSquares))
      in fmap (/ len) components
|
||||||
|
|
||||||
-- | Hash a 'Gram' through its serialized label sequence.
--
-- 'hashWithSalt' passes the caller's salt on to the serialized labels instead
-- of discarding it. Composite 'Hashable' instances chain 'hashWithSalt' calls,
-- feeding each result in as the next salt; ignoring the salt would make the
-- hash of any structure containing a 'Gram' independent of everything hashed
-- before it. 'hash' itself is still the hash of 'serialize'.
instance Hashable label => Hashable (Gram label) where
  hashWithSalt salt = hashWithSalt salt . serialize
  hash = hash . serialize
|
||||||
|
Loading…
Reference in New Issue
Block a user