search engine implementation + various utilities and tweaks

This commit is contained in:
Paul Chiusano 2016-10-05 16:57:32 -04:00
parent 75a24f5a7c
commit 3fe4807d55
9 changed files with 174 additions and 19 deletions

View File

@ -13,6 +13,7 @@ import Unison.Type (Type)
import Unison.Util.Logger (Logger) import Unison.Util.Logger (Logger)
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.Vector as Vector import qualified Data.Vector as Vector
import qualified Network.URI as URI
import qualified Unison.Cryptography as C import qualified Unison.Cryptography as C
import qualified Unison.Eval.Interpreter as I import qualified Unison.Eval.Interpreter as I
import qualified Unison.Hash as Hash import qualified Unison.Hash as Hash
@ -219,6 +220,24 @@ make _ blockStore crypto = do
op _ = fail "Http.get-url# unpossible" op _ = fail "Http.get-url# unpossible"
in (r, Just (I.Primop 1 op), unsafeParseType "Text -> Either Text Text", prefix "Http.get-url#") in (r, Just (I.Primop 1 op), unsafeParseType "Text -> Either Text Text", prefix "Http.get-url#")
, let r = R.Builtin "Uri.parse-scheme"
op [Term.Text' url] = pure $ case URI.parseURI (Text.unpack url) of
Nothing -> none
Just uri -> some . Term.text . Text.pack $ URI.uriScheme uri
op _ = error "Uri.parse-scheme unpossible"
typ = "Text -> Optional Text"
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-scheme")
, let r = R.Builtin "Uri.parse-authority"
op [Term.Text' url] = pure $
case URI.parseURI (Text.unpack url) >>= URI.uriAuthority of
Nothing -> none
Just auth -> some . Term.text . Text.pack $
URI.uriUserInfo auth ++ URI.uriRegName auth ++ URI.uriPort auth
op _ = error "Uri.parse-authority unpossible"
typ = "Text -> Optional Text"
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-authority")
-- Hashing -- Hashing
-- add erase, comparison functions -- add erase, comparison functions
, let r = R.Builtin "hash#" , let r = R.Builtin "hash#"

View File

@ -2,7 +2,7 @@ module Unison.Runtime.Html where
import Data.Maybe (listToMaybe, catMaybes, mapMaybe) import Data.Maybe (listToMaybe, catMaybes, mapMaybe)
import Data.Text (Text, toLower, pack) import Data.Text (Text, toLower, pack)
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText) import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText, isTagOpenName, isTagComment, isTagCloseName)
import qualified Data.Text as Text import qualified Data.Text as Text
data Link = Link { ref :: Text, description :: Text } deriving (Show) data Link = Link { ref :: Text, description :: Text } deriving (Show)
@ -26,4 +26,15 @@ getLinks :: Text -> [Link]
getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s
toPlainText :: Text -> Text toPlainText :: Text -> Text
toPlainText s = innerText $ parseTags s toPlainText s = innerText . ignores $ parseTags s
ignores :: [Tag Text] -> [Tag Text]
ignores = go where
script = Text.pack "script"
style = Text.pack "style"
go [] = []
go (hd:tl) = case hd of
_ | isTagOpenName script hd -> go (dropWhile (not . isTagCloseName script) tl)
| isTagOpenName style hd -> go (dropWhile (not . isTagCloseName style) tl)
| isTagComment hd -> go tl
| otherwise -> hd : go tl

View File

@ -97,7 +97,7 @@ scope msg = local tweak where
-- | Crash with a message. Include the current logging scope. -- | Crash with a message. Include the current logging scope.
crash :: String -> Multiplex a crash :: String -> Multiplex a
crash msg = do crash msg = do
warn msg -- warn msg
scope msg $ do scope msg $ do
l <- logger l <- logger
fail (show $ L.getScope l) fail (show $ L.getScope l)
@ -118,7 +118,7 @@ process recv = scope "Mux.process" $ do
callback <- atomically $ M.lookup destination cbs callback <- atomically $ M.lookup destination cbs
case callback of case callback of
Nothing -> do Nothing -> do
L.warn logger $ "dropped packet @ " ++ show (Base64.encode destination) L.info logger $ "dropped packet @ " ++ show (Base64.encode destination)
pure True pure True
Just callback -> do Just callback -> do
L.debug logger $ "packet delivered @ " ++ show (Base64.encode destination) L.debug logger $ "packet delivered @ " ++ show (Base64.encode destination)

View File

@ -119,6 +119,7 @@ library
mtl, mtl,
murmur-hash, murmur-hash,
network, network,
network-uri,
network-simple, network-simple,
prelude-extras, prelude-extras,
process, process,
@ -185,6 +186,7 @@ executable container
memory, memory,
mmorph, mmorph,
mtl, mtl,
network-uri,
process, process,
safecopy, safecopy,
scotty, scotty,
@ -264,6 +266,7 @@ executable node
memory, memory,
mtl, mtl,
murmur-hash, murmur-hash,
network-uri,
prelude-extras, prelude-extras,
random, random,
safecopy, safecopy,

View File

@ -245,7 +245,7 @@ makeBuiltins logger whnf =
-- Text -- Text
, let r = R.Builtin "Text.concatenate" , let r = R.Builtin "Text.concatenate"
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefixes ["concatenate", "Text"]) in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefix "Text.concatenate")
, let r = R.Builtin "Text.==" , let r = R.Builtin "Text.=="
in (r, Just (string2' (Term.ref r) (==)), textCompareTyp, prefix "Text.==") in (r, Just (string2' (Term.ref r) (==)), textCompareTyp, prefix "Text.==")
, let r = R.Builtin "Text.<" , let r = R.Builtin "Text.<"

View File

@ -74,6 +74,9 @@ tests = withResource Common.node (\_ -> pure ()) $ \node ->
, t "Vector.fold-balanced (+) 0 [1,2,3]" "6" , t "Vector.fold-balanced (+) 0 [1,2,3]" "6"
, t "Vector.dedup-adjacent (==_Number) [1,1,2,2,3,4,4,4,4,5]" "[1,2,3,4,5]" , t "Vector.dedup-adjacent (==_Number) [1,1,2,2,3,4,4,4,4,5]" "[1,2,3,4,5]"
, t "Vector.dedup Number.Order [1,2,1,5,4,2,4,4,3,5]" "[1,2,3,4,5]" , t "Vector.dedup Number.Order [1,2,1,5,4,2,4,4,3,5]" "[1,2,3,4,5]"
, t "Vector.histogram Number.Order [1,2,1,5,4,2,4,4,3,5]" "[(1,2),(2,2),(3,1),(4,3),(5,2)]"
, t "Vector.ranked-histogram Number.Order [1,2,1,5,4,2,4,4,3,5]"
"[(4,3),(1,2),(2,2),(5,2),(3,1)]"
, t "Vector.range 0 10" "[0,1,2,3,4,5,6,7,8,9]" , t "Vector.range 0 10" "[0,1,2,3,4,5,6,7,8,9]"
, t "Vector.range 0 0" "[]" , t "Vector.range 0 0" "[]"
, t "Vector.fold-left (+) 0 (Vector.replicate 5 1)" "5" , t "Vector.fold-left (+) 0 (Vector.replicate 5 1)" "5"

View File

@ -77,9 +77,11 @@ Vector.fold-balanced plus zero vs =
Vector.fold-balanced1 : ∀ a . (a -> a -> a) -> Vector a -> Optional a; Vector.fold-balanced1 : ∀ a . (a -> a -> a) -> Vector a -> Optional a;
Vector.fold-balanced1 f v = Vector.fold-balanced (Optional.lift-or f) None (Vector.map Some v); Vector.fold-balanced1 f v = Vector.fold-balanced (Optional.lift-or f) None (Vector.map Some v);
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a; Vector.join : ∀ a . Vector (Vector a) -> Vector a;
Optional.lift-or f = a1 a2 -> Vector.join = Vector.bind identity;
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
Vector.filter : ∀ a . (a -> Boolean) -> Vector a -> Vector a;
Vector.filter f = Vector.bind (a -> if f a then [a] else []);
Vector.all? : ∀ a . (a -> Boolean) -> Vector a -> Boolean; Vector.all? : ∀ a . (a -> Boolean) -> Vector a -> Boolean;
Vector.all? f vs = Vector.fold-balanced and True (Vector.map f vs); Vector.all? f vs = Vector.fold-balanced and True (Vector.map f vs);
@ -106,6 +108,33 @@ Vector.dedup-adjacent eq v =
[] []
(Vector.map Vector.pure v); (Vector.map Vector.pure v);
Vector.drop-right : ∀ a . Number -> Vector a -> Vector a;
Vector.drop-right n v = Vector.take (Vector.size v - n) v;
Vector.take-right : ∀ a . Number -> Vector a -> Vector a;
Vector.take-right n v = Vector.drop (Vector.size v - n) v;
Vector.histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
Vector.histogram o v = let
merge-bin b1 b2 = (1st b1, 2nd b1 + 2nd b2);
combine bin1 bin2 =
Optional.map2 (p1 p2 -> if Order.equal o (1st p1) (1st p2)
then [merge-bin p1 p2]
else [p1, p2])
(Vector.last bin1) (Vector.1st bin2)
|> Optional.fold' (u -> Vector.concatenate bin1 bin2)
(p -> Vector.join [Vector.drop-right 1 bin1, p, Vector.drop 1 bin2])
<| Unit;
Vector.fold-balanced combine [] (Vector.map (a -> Vector.pure (a, 1)) (Vector.sort o v));;
;
Vector.ranked-histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
Vector.ranked-histogram o v =
Vector.histogram o v |> Vector.sort-by (Order.invert Number.Order) 2nd;
Vector.sum : Vector Number -> Number;
Vector.sum = Vector.fold-left (+) 0;
Vector.dedup : ∀ a . Order a -> Vector a -> Vector a; Vector.dedup : ∀ a . Order a -> Vector a -> Vector a;
Vector.dedup o v = Vector.dedup-adjacent (Order.equal o) (Vector.sort o v); Vector.dedup o v = Vector.dedup-adjacent (Order.equal o) (Vector.sort o v);
@ -219,6 +248,13 @@ Optional.map2 f a b = do Optional
pure (f a b);; pure (f a b);;
; ;
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
Optional.lift-or f = a1 a2 ->
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
Optional.fold' : ∀ a b . (Unit -> b) -> (a -> b) -> Optional a -> Unit -> b;
Optional.fold' thunk f = Optional.fold thunk (a u -> f a);
Either.map : ∀ a b c . (b -> c) -> Either a b -> Either a c; Either.map : ∀ a b c . (b -> c) -> Either a b -> Either a c;
Either.map f = Either.fold Left (f `and-then` Right); Either.map f = Either.fold Left (f `and-then` Right);

View File

@ -2,7 +2,7 @@
-- to pick which nodes are responsible for which keys. See: -- to pick which nodes are responsible for which keys. See:
-- https://en.wikipedia.org/wiki/Rendezvous_hashing -- https://en.wikipedia.org/wiki/Rendezvous_hashing
DIndex.Replication-Factor = 3; DIndex.Replication-Factor = 1;
DIndex.Timeout = Duration.seconds 10; DIndex.Timeout = Duration.seconds 10;
DIndex.Max-Timeout = Duration.seconds 500; DIndex.Max-Timeout = Duration.seconds 500;

View File

@ -1,11 +1,94 @@
-- going to need tuples at least let
-- allow declarations somehow alias DIndex k v = Index Node (Index k v);
-- going to need hashing alias Set v = Index v Unit;
alias SearchIndex = DIndex Text (Set Text);
alias VisitSet = DIndex (Hash Text) Unit;
let rec search : Number -> Vector Text -> SearchIndex
dindex : Vector Node -> Remote (Vector Text);
-> Remote (k -> Remote (Optional v), -- lookup search limit query ind = do Remote
k -> v -> Remote Unit) -- insert url-sets := Remote.traverse (k -> DIndex.lookup k ind) query;
url-sets = Vector.map Index.traversal (Optional.somes url-sets);
in zero = IndexedTraversal.empty;
_ merge = IndexedTraversal.intersect (Order.by-2nd Hash.Order);
urls = Optional.get-or IndexedTraversal.empty <| Vector.fold-balanced1 merge url-sets;
urls := IndexedTraversal.take-keys limit urls;
pure (Vector.map 1st urls);;
;
trim-to-host : Text -> Text;
trim-to-host url = Optional.get-or url <| do Optional
host := Uri.parse-authority url;
scheme := Uri.parse-scheme url;
pure (Text.concatenate scheme ("//" `Text.concatenate` host));;
;
-- | Convert url (possibly relative to parent) to an absolute url
resolve-url : Text -> Text -> Text;
resolve-url parent child =
if Text.take 1 child ==_Text "/" then
Text.concatenate (trim-to-host parent) child
else if (Text.take 5 child ==_Text "http:") `or` (Text.take 6 child ==_Text "https:") then
child
else parent `Text.concatenate` "/" `Text.concatenate` child
;
crawl : Number -> SearchIndex -> VisitSet -> Text -> Remote Unit;
crawl depth ind visited url = let rec
insert url keyword = do Remote
url-set := DIndex.lookup keyword ind;
Optional.fold
(do Remote
url-set := Index.empty;
DIndex.insert keyword url-set ind;
insert url keyword;;)
(Index.insert url Unit)
url-set;;
;
go depth url =
if depth <=_Number 0 then Remote.pure Unit
else do Remote
page := Remote.map (Debug.log "indexing url" url) (Http.get-url url);
page = Either.fold (err -> Debug.log "error fetching" (url, err) "") identity page;
page-hash := hash! page;
h := DIndex.lookup page-hash visited;
Optional.fold
(do Remote
page-text = Html.plain-text page;
keywords = Text.words page-text
|> Vector.map Text.lowercase
|> Vector.ranked-histogram Text.Order;
summary = Vector.drop 5 keywords |> Vector.take 100; -- hacky filter
keywords = summary;
-- rankings = Debug.watch "rs" <| Vector.map 2nd keywords;
-- rankings0 = Debug.watch "kw" <| Vector.map 1st keywords;
keywords = Vector.map 1st keywords;
links = Html.get-links page;
links = Vector.map (Html.get-href `and-then` resolve-url url) links;
-- insert all keywords for the page into the map
Remote.traverse (insert url) keywords;
-- mark page as visited
Debug.log "finished indexing" url <| DIndex.insert page-hash Unit visited;
-- recurse
Remote.traverse (go (depth - 1)) links;
pure Unit;;)
(x -> Remote.pure (Debug.log "already visited" url Unit))
h;;
;
go depth url;;
;
do Remote
n := Remote.spawn;
Remote.transfer n;
ind := DIndex.empty;
visited := DIndex.empty;
ind-nodes := Remote.replicate 3 Remote.spawn;
visited-nodes := Remote.replicate 3 Remote.spawn;
Remote.traverse (n -> Remote.at' n (DIndex.join ind)) ind-nodes;
Remote.traverse (n -> Remote.at' n (DIndex.join visited)) visited-nodes;
Remote.fork <| crawl 2 ind visited "http://unisonweb.org";
Remote.sleep (Duration.seconds 500);
results := search 10 ["design", "unison", "refactoring"] ind;
pure <| Debug.watch "results --- " results;;
;;