search engine implementation + various utilities and tweaks

This commit is contained in:
Paul Chiusano 2016-10-05 16:57:32 -04:00
parent 75a24f5a7c
commit 3fe4807d55
9 changed files with 174 additions and 19 deletions

View File

@ -13,6 +13,7 @@ import Unison.Type (Type)
import Unison.Util.Logger (Logger)
import qualified Data.Text as Text
import qualified Data.Vector as Vector
import qualified Network.URI as URI
import qualified Unison.Cryptography as C
import qualified Unison.Eval.Interpreter as I
import qualified Unison.Hash as Hash
@ -219,6 +220,24 @@ make _ blockStore crypto = do
op _ = fail "Http.get-url# unpossible"
in (r, Just (I.Primop 1 op), unsafeParseType "Text -> Either Text Text", prefix "Http.get-url#")
, let r = R.Builtin "Uri.parse-scheme"
op [Term.Text' url] = pure $ case URI.parseURI (Text.unpack url) of
Nothing -> none
Just uri -> some . Term.text . Text.pack $ URI.uriScheme uri
op _ = error "Uri.parse-scheme unpossible"
typ = "Text -> Optional Text"
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-scheme")
, let r = R.Builtin "Uri.parse-authority"
op [Term.Text' url] = pure $
case URI.parseURI (Text.unpack url) >>= URI.uriAuthority of
Nothing -> none
Just auth -> some . Term.text . Text.pack $
URI.uriUserInfo auth ++ URI.uriRegName auth ++ URI.uriPort auth
op _ = error "Uri.parse-authority unpossible"
typ = "Text -> Optional Text"
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-authority")
-- Hashing
-- add erase, comparison functions
, let r = R.Builtin "hash#"

View File

@ -2,7 +2,7 @@ module Unison.Runtime.Html where
import Data.Maybe (listToMaybe, catMaybes, mapMaybe)
import Data.Text (Text, toLower, pack)
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText)
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText, isTagOpenName, isTagComment, isTagCloseName)
import qualified Data.Text as Text
data Link = Link { ref :: Text, description :: Text } deriving (Show)
@ -26,4 +26,15 @@ getLinks :: Text -> [Link]
getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s
toPlainText :: Text -> Text
toPlainText s = innerText $ parseTags s
toPlainText s = innerText . ignores $ parseTags s
ignores :: [Tag Text] -> [Tag Text]
ignores = go where
script = Text.pack "script"
style = Text.pack "style"
go [] = []
go (hd:tl) = case hd of
_ | isTagOpenName script hd -> go (dropWhile (not . isTagCloseName script) tl)
| isTagOpenName style hd -> go (dropWhile (not . isTagCloseName style) tl)
| isTagComment hd -> go tl
| otherwise -> hd : go tl

View File

@ -97,7 +97,7 @@ scope msg = local tweak where
-- | Crash with a message. Include the current logging scope.
crash :: String -> Multiplex a
crash msg = do
warn msg
-- warn msg
scope msg $ do
l <- logger
fail (show $ L.getScope l)
@ -118,7 +118,7 @@ process recv = scope "Mux.process" $ do
callback <- atomically $ M.lookup destination cbs
case callback of
Nothing -> do
L.warn logger $ "dropped packet @ " ++ show (Base64.encode destination)
L.info logger $ "dropped packet @ " ++ show (Base64.encode destination)
pure True
Just callback -> do
L.debug logger $ "packet delivered @ " ++ show (Base64.encode destination)

View File

@ -119,6 +119,7 @@ library
mtl,
murmur-hash,
network,
network-uri,
network-simple,
prelude-extras,
process,
@ -185,6 +186,7 @@ executable container
memory,
mmorph,
mtl,
network-uri,
process,
safecopy,
scotty,
@ -264,6 +266,7 @@ executable node
memory,
mtl,
murmur-hash,
network-uri,
prelude-extras,
random,
safecopy,

View File

@ -245,7 +245,7 @@ makeBuiltins logger whnf =
-- Text
, let r = R.Builtin "Text.concatenate"
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefixes ["concatenate", "Text"])
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefix "Text.concatenate")
, let r = R.Builtin "Text.=="
in (r, Just (string2' (Term.ref r) (==)), textCompareTyp, prefix "Text.==")
, let r = R.Builtin "Text.<"

View File

@ -74,6 +74,9 @@ tests = withResource Common.node (\_ -> pure ()) $ \node ->
, t "Vector.fold-balanced (+) 0 [1,2,3]" "6"
, t "Vector.dedup-adjacent (==_Number) [1,1,2,2,3,4,4,4,4,5]" "[1,2,3,4,5]"
, t "Vector.dedup Number.Order [1,2,1,5,4,2,4,4,3,5]" "[1,2,3,4,5]"
, t "Vector.histogram Number.Order [1,2,1,5,4,2,4,4,3,5]" "[(1,2),(2,2),(3,1),(4,3),(5,2)]"
, t "Vector.ranked-histogram Number.Order [1,2,1,5,4,2,4,4,3,5]"
"[(4,3),(1,2),(2,2),(5,2),(3,1)]"
, t "Vector.range 0 10" "[0,1,2,3,4,5,6,7,8,9]"
, t "Vector.range 0 0" "[]"
, t "Vector.fold-left (+) 0 (Vector.replicate 5 1)" "5"

View File

@ -77,9 +77,11 @@ Vector.fold-balanced plus zero vs =
Vector.fold-balanced1 : ∀ a . (a -> a -> a) -> Vector a -> Optional a;
Vector.fold-balanced1 f v = Vector.fold-balanced (Optional.lift-or f) None (Vector.map Some v);
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
Optional.lift-or f = a1 a2 ->
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
Vector.join : ∀ a . Vector (Vector a) -> Vector a;
Vector.join = Vector.bind identity;
Vector.filter : ∀ a . (a -> Boolean) -> Vector a -> Vector a;
Vector.filter f = Vector.bind (a -> if f a then [a] else []);
Vector.all? : ∀ a . (a -> Boolean) -> Vector a -> Boolean;
Vector.all? f vs = Vector.fold-balanced and True (Vector.map f vs);
@ -106,6 +108,33 @@ Vector.dedup-adjacent eq v =
[]
(Vector.map Vector.pure v);
Vector.drop-right : ∀ a . Number -> Vector a -> Vector a;
Vector.drop-right n v = Vector.take (Vector.size v - n) v;
Vector.take-right : ∀ a . Number -> Vector a -> Vector a;
Vector.take-right n v = Vector.drop (Vector.size v - n) v;
Vector.histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
Vector.histogram o v = let
merge-bin b1 b2 = (1st b1, 2nd b1 + 2nd b2);
combine bin1 bin2 =
Optional.map2 (p1 p2 -> if Order.equal o (1st p1) (1st p2)
then [merge-bin p1 p2]
else [p1, p2])
(Vector.last bin1) (Vector.1st bin2)
|> Optional.fold' (u -> Vector.concatenate bin1 bin2)
(p -> Vector.join [Vector.drop-right 1 bin1, p, Vector.drop 1 bin2])
<| Unit;
Vector.fold-balanced combine [] (Vector.map (a -> Vector.pure (a, 1)) (Vector.sort o v));;
;
Vector.ranked-histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
Vector.ranked-histogram o v =
Vector.histogram o v |> Vector.sort-by (Order.invert Number.Order) 2nd;
Vector.sum : Vector Number -> Number;
Vector.sum = Vector.fold-left (+) 0;
Vector.dedup : ∀ a . Order a -> Vector a -> Vector a;
Vector.dedup o v = Vector.dedup-adjacent (Order.equal o) (Vector.sort o v);
@ -219,6 +248,13 @@ Optional.map2 f a b = do Optional
pure (f a b);;
;
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
Optional.lift-or f = a1 a2 ->
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
Optional.fold' : ∀ a b . (Unit -> b) -> (a -> b) -> Optional a -> Unit -> b;
Optional.fold' thunk f = Optional.fold thunk (a u -> f a);
Either.map : ∀ a b c . (b -> c) -> Either a b -> Either a c;
Either.map f = Either.fold Left (f `and-then` Right);

View File

@ -2,7 +2,7 @@
-- to pick which nodes are responsible for which keys. See:
-- https://en.wikipedia.org/wiki/Rendezvous_hashing
DIndex.Replication-Factor = 3;
DIndex.Replication-Factor = 1;
DIndex.Timeout = Duration.seconds 10;
DIndex.Max-Timeout = Duration.seconds 500;

View File

@ -1,11 +1,94 @@
-- going to need tuples at least
-- allow declarations somehow
-- going to need hashing
let
alias DIndex k v = Index Node (Index k v);
alias Set v = Index v Unit;
alias SearchIndex = DIndex Text (Set Text);
alias VisitSet = DIndex (Hash Text) Unit;
let rec
dindex : Vector Node
-> Remote (k -> Remote (Optional v), -- lookup
k -> v -> Remote Unit) -- insert
in
_
search : Number -> Vector Text -> SearchIndex
-> Remote (Vector Text);
search limit query ind = do Remote
url-sets := Remote.traverse (k -> DIndex.lookup k ind) query;
url-sets = Vector.map Index.traversal (Optional.somes url-sets);
zero = IndexedTraversal.empty;
merge = IndexedTraversal.intersect (Order.by-2nd Hash.Order);
urls = Optional.get-or IndexedTraversal.empty <| Vector.fold-balanced1 merge url-sets;
urls := IndexedTraversal.take-keys limit urls;
pure (Vector.map 1st urls);;
;
trim-to-host : Text -> Text;
trim-to-host url = Optional.get-or url <| do Optional
host := Uri.parse-authority url;
scheme := Uri.parse-scheme url;
pure (Text.concatenate scheme ("//" `Text.concatenate` host));;
;
-- | Convert url (possibly relative to parent) to an absolute url
resolve-url : Text -> Text -> Text;
resolve-url parent child =
if Text.take 1 child ==_Text "/" then
Text.concatenate (trim-to-host parent) child
else if (Text.take 5 child ==_Text "http:") `or` (Text.take 6 child ==_Text "https:") then
child
else parent `Text.concatenate` "/" `Text.concatenate` child
;
crawl : Number -> SearchIndex -> VisitSet -> Text -> Remote Unit;
crawl depth ind visited url = let rec
insert url keyword = do Remote
url-set := DIndex.lookup keyword ind;
Optional.fold
(do Remote
url-set := Index.empty;
DIndex.insert keyword url-set ind;
insert url keyword;;)
(Index.insert url Unit)
url-set;;
;
go depth url =
if depth <=_Number 0 then Remote.pure Unit
else do Remote
page := Remote.map (Debug.log "indexing url" url) (Http.get-url url);
page = Either.fold (err -> Debug.log "error fetching" (url, err) "") identity page;
page-hash := hash! page;
h := DIndex.lookup page-hash visited;
Optional.fold
(do Remote
page-text = Html.plain-text page;
keywords = Text.words page-text
|> Vector.map Text.lowercase
|> Vector.ranked-histogram Text.Order;
summary = Vector.drop 5 keywords |> Vector.take 100; -- hacky filter
keywords = summary;
-- rankings = Debug.watch "rs" <| Vector.map 2nd keywords;
-- rankings0 = Debug.watch "kw" <| Vector.map 1st keywords;
keywords = Vector.map 1st keywords;
links = Html.get-links page;
links = Vector.map (Html.get-href `and-then` resolve-url url) links;
-- insert all keywords for the page into the map
Remote.traverse (insert url) keywords;
-- mark page as visited
Debug.log "finished indexing" url <| DIndex.insert page-hash Unit visited;
-- recurse
Remote.traverse (go (depth - 1)) links;
pure Unit;;)
(x -> Remote.pure (Debug.log "already visited" url Unit))
h;;
;
go depth url;;
;
do Remote
n := Remote.spawn;
Remote.transfer n;
ind := DIndex.empty;
visited := DIndex.empty;
ind-nodes := Remote.replicate 3 Remote.spawn;
visited-nodes := Remote.replicate 3 Remote.spawn;
Remote.traverse (n -> Remote.at' n (DIndex.join ind)) ind-nodes;
Remote.traverse (n -> Remote.at' n (DIndex.join visited)) visited-nodes;
Remote.fork <| crawl 2 ind visited "http://unisonweb.org";
Remote.sleep (Duration.seconds 500);
results := search 10 ["design", "unison", "refactoring"] ind;
pure <| Debug.watch "results --- " results;;
;;