mirror of
https://github.com/unisonweb/unison.git
synced 2024-09-11 10:35:57 +03:00
search engine implementation + various utilities and tweaks
This commit is contained in:
parent
75a24f5a7c
commit
3fe4807d55
@ -13,6 +13,7 @@ import Unison.Type (Type)
|
||||
import Unison.Util.Logger (Logger)
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Vector as Vector
|
||||
import qualified Network.URI as URI
|
||||
import qualified Unison.Cryptography as C
|
||||
import qualified Unison.Eval.Interpreter as I
|
||||
import qualified Unison.Hash as Hash
|
||||
@ -219,6 +220,24 @@ make _ blockStore crypto = do
|
||||
op _ = fail "Http.get-url# unpossible"
|
||||
in (r, Just (I.Primop 1 op), unsafeParseType "Text -> Either Text Text", prefix "Http.get-url#")
|
||||
|
||||
, let r = R.Builtin "Uri.parse-scheme"
|
||||
op [Term.Text' url] = pure $ case URI.parseURI (Text.unpack url) of
|
||||
Nothing -> none
|
||||
Just uri -> some . Term.text . Text.pack $ URI.uriScheme uri
|
||||
op _ = error "Uri.parse-scheme unpossible"
|
||||
typ = "Text -> Optional Text"
|
||||
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-scheme")
|
||||
|
||||
, let r = R.Builtin "Uri.parse-authority"
|
||||
op [Term.Text' url] = pure $
|
||||
case URI.parseURI (Text.unpack url) >>= URI.uriAuthority of
|
||||
Nothing -> none
|
||||
Just auth -> some . Term.text . Text.pack $
|
||||
URI.uriUserInfo auth ++ URI.uriRegName auth ++ URI.uriPort auth
|
||||
op _ = error "Uri.parse-authority unpossible"
|
||||
typ = "Text -> Optional Text"
|
||||
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-authority")
|
||||
|
||||
-- Hashing
|
||||
-- add erase, comparison functions
|
||||
, let r = R.Builtin "hash#"
|
||||
|
@ -2,7 +2,7 @@ module Unison.Runtime.Html where
|
||||
|
||||
import Data.Maybe (listToMaybe, catMaybes, mapMaybe)
|
||||
import Data.Text (Text, toLower, pack)
|
||||
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText)
|
||||
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText, isTagOpenName, isTagComment, isTagCloseName)
|
||||
import qualified Data.Text as Text
|
||||
|
||||
data Link = Link { ref :: Text, description :: Text } deriving (Show)
|
||||
@ -26,4 +26,15 @@ getLinks :: Text -> [Link]
|
||||
getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s
|
||||
|
||||
toPlainText :: Text -> Text
|
||||
toPlainText s = innerText $ parseTags s
|
||||
toPlainText s = innerText . ignores $ parseTags s
|
||||
|
||||
ignores :: [Tag Text] -> [Tag Text]
|
||||
ignores = go where
|
||||
script = Text.pack "script"
|
||||
style = Text.pack "style"
|
||||
go [] = []
|
||||
go (hd:tl) = case hd of
|
||||
_ | isTagOpenName script hd -> go (dropWhile (not . isTagCloseName script) tl)
|
||||
| isTagOpenName style hd -> go (dropWhile (not . isTagCloseName style) tl)
|
||||
| isTagComment hd -> go tl
|
||||
| otherwise -> hd : go tl
|
||||
|
@ -97,7 +97,7 @@ scope msg = local tweak where
|
||||
-- | Crash with a message. Include the current logging scope.
|
||||
crash :: String -> Multiplex a
|
||||
crash msg = do
|
||||
warn msg
|
||||
-- warn msg
|
||||
scope msg $ do
|
||||
l <- logger
|
||||
fail (show $ L.getScope l)
|
||||
@ -118,7 +118,7 @@ process recv = scope "Mux.process" $ do
|
||||
callback <- atomically $ M.lookup destination cbs
|
||||
case callback of
|
||||
Nothing -> do
|
||||
L.warn logger $ "dropped packet @ " ++ show (Base64.encode destination)
|
||||
L.info logger $ "dropped packet @ " ++ show (Base64.encode destination)
|
||||
pure True
|
||||
Just callback -> do
|
||||
L.debug logger $ "packet delivered @ " ++ show (Base64.encode destination)
|
||||
|
@ -119,6 +119,7 @@ library
|
||||
mtl,
|
||||
murmur-hash,
|
||||
network,
|
||||
network-uri,
|
||||
network-simple,
|
||||
prelude-extras,
|
||||
process,
|
||||
@ -185,6 +186,7 @@ executable container
|
||||
memory,
|
||||
mmorph,
|
||||
mtl,
|
||||
network-uri,
|
||||
process,
|
||||
safecopy,
|
||||
scotty,
|
||||
@ -264,6 +266,7 @@ executable node
|
||||
memory,
|
||||
mtl,
|
||||
murmur-hash,
|
||||
network-uri,
|
||||
prelude-extras,
|
||||
random,
|
||||
safecopy,
|
||||
|
@ -245,7 +245,7 @@ makeBuiltins logger whnf =
|
||||
|
||||
-- Text
|
||||
, let r = R.Builtin "Text.concatenate"
|
||||
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefixes ["concatenate", "Text"])
|
||||
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefix "Text.concatenate")
|
||||
, let r = R.Builtin "Text.=="
|
||||
in (r, Just (string2' (Term.ref r) (==)), textCompareTyp, prefix "Text.==")
|
||||
, let r = R.Builtin "Text.<"
|
||||
|
@ -74,6 +74,9 @@ tests = withResource Common.node (\_ -> pure ()) $ \node ->
|
||||
, t "Vector.fold-balanced (+) 0 [1,2,3]" "6"
|
||||
, t "Vector.dedup-adjacent (==_Number) [1,1,2,2,3,4,4,4,4,5]" "[1,2,3,4,5]"
|
||||
, t "Vector.dedup Number.Order [1,2,1,5,4,2,4,4,3,5]" "[1,2,3,4,5]"
|
||||
, t "Vector.histogram Number.Order [1,2,1,5,4,2,4,4,3,5]" "[(1,2),(2,2),(3,1),(4,3),(5,2)]"
|
||||
, t "Vector.ranked-histogram Number.Order [1,2,1,5,4,2,4,4,3,5]"
|
||||
"[(4,3),(1,2),(2,2),(5,2),(3,1)]"
|
||||
, t "Vector.range 0 10" "[0,1,2,3,4,5,6,7,8,9]"
|
||||
, t "Vector.range 0 0" "[]"
|
||||
, t "Vector.fold-left (+) 0 (Vector.replicate 5 1)" "5"
|
||||
|
@ -77,9 +77,11 @@ Vector.fold-balanced plus zero vs =
|
||||
Vector.fold-balanced1 : ∀ a . (a -> a -> a) -> Vector a -> Optional a;
|
||||
Vector.fold-balanced1 f v = Vector.fold-balanced (Optional.lift-or f) None (Vector.map Some v);
|
||||
|
||||
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
|
||||
Optional.lift-or f = a1 a2 ->
|
||||
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
|
||||
Vector.join : ∀ a . Vector (Vector a) -> Vector a;
|
||||
Vector.join = Vector.bind identity;
|
||||
|
||||
Vector.filter : ∀ a . (a -> Boolean) -> Vector a -> Vector a;
|
||||
Vector.filter f = Vector.bind (a -> if f a then [a] else []);
|
||||
|
||||
Vector.all? : ∀ a . (a -> Boolean) -> Vector a -> Boolean;
|
||||
Vector.all? f vs = Vector.fold-balanced and True (Vector.map f vs);
|
||||
@ -106,6 +108,33 @@ Vector.dedup-adjacent eq v =
|
||||
[]
|
||||
(Vector.map Vector.pure v);
|
||||
|
||||
Vector.drop-right : ∀ a . Number -> Vector a -> Vector a;
|
||||
Vector.drop-right n v = Vector.take (Vector.size v - n) v;
|
||||
|
||||
Vector.take-right : ∀ a . Number -> Vector a -> Vector a;
|
||||
Vector.take-right n v = Vector.drop (Vector.size v - n) v;
|
||||
|
||||
Vector.histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
|
||||
Vector.histogram o v = let
|
||||
merge-bin b1 b2 = (1st b1, 2nd b1 + 2nd b2);
|
||||
combine bin1 bin2 =
|
||||
Optional.map2 (p1 p2 -> if Order.equal o (1st p1) (1st p2)
|
||||
then [merge-bin p1 p2]
|
||||
else [p1, p2])
|
||||
(Vector.last bin1) (Vector.1st bin2)
|
||||
|> Optional.fold' (u -> Vector.concatenate bin1 bin2)
|
||||
(p -> Vector.join [Vector.drop-right 1 bin1, p, Vector.drop 1 bin2])
|
||||
<| Unit;
|
||||
Vector.fold-balanced combine [] (Vector.map (a -> Vector.pure (a, 1)) (Vector.sort o v));;
|
||||
;
|
||||
|
||||
Vector.ranked-histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
|
||||
Vector.ranked-histogram o v =
|
||||
Vector.histogram o v |> Vector.sort-by (Order.invert Number.Order) 2nd;
|
||||
|
||||
Vector.sum : Vector Number -> Number;
|
||||
Vector.sum = Vector.fold-left (+) 0;
|
||||
|
||||
Vector.dedup : ∀ a . Order a -> Vector a -> Vector a;
|
||||
Vector.dedup o v = Vector.dedup-adjacent (Order.equal o) (Vector.sort o v);
|
||||
|
||||
@ -219,6 +248,13 @@ Optional.map2 f a b = do Optional
|
||||
pure (f a b);;
|
||||
;
|
||||
|
||||
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
|
||||
Optional.lift-or f = a1 a2 ->
|
||||
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
|
||||
|
||||
Optional.fold' : ∀ a b . (Unit -> b) -> (a -> b) -> Optional a -> Unit -> b;
|
||||
Optional.fold' thunk f = Optional.fold thunk (a u -> f a);
|
||||
|
||||
Either.map : ∀ a b c . (b -> c) -> Either a b -> Either a c;
|
||||
Either.map f = Either.fold Left (f `and-then` Right);
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
-- to pick which nodes are responsible for which keys. See:
|
||||
-- https://en.wikipedia.org/wiki/Rendezvous_hashing
|
||||
|
||||
DIndex.Replication-Factor = 3;
|
||||
DIndex.Replication-Factor = 1;
|
||||
DIndex.Timeout = Duration.seconds 10;
|
||||
DIndex.Max-Timeout = Duration.seconds 500;
|
||||
|
||||
|
@ -1,11 +1,94 @@
|
||||
-- going to need tuples at least
|
||||
-- allow declarations somehow
|
||||
-- going to need hashing
|
||||
let
|
||||
alias DIndex k v = Index Node (Index k v);
|
||||
alias Set v = Index v Unit;
|
||||
alias SearchIndex = DIndex Text (Set Text);
|
||||
alias VisitSet = DIndex (Hash Text) Unit;
|
||||
|
||||
let rec
|
||||
dindex : Vector Node
|
||||
-> Remote (k -> Remote (Optional v), -- lookup
|
||||
k -> v -> Remote Unit) -- insert
|
||||
search : Number -> Vector Text -> SearchIndex
|
||||
-> Remote (Vector Text);
|
||||
search limit query ind = do Remote
|
||||
url-sets := Remote.traverse (k -> DIndex.lookup k ind) query;
|
||||
url-sets = Vector.map Index.traversal (Optional.somes url-sets);
|
||||
zero = IndexedTraversal.empty;
|
||||
merge = IndexedTraversal.intersect (Order.by-2nd Hash.Order);
|
||||
urls = Optional.get-or IndexedTraversal.empty <| Vector.fold-balanced1 merge url-sets;
|
||||
urls := IndexedTraversal.take-keys limit urls;
|
||||
pure (Vector.map 1st urls);;
|
||||
;
|
||||
|
||||
in
|
||||
_
|
||||
trim-to-host : Text -> Text;
|
||||
trim-to-host url = Optional.get-or url <| do Optional
|
||||
host := Uri.parse-authority url;
|
||||
scheme := Uri.parse-scheme url;
|
||||
pure (Text.concatenate scheme ("//" `Text.concatenate` host));;
|
||||
;
|
||||
|
||||
-- | Convert url (possibly relative to parent) to an absolute url
|
||||
resolve-url : Text -> Text -> Text;
|
||||
resolve-url parent child =
|
||||
if Text.take 1 child ==_Text "/" then
|
||||
Text.concatenate (trim-to-host parent) child
|
||||
else if (Text.take 5 child ==_Text "http:") `or` (Text.take 6 child ==_Text "https:") then
|
||||
child
|
||||
else parent `Text.concatenate` "/" `Text.concatenate` child
|
||||
;
|
||||
|
||||
crawl : Number -> SearchIndex -> VisitSet -> Text -> Remote Unit;
|
||||
crawl depth ind visited url = let rec
|
||||
insert url keyword = do Remote
|
||||
url-set := DIndex.lookup keyword ind;
|
||||
Optional.fold
|
||||
(do Remote
|
||||
url-set := Index.empty;
|
||||
DIndex.insert keyword url-set ind;
|
||||
insert url keyword;;)
|
||||
(Index.insert url Unit)
|
||||
url-set;;
|
||||
;
|
||||
go depth url =
|
||||
if depth <=_Number 0 then Remote.pure Unit
|
||||
else do Remote
|
||||
page := Remote.map (Debug.log "indexing url" url) (Http.get-url url);
|
||||
page = Either.fold (err -> Debug.log "error fetching" (url, err) "") identity page;
|
||||
page-hash := hash! page;
|
||||
h := DIndex.lookup page-hash visited;
|
||||
Optional.fold
|
||||
(do Remote
|
||||
page-text = Html.plain-text page;
|
||||
keywords = Text.words page-text
|
||||
|> Vector.map Text.lowercase
|
||||
|> Vector.ranked-histogram Text.Order;
|
||||
summary = Vector.drop 5 keywords |> Vector.take 100; -- hacky filter
|
||||
keywords = summary;
|
||||
-- rankings = Debug.watch "rs" <| Vector.map 2nd keywords;
|
||||
-- rankings0 = Debug.watch "kw" <| Vector.map 1st keywords;
|
||||
keywords = Vector.map 1st keywords;
|
||||
links = Html.get-links page;
|
||||
links = Vector.map (Html.get-href `and-then` resolve-url url) links;
|
||||
-- insert all keywords for the page into the map
|
||||
Remote.traverse (insert url) keywords;
|
||||
-- mark page as visited
|
||||
Debug.log "finished indexing" url <| DIndex.insert page-hash Unit visited;
|
||||
-- recurse
|
||||
Remote.traverse (go (depth - 1)) links;
|
||||
pure Unit;;)
|
||||
(x -> Remote.pure (Debug.log "already visited" url Unit))
|
||||
h;;
|
||||
;
|
||||
go depth url;;
|
||||
;
|
||||
|
||||
do Remote
|
||||
n := Remote.spawn;
|
||||
Remote.transfer n;
|
||||
ind := DIndex.empty;
|
||||
visited := DIndex.empty;
|
||||
ind-nodes := Remote.replicate 3 Remote.spawn;
|
||||
visited-nodes := Remote.replicate 3 Remote.spawn;
|
||||
Remote.traverse (n -> Remote.at' n (DIndex.join ind)) ind-nodes;
|
||||
Remote.traverse (n -> Remote.at' n (DIndex.join visited)) visited-nodes;
|
||||
Remote.fork <| crawl 2 ind visited "http://unisonweb.org";
|
||||
Remote.sleep (Duration.seconds 500);
|
||||
results := search 10 ["design", "unison", "refactoring"] ind;
|
||||
pure <| Debug.watch "results --- " results;;
|
||||
;;
|
||||
|
Loading…
Reference in New Issue
Block a user