mirror of
https://github.com/unisonweb/unison.git
synced 2024-09-11 10:35:57 +03:00
search engine implementation + various utilities and tweaks
This commit is contained in:
parent
75a24f5a7c
commit
3fe4807d55
@ -13,6 +13,7 @@ import Unison.Type (Type)
|
|||||||
import Unison.Util.Logger (Logger)
|
import Unison.Util.Logger (Logger)
|
||||||
import qualified Data.Text as Text
|
import qualified Data.Text as Text
|
||||||
import qualified Data.Vector as Vector
|
import qualified Data.Vector as Vector
|
||||||
|
import qualified Network.URI as URI
|
||||||
import qualified Unison.Cryptography as C
|
import qualified Unison.Cryptography as C
|
||||||
import qualified Unison.Eval.Interpreter as I
|
import qualified Unison.Eval.Interpreter as I
|
||||||
import qualified Unison.Hash as Hash
|
import qualified Unison.Hash as Hash
|
||||||
@ -219,6 +220,24 @@ make _ blockStore crypto = do
|
|||||||
op _ = fail "Http.get-url# unpossible"
|
op _ = fail "Http.get-url# unpossible"
|
||||||
in (r, Just (I.Primop 1 op), unsafeParseType "Text -> Either Text Text", prefix "Http.get-url#")
|
in (r, Just (I.Primop 1 op), unsafeParseType "Text -> Either Text Text", prefix "Http.get-url#")
|
||||||
|
|
||||||
|
, let r = R.Builtin "Uri.parse-scheme"
|
||||||
|
op [Term.Text' url] = pure $ case URI.parseURI (Text.unpack url) of
|
||||||
|
Nothing -> none
|
||||||
|
Just uri -> some . Term.text . Text.pack $ URI.uriScheme uri
|
||||||
|
op _ = error "Uri.parse-scheme unpossible"
|
||||||
|
typ = "Text -> Optional Text"
|
||||||
|
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-scheme")
|
||||||
|
|
||||||
|
, let r = R.Builtin "Uri.parse-authority"
|
||||||
|
op [Term.Text' url] = pure $
|
||||||
|
case URI.parseURI (Text.unpack url) >>= URI.uriAuthority of
|
||||||
|
Nothing -> none
|
||||||
|
Just auth -> some . Term.text . Text.pack $
|
||||||
|
URI.uriUserInfo auth ++ URI.uriRegName auth ++ URI.uriPort auth
|
||||||
|
op _ = error "Uri.parse-authority unpossible"
|
||||||
|
typ = "Text -> Optional Text"
|
||||||
|
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Uri.parse-authority")
|
||||||
|
|
||||||
-- Hashing
|
-- Hashing
|
||||||
-- add erase, comparison functions
|
-- add erase, comparison functions
|
||||||
, let r = R.Builtin "hash#"
|
, let r = R.Builtin "hash#"
|
||||||
|
@ -2,7 +2,7 @@ module Unison.Runtime.Html where
|
|||||||
|
|
||||||
import Data.Maybe (listToMaybe, catMaybes, mapMaybe)
|
import Data.Maybe (listToMaybe, catMaybes, mapMaybe)
|
||||||
import Data.Text (Text, toLower, pack)
|
import Data.Text (Text, toLower, pack)
|
||||||
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText)
|
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText, isTagOpenName, isTagComment, isTagCloseName)
|
||||||
import qualified Data.Text as Text
|
import qualified Data.Text as Text
|
||||||
|
|
||||||
data Link = Link { ref :: Text, description :: Text } deriving (Show)
|
data Link = Link { ref :: Text, description :: Text } deriving (Show)
|
||||||
@ -26,4 +26,15 @@ getLinks :: Text -> [Link]
|
|||||||
getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s
|
getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s
|
||||||
|
|
||||||
toPlainText :: Text -> Text
|
toPlainText :: Text -> Text
|
||||||
toPlainText s = innerText $ parseTags s
|
toPlainText s = innerText . ignores $ parseTags s
|
||||||
|
|
||||||
|
ignores :: [Tag Text] -> [Tag Text]
|
||||||
|
ignores = go where
|
||||||
|
script = Text.pack "script"
|
||||||
|
style = Text.pack "style"
|
||||||
|
go [] = []
|
||||||
|
go (hd:tl) = case hd of
|
||||||
|
_ | isTagOpenName script hd -> go (dropWhile (not . isTagCloseName script) tl)
|
||||||
|
| isTagOpenName style hd -> go (dropWhile (not . isTagCloseName style) tl)
|
||||||
|
| isTagComment hd -> go tl
|
||||||
|
| otherwise -> hd : go tl
|
||||||
|
@ -97,7 +97,7 @@ scope msg = local tweak where
|
|||||||
-- | Crash with a message. Include the current logging scope.
|
-- | Crash with a message. Include the current logging scope.
|
||||||
crash :: String -> Multiplex a
|
crash :: String -> Multiplex a
|
||||||
crash msg = do
|
crash msg = do
|
||||||
warn msg
|
-- warn msg
|
||||||
scope msg $ do
|
scope msg $ do
|
||||||
l <- logger
|
l <- logger
|
||||||
fail (show $ L.getScope l)
|
fail (show $ L.getScope l)
|
||||||
@ -118,7 +118,7 @@ process recv = scope "Mux.process" $ do
|
|||||||
callback <- atomically $ M.lookup destination cbs
|
callback <- atomically $ M.lookup destination cbs
|
||||||
case callback of
|
case callback of
|
||||||
Nothing -> do
|
Nothing -> do
|
||||||
L.warn logger $ "dropped packet @ " ++ show (Base64.encode destination)
|
L.info logger $ "dropped packet @ " ++ show (Base64.encode destination)
|
||||||
pure True
|
pure True
|
||||||
Just callback -> do
|
Just callback -> do
|
||||||
L.debug logger $ "packet delivered @ " ++ show (Base64.encode destination)
|
L.debug logger $ "packet delivered @ " ++ show (Base64.encode destination)
|
||||||
|
@ -119,6 +119,7 @@ library
|
|||||||
mtl,
|
mtl,
|
||||||
murmur-hash,
|
murmur-hash,
|
||||||
network,
|
network,
|
||||||
|
network-uri,
|
||||||
network-simple,
|
network-simple,
|
||||||
prelude-extras,
|
prelude-extras,
|
||||||
process,
|
process,
|
||||||
@ -185,6 +186,7 @@ executable container
|
|||||||
memory,
|
memory,
|
||||||
mmorph,
|
mmorph,
|
||||||
mtl,
|
mtl,
|
||||||
|
network-uri,
|
||||||
process,
|
process,
|
||||||
safecopy,
|
safecopy,
|
||||||
scotty,
|
scotty,
|
||||||
@ -264,6 +266,7 @@ executable node
|
|||||||
memory,
|
memory,
|
||||||
mtl,
|
mtl,
|
||||||
murmur-hash,
|
murmur-hash,
|
||||||
|
network-uri,
|
||||||
prelude-extras,
|
prelude-extras,
|
||||||
random,
|
random,
|
||||||
safecopy,
|
safecopy,
|
||||||
|
@ -245,7 +245,7 @@ makeBuiltins logger whnf =
|
|||||||
|
|
||||||
-- Text
|
-- Text
|
||||||
, let r = R.Builtin "Text.concatenate"
|
, let r = R.Builtin "Text.concatenate"
|
||||||
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefixes ["concatenate", "Text"])
|
in (r, Just (string2 (Term.ref r) mappend), strOpTyp, prefix "Text.concatenate")
|
||||||
, let r = R.Builtin "Text.=="
|
, let r = R.Builtin "Text.=="
|
||||||
in (r, Just (string2' (Term.ref r) (==)), textCompareTyp, prefix "Text.==")
|
in (r, Just (string2' (Term.ref r) (==)), textCompareTyp, prefix "Text.==")
|
||||||
, let r = R.Builtin "Text.<"
|
, let r = R.Builtin "Text.<"
|
||||||
|
@ -74,6 +74,9 @@ tests = withResource Common.node (\_ -> pure ()) $ \node ->
|
|||||||
, t "Vector.fold-balanced (+) 0 [1,2,3]" "6"
|
, t "Vector.fold-balanced (+) 0 [1,2,3]" "6"
|
||||||
, t "Vector.dedup-adjacent (==_Number) [1,1,2,2,3,4,4,4,4,5]" "[1,2,3,4,5]"
|
, t "Vector.dedup-adjacent (==_Number) [1,1,2,2,3,4,4,4,4,5]" "[1,2,3,4,5]"
|
||||||
, t "Vector.dedup Number.Order [1,2,1,5,4,2,4,4,3,5]" "[1,2,3,4,5]"
|
, t "Vector.dedup Number.Order [1,2,1,5,4,2,4,4,3,5]" "[1,2,3,4,5]"
|
||||||
|
, t "Vector.histogram Number.Order [1,2,1,5,4,2,4,4,3,5]" "[(1,2),(2,2),(3,1),(4,3),(5,2)]"
|
||||||
|
, t "Vector.ranked-histogram Number.Order [1,2,1,5,4,2,4,4,3,5]"
|
||||||
|
"[(4,3),(1,2),(2,2),(5,2),(3,1)]"
|
||||||
, t "Vector.range 0 10" "[0,1,2,3,4,5,6,7,8,9]"
|
, t "Vector.range 0 10" "[0,1,2,3,4,5,6,7,8,9]"
|
||||||
, t "Vector.range 0 0" "[]"
|
, t "Vector.range 0 0" "[]"
|
||||||
, t "Vector.fold-left (+) 0 (Vector.replicate 5 1)" "5"
|
, t "Vector.fold-left (+) 0 (Vector.replicate 5 1)" "5"
|
||||||
|
@ -77,9 +77,11 @@ Vector.fold-balanced plus zero vs =
|
|||||||
Vector.fold-balanced1 : ∀ a . (a -> a -> a) -> Vector a -> Optional a;
|
Vector.fold-balanced1 : ∀ a . (a -> a -> a) -> Vector a -> Optional a;
|
||||||
Vector.fold-balanced1 f v = Vector.fold-balanced (Optional.lift-or f) None (Vector.map Some v);
|
Vector.fold-balanced1 f v = Vector.fold-balanced (Optional.lift-or f) None (Vector.map Some v);
|
||||||
|
|
||||||
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
|
Vector.join : ∀ a . Vector (Vector a) -> Vector a;
|
||||||
Optional.lift-or f = a1 a2 ->
|
Vector.join = Vector.bind identity;
|
||||||
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
|
|
||||||
|
Vector.filter : ∀ a . (a -> Boolean) -> Vector a -> Vector a;
|
||||||
|
Vector.filter f = Vector.bind (a -> if f a then [a] else []);
|
||||||
|
|
||||||
Vector.all? : ∀ a . (a -> Boolean) -> Vector a -> Boolean;
|
Vector.all? : ∀ a . (a -> Boolean) -> Vector a -> Boolean;
|
||||||
Vector.all? f vs = Vector.fold-balanced and True (Vector.map f vs);
|
Vector.all? f vs = Vector.fold-balanced and True (Vector.map f vs);
|
||||||
@ -106,6 +108,33 @@ Vector.dedup-adjacent eq v =
|
|||||||
[]
|
[]
|
||||||
(Vector.map Vector.pure v);
|
(Vector.map Vector.pure v);
|
||||||
|
|
||||||
|
Vector.drop-right : ∀ a . Number -> Vector a -> Vector a;
|
||||||
|
Vector.drop-right n v = Vector.take (Vector.size v - n) v;
|
||||||
|
|
||||||
|
Vector.take-right : ∀ a . Number -> Vector a -> Vector a;
|
||||||
|
Vector.take-right n v = Vector.drop (Vector.size v - n) v;
|
||||||
|
|
||||||
|
Vector.histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
|
||||||
|
Vector.histogram o v = let
|
||||||
|
merge-bin b1 b2 = (1st b1, 2nd b1 + 2nd b2);
|
||||||
|
combine bin1 bin2 =
|
||||||
|
Optional.map2 (p1 p2 -> if Order.equal o (1st p1) (1st p2)
|
||||||
|
then [merge-bin p1 p2]
|
||||||
|
else [p1, p2])
|
||||||
|
(Vector.last bin1) (Vector.1st bin2)
|
||||||
|
|> Optional.fold' (u -> Vector.concatenate bin1 bin2)
|
||||||
|
(p -> Vector.join [Vector.drop-right 1 bin1, p, Vector.drop 1 bin2])
|
||||||
|
<| Unit;
|
||||||
|
Vector.fold-balanced combine [] (Vector.map (a -> Vector.pure (a, 1)) (Vector.sort o v));;
|
||||||
|
;
|
||||||
|
|
||||||
|
Vector.ranked-histogram : ∀ a . Order a -> Vector a -> Vector (a, Number);
|
||||||
|
Vector.ranked-histogram o v =
|
||||||
|
Vector.histogram o v |> Vector.sort-by (Order.invert Number.Order) 2nd;
|
||||||
|
|
||||||
|
Vector.sum : Vector Number -> Number;
|
||||||
|
Vector.sum = Vector.fold-left (+) 0;
|
||||||
|
|
||||||
Vector.dedup : ∀ a . Order a -> Vector a -> Vector a;
|
Vector.dedup : ∀ a . Order a -> Vector a -> Vector a;
|
||||||
Vector.dedup o v = Vector.dedup-adjacent (Order.equal o) (Vector.sort o v);
|
Vector.dedup o v = Vector.dedup-adjacent (Order.equal o) (Vector.sort o v);
|
||||||
|
|
||||||
@ -219,6 +248,13 @@ Optional.map2 f a b = do Optional
|
|||||||
pure (f a b);;
|
pure (f a b);;
|
||||||
;
|
;
|
||||||
|
|
||||||
|
Optional.lift-or : ∀ a . (a -> a -> a) -> Optional a -> Optional a -> Optional a;
|
||||||
|
Optional.lift-or f = a1 a2 ->
|
||||||
|
a1 |> Optional.fold a2 (a1 -> Optional.fold None (a2 -> Some (f a1 a2)) a2);
|
||||||
|
|
||||||
|
Optional.fold' : ∀ a b . (Unit -> b) -> (a -> b) -> Optional a -> Unit -> b;
|
||||||
|
Optional.fold' thunk f = Optional.fold thunk (a u -> f a);
|
||||||
|
|
||||||
Either.map : ∀ a b c . (b -> c) -> Either a b -> Either a c;
|
Either.map : ∀ a b c . (b -> c) -> Either a b -> Either a c;
|
||||||
Either.map f = Either.fold Left (f `and-then` Right);
|
Either.map f = Either.fold Left (f `and-then` Right);
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
-- to pick which nodes are responsible for which keys. See:
|
-- to pick which nodes are responsible for which keys. See:
|
||||||
-- https://en.wikipedia.org/wiki/Rendezvous_hashing
|
-- https://en.wikipedia.org/wiki/Rendezvous_hashing
|
||||||
|
|
||||||
DIndex.Replication-Factor = 3;
|
DIndex.Replication-Factor = 1;
|
||||||
DIndex.Timeout = Duration.seconds 10;
|
DIndex.Timeout = Duration.seconds 10;
|
||||||
DIndex.Max-Timeout = Duration.seconds 500;
|
DIndex.Max-Timeout = Duration.seconds 500;
|
||||||
|
|
||||||
|
@ -1,11 +1,94 @@
|
|||||||
-- going to need tuples at least
|
let
|
||||||
-- allow declarations somehow
|
alias DIndex k v = Index Node (Index k v);
|
||||||
-- going to need hashing
|
alias Set v = Index v Unit;
|
||||||
|
alias SearchIndex = DIndex Text (Set Text);
|
||||||
|
alias VisitSet = DIndex (Hash Text) Unit;
|
||||||
|
|
||||||
let rec
|
search : Number -> Vector Text -> SearchIndex
|
||||||
dindex : Vector Node
|
-> Remote (Vector Text);
|
||||||
-> Remote (k -> Remote (Optional v), -- lookup
|
search limit query ind = do Remote
|
||||||
k -> v -> Remote Unit) -- insert
|
url-sets := Remote.traverse (k -> DIndex.lookup k ind) query;
|
||||||
|
url-sets = Vector.map Index.traversal (Optional.somes url-sets);
|
||||||
in
|
zero = IndexedTraversal.empty;
|
||||||
_
|
merge = IndexedTraversal.intersect (Order.by-2nd Hash.Order);
|
||||||
|
urls = Optional.get-or IndexedTraversal.empty <| Vector.fold-balanced1 merge url-sets;
|
||||||
|
urls := IndexedTraversal.take-keys limit urls;
|
||||||
|
pure (Vector.map 1st urls);;
|
||||||
|
;
|
||||||
|
|
||||||
|
trim-to-host : Text -> Text;
|
||||||
|
trim-to-host url = Optional.get-or url <| do Optional
|
||||||
|
host := Uri.parse-authority url;
|
||||||
|
scheme := Uri.parse-scheme url;
|
||||||
|
pure (Text.concatenate scheme ("//" `Text.concatenate` host));;
|
||||||
|
;
|
||||||
|
|
||||||
|
-- | Convert url (possibly relative to parent) to an absolute url
|
||||||
|
resolve-url : Text -> Text -> Text;
|
||||||
|
resolve-url parent child =
|
||||||
|
if Text.take 1 child ==_Text "/" then
|
||||||
|
Text.concatenate (trim-to-host parent) child
|
||||||
|
else if (Text.take 5 child ==_Text "http:") `or` (Text.take 6 child ==_Text "https:") then
|
||||||
|
child
|
||||||
|
else parent `Text.concatenate` "/" `Text.concatenate` child
|
||||||
|
;
|
||||||
|
|
||||||
|
crawl : Number -> SearchIndex -> VisitSet -> Text -> Remote Unit;
|
||||||
|
crawl depth ind visited url = let rec
|
||||||
|
insert url keyword = do Remote
|
||||||
|
url-set := DIndex.lookup keyword ind;
|
||||||
|
Optional.fold
|
||||||
|
(do Remote
|
||||||
|
url-set := Index.empty;
|
||||||
|
DIndex.insert keyword url-set ind;
|
||||||
|
insert url keyword;;)
|
||||||
|
(Index.insert url Unit)
|
||||||
|
url-set;;
|
||||||
|
;
|
||||||
|
go depth url =
|
||||||
|
if depth <=_Number 0 then Remote.pure Unit
|
||||||
|
else do Remote
|
||||||
|
page := Remote.map (Debug.log "indexing url" url) (Http.get-url url);
|
||||||
|
page = Either.fold (err -> Debug.log "error fetching" (url, err) "") identity page;
|
||||||
|
page-hash := hash! page;
|
||||||
|
h := DIndex.lookup page-hash visited;
|
||||||
|
Optional.fold
|
||||||
|
(do Remote
|
||||||
|
page-text = Html.plain-text page;
|
||||||
|
keywords = Text.words page-text
|
||||||
|
|> Vector.map Text.lowercase
|
||||||
|
|> Vector.ranked-histogram Text.Order;
|
||||||
|
summary = Vector.drop 5 keywords |> Vector.take 100; -- hacky filter
|
||||||
|
keywords = summary;
|
||||||
|
-- rankings = Debug.watch "rs" <| Vector.map 2nd keywords;
|
||||||
|
-- rankings0 = Debug.watch "kw" <| Vector.map 1st keywords;
|
||||||
|
keywords = Vector.map 1st keywords;
|
||||||
|
links = Html.get-links page;
|
||||||
|
links = Vector.map (Html.get-href `and-then` resolve-url url) links;
|
||||||
|
-- insert all keywords for the page into the map
|
||||||
|
Remote.traverse (insert url) keywords;
|
||||||
|
-- mark page as visited
|
||||||
|
Debug.log "finished indexing" url <| DIndex.insert page-hash Unit visited;
|
||||||
|
-- recurse
|
||||||
|
Remote.traverse (go (depth - 1)) links;
|
||||||
|
pure Unit;;)
|
||||||
|
(x -> Remote.pure (Debug.log "already visited" url Unit))
|
||||||
|
h;;
|
||||||
|
;
|
||||||
|
go depth url;;
|
||||||
|
;
|
||||||
|
|
||||||
|
do Remote
|
||||||
|
n := Remote.spawn;
|
||||||
|
Remote.transfer n;
|
||||||
|
ind := DIndex.empty;
|
||||||
|
visited := DIndex.empty;
|
||||||
|
ind-nodes := Remote.replicate 3 Remote.spawn;
|
||||||
|
visited-nodes := Remote.replicate 3 Remote.spawn;
|
||||||
|
Remote.traverse (n -> Remote.at' n (DIndex.join ind)) ind-nodes;
|
||||||
|
Remote.traverse (n -> Remote.at' n (DIndex.join visited)) visited-nodes;
|
||||||
|
Remote.fork <| crawl 2 ind visited "http://unisonweb.org";
|
||||||
|
Remote.sleep (Duration.seconds 500);
|
||||||
|
results := search 10 ["design", "unison", "refactoring"] ind;
|
||||||
|
pure <| Debug.watch "results --- " results;;
|
||||||
|
;;
|
||||||
|
Loading…
Reference in New Issue
Block a user