clean up search engine example and add a few more builtins

This commit is contained in:
Paul Chiusano 2016-10-11 16:53:00 -04:00
parent a317b1124c
commit c5a1c6fed2
6 changed files with 78 additions and 21 deletions

View File

@ -3,6 +3,8 @@ module Unison.Runtime.Html where
import Data.Maybe (listToMaybe, catMaybes, mapMaybe)
import Data.Text (Text, toLower, pack)
import Text.HTML.TagSoup (Tag(..), (~/=), maybeTagText, parseTags, innerText, isTagOpenName, isTagComment, isTagCloseName)
import qualified Text.HTML.TagSoup.Tree as HT
import qualified Data.Char as Char
import qualified Data.Text as Text
data Link = Link { ref :: Text, description :: Text } deriving (Show)
@ -15,6 +17,10 @@ justAnchorSections l =
$ dropWhile (\t -> t ~/= "<a>" && t ~/= "<A>") l
in newSection : justAnchorSections remaining
tagSubtrees :: Text -> [Tag Text] -> [Tag Text]
tagSubtrees tag ts =
HT.flattenTree [x | x@(HT.TagBranch tag' _ _) <- HT.universeTree (HT.tagTree ts), tag == tag' ]
sectionToLink :: [Tag Text] -> Maybe Link
sectionToLink (TagOpen _ attrList : otherTags) =
let href = listToMaybe $ filter (\(a, _) -> toLower a == pack "href") attrList
@ -26,7 +32,15 @@ getLinks :: Text -> [Link]
getLinks s = mapMaybe sectionToLink . justAnchorSections $ parseTags s
toPlainText :: Text -> Text
toPlainText s = innerText . ignores $ parseTags s
toPlainText s =
Text.concat [headings s, Text.pack "\n", collapseSpaces . innerText . ignores $ parseTags s]
collapseSpaces :: Text -> Text
collapseSpaces t =
Text.intercalate (Text.pack " ") . filter (/= Text.empty) . Text.split Char.isSpace $ t
headings :: Text -> Text
headings s = collapseSpaces . innerText . tagSubtrees (Text.pack "h1") . parseTags $ s
ignores :: [Tag Text] -> [Tag Text]
ignores = go where

View File

@ -97,7 +97,7 @@ scope msg = local tweak where
-- | Crash with a message. Include the current logging scope.
crash :: String -> Multiplex a
crash msg = do
-- warn msg
warn msg
scope msg $ do
l <- logger
fail (show $ L.getScope l)

View File

@ -262,6 +262,16 @@ make logger =
op _ = error "Text.words unpossible"
typ = "Text -> Vector Text"
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Text.words")
, let r = R.Builtin "Text.length"
op [Term.Text' txt] = pure $ Term.num (fromIntegral $ Text.length txt)
op _ = error "Text.words unpossible"
typ = "Text -> Number"
in (r, Just (I.Primop 1 op), unsafeParseType typ, prefix "Text.length")
, let r = R.Builtin "Text.newline"
op [] = pure $ Term.text "\n"
op _ = error "Text.newline unpossible"
typ = "Text"
in (r, Just (I.Primop 0 op), unsafeParseType typ, prefix "Text.newline")
-- Pair
, let r = R.Builtin "Pair"

View File

@ -90,6 +90,7 @@ tests = withResource Common.codebase (\_ -> pure ()) $ \codebase ->
, t "Vector.take 0 [1,2,3]" "[]"
, t "Vector.take 2 [1,2,3]" "[1,2]"
, t "Vector.drop 2 [1,2,3]" "[3]"
, t "Text.join [\"a\", \"b\", \"c\"]" "\"abc\""
]
t uneval eval = testCase (uneval ++ "" ++ eval) $ do
(codebase, _, builtins, evaluate) <- codebase

View File

@ -267,3 +267,13 @@ Either.bind = Either.fold Left;
Either.swap : ∀ a b . Either a b -> Either b a;
Either.swap e = Either.fold Right Left e;
Text.join : Vector Text -> Text;
Text.join = Vector.fold-balanced Text.concatenate "";
Text.take-right : Number -> Text -> Text;
Text.take-right n t = Text.drop (Text.length t - n) t;
Text.ends-with : Text -> Text -> Boolean;
Text.ends-with suffix overall =
Text.take-right (Text.length suffix) overall ==_Text suffix;

View File

@ -1,21 +1,38 @@
let
alias DIndex k v = Index Node (Index k v);
alias Set v = Index v Unit;
alias SearchIndex = DIndex Text (Set Text);
alias VisitSet = DIndex (Hash Text) Unit;
-- Maps keywords to set of page content hashes with that keyword
alias SearchIndex = DIndex Text (Set (Hash Text));
-- Maps page hash to canonical Url for that hash
alias CanonicalUrls = DIndex (Hash Text) Text;
-- Maps page hash to a short, plain text exerpt from that page
alias Excerpts = DIndex (Hash Text) Text;
-- Using the search index, returns the list of page hashes (up to limit)
-- whose content contains all the keywords of the query
search : Number -> Vector Text -> SearchIndex
-> Remote (Vector Text);
-> Remote (Vector (Hash Text));
search limit query ind = do Remote
url-sets := Remote.traverse (k -> DIndex.lookup k ind) query;
url-sets = Vector.map Index.traversal (Optional.somes url-sets);
zero = IndexedTraversal.empty;
merge = IndexedTraversal.intersect (Order.by-2nd Hash.Order);
urls = Optional.get-or IndexedTraversal.empty <| Vector.fold-balanced1 merge url-sets;
urls := IndexedTraversal.take-keys limit urls;
pure (Vector.map 1st urls);;
;
-- Plain-text formating of a set of results
format-results : Vector (Hash Text) -> CanonicalUrls -> Excerpts -> Remote Text;
format-results hs urls excerpts = do Remote
urls := Remote.map Optional.somes <| Remote.traverse (h -> DIndex.lookup h urls) hs;
excerpts := Remote.map Optional.somes <| Remote.traverse (h -> DIndex.lookup h excerpts) hs;
fmt = p -> Text.join [1st p, Text.newline, 2nd p, Text.newline, Text.newline];
pure <| Text.join (Vector.map fmt (urls `Vector.zip` excerpts));;
;
trim-to-host : Text -> Text;
trim-to-host url = Optional.get-or url <| do Optional
host := Uri.parse-authority url;
@ -30,11 +47,11 @@ let
Text.concatenate (trim-to-host parent) child
else if (Text.take 5 child ==_Text "http:") `or` (Text.take 6 child ==_Text "https:") then
child
else parent `Text.concatenate` "/" `Text.concatenate` child
else Text.join [parent, if Text.ends-with "/" parent then "" else "/", child]
;
crawl : Number -> SearchIndex -> VisitSet -> Text -> Remote Unit;
crawl depth ind visited url = let rec
crawl : Number -> SearchIndex -> CanonicalUrls -> Excerpts -> Text -> Remote Unit;
crawl depth ind visited excerpts url = let rec
insert url keyword = do Remote
url-set := DIndex.lookup keyword ind;
Optional.fold
@ -58,17 +75,17 @@ let
keywords = Text.words page-text
|> Vector.map Text.lowercase
|> Vector.ranked-histogram Text.Order;
summary = Vector.drop 5 keywords |> Vector.take 100; -- hacky filter
summary = Vector.take 100 keywords; -- hacky filter
keywords = summary;
-- rankings = Debug.watch "rs" <| Vector.map 2nd keywords;
-- rankings0 = Debug.watch "kw" <| Vector.map 1st keywords;
keywords = Vector.map 1st keywords;
links = Html.get-links page;
links = Vector.map (Html.get-href `and-then` resolve-url url) links;
-- insert all keywords for the page into the map
Remote.traverse (insert url) keywords;
Remote.traverse (insert page-hash) keywords;
-- mark page as visited
Debug.log "finished indexing" url <| DIndex.insert page-hash Unit visited;
excerpt = Text.take 400 page-text `Text.concatenate` "...";
DIndex.insert page-hash excerpt excerpts;
Debug.log "finished indexing" url <| DIndex.insert page-hash url visited;
-- recurse
Remote.traverse (go (depth - 1)) links;
pure Unit;;)
@ -85,19 +102,24 @@ let
-- Build DIndex for index state and for crawler state
ind := DIndex.empty;
visited := DIndex.empty;
excerpts := DIndex.empty;
ind-nodes := Remote.replicate 3 Remote.spawn;
visited-nodes := Remote.replicate 3 Remote.spawn;
excerpts-nodes := Remote.replicate 3 Remote.spawn;
Remote.traverse (n -> Remote.at' n (DIndex.join ind)) ind-nodes;
Remote.traverse (n -> Remote.at' n (DIndex.join visited)) visited-nodes;
Remote.traverse (n -> Remote.at' n (DIndex.join excerpts)) excerpts-nodes;
-- Kick off multiple crawlers
Remote.fork <| crawl 2 ind visited "http://unisonweb.org";
Remote.fork <| crawl 4 ind visited "http://unisonweb.org/design";
Remote.fork <| crawl 3 ind visited "http://www.cnn.com";
Remote.fork <| crawl 4 ind visited "http://lambda-the-ultimate.org/";
Remote.fork <| crawl 5 ind visited excerpts "http://unisonweb.org";
Remote.fork <| crawl 5 ind visited excerpts "http://unisonweb.org/design";
Remote.fork <| crawl 5 ind visited excerpts "http://www.cnn.com";
Remote.fork <| crawl 5 ind visited excerpts "http://lambda-the-ultimate.org/";
-- Wait a while for crawlers to index a bunch of pages, then do query
Remote.sleep (Duration.seconds 120);
results := search 10 ["design", "unison", "refactoring"] ind;
pure <| Debug.watch "results --- " results;;
Remote.sleep (Duration.seconds 60);
results := search 10 ["design", "unison"] ind;
results := format-results results visited excerpts;
pure <| Debug.log results Unit results;;
-- pure <| Debug.watch "results" results;;
;;