crunch: Add crunch library and generator

The crunch library and generator are tools to
export group graph data to a CSV-format file.

The purpose of crunch is to help quantify the
growth of the Urbit network.

The code added here is work done under the grant
https://urbit.org/grants/bounties/analytics-script/
This commit is contained in:
hosted-fornet 2021-07-24 15:11:54 -07:00
parent ceed4b78d0
commit d43c579d78
4 changed files with 412 additions and 0 deletions

View File

@ -0,0 +1,27 @@
/- ms=metadata-store
/+ crunch
::  export graph data known to `our` into a CSV file in clay
::
::    csv-path: clay path to write; its last segment must be %csv
::    from:     export only posts sent at or after this time
::    to:       export only posts sent at or before this time
::              (left at its default, `now` is used)
::    groups:   groups to export, as /~ship/name paths
::              (left empty, every association is exported)
::    content:  whether to append post content to each row
::              (left unset, content is omitted)
::
:-  %say
|=  $:  [now=@da * bec=beak]
        [csv-path=path from=@da ~]
        [to=@da groups=(list path) content=(unit ?) ~]
    ==
=/  our=@p  p.bec
=/  lib  ~(. crunch [our now])
::  every graph association the ship knows about
::
=/  known=associations:ms  (scry-graph-associations:lib)
::  narrow to the requested groups, if any were given
::
=/  wanted=associations:ms
  ?:  =(~ groups)
    known
  %+  filter-associations-by-group-resources:crunch
    known
  (paths-to-resources:crunch groups)
::  resolve the optional arguments to concrete values
::
=/  with-content=?  (fall content %.n)
=/  until=@da  ?:(=(*@da to) now to)
::  walk the graphs to build the rows of the file
::
=/  rows=wain
  (walk-graph-associations:lib wanted with-content from until)
[%helm-pass (note-write-csv-to-clay:crunch csv-path rows)]

361
pkg/arvo/lib/crunch.hoon Normal file
View File

@ -0,0 +1,361 @@
/- c=crunch, gs=graph-store, ms=metadata-store, p=post, r=resource
::
=<
|_ [our=ship now=@da]
::  +walk-graph-associations: rows for every accessible graph
::
::    folds over `associations`, scrying each graph and appending the
::    rows it yields. an association whose resource is not among the
::    graph-store keys, or whose scry yields nothing, adds no rows.
::    crashes if an association is not for the %graph app or does not
::    carry a %graph config.
::    `content` selects whether post content is appended to each row;
::    `from` and `to` bound the time-sent of exported posts.
::
++  walk-graph-associations
  |=  [=associations:ms content=? from=@da to=@da]
  ^-  wain
  ::  resources of graphs present in `our`; consulted before scrying
  ::  so that, e.g., a graph `our` has left is skipped
  ::
  =/  held=(set resource:r)  (scry-graph-resources)
  %-  ~(rep by associations)
  |=  [[=md-resource:ms =association:ms] out=wain]
  ^-  wain
  ?>  ?=(%graph app-name.md-resource)
  ?>  ?=(%graph -.config.metadatum.association)
  ?.  (~(has in held) resource.md-resource)
    out
  =/  graph=(unit graph:gs)  (scry-graph resource.md-resource)
  ?~  graph
    out
  =/  kind=term  module.config.metadatum.association
  =/  channel-info=channel-info:c
    [group.association resource.md-resource kind]
  ::  chats are walked at the top level only; every other module
  ::  (e.g. links and notes) is walked recursively
  ::
  %+  weld  out
  ?:  =(%chat kind)
    (walk-chat-graph u.graph content channel-info from to)
  %:  walk-nested-graph-for-most-recent-entries
    u.graph
    content
    channel-info
    from
    to
  ==
::
::  +scry-graph: fetch a single graph from graph-store
::
::    produces ~ when the response is not an %add-graph update, or
::    when the graph it carries is empty.
::
++  scry-graph
  |=  graph-resource=resource:r
  ^-  (unit graph:gs)
  =/  scry-path=path
    :~  (scot %p our)
        %graph-store
        (scot %da now)
        %graph
        (scot %p entity.graph-resource)
        name.graph-resource
        %noun
    ==
  =/  response=update:gs  .^(update:gs %gx scry-path)
  ?+  -.q.response  ~
    %add-graph  ?~(graph.q.response ~ [~ graph.q.response])
  ==
::
::  +scry-graph-resources: resources of all graphs in graph-store
::
::    produces the empty set when the response is not a %keys update.
::
++  scry-graph-resources
  |=  ~
  ^-  (set resource:r)
  =/  response=update:gs
    .^(update:gs %gx /(scot %p our)/graph-store/(scot %da now)/keys/noun)
  ?+  -.q.response  ~
    %keys  resources.q.response
  ==
::  +scry-graph-associations: graph associations from metadata-store
::
::    convenience for callers: the product can be passed directly as
::    the first argument of `walk-graph-associations`.
::
++  scry-graph-associations
  |=  ~
  ^-  associations:ms
  =/  scry-path=path
    /(scot %p our)/metadata-store/(scot %da now)/app-name/graph/noun
  .^(associations:ms %gx scry-path)
--
::
|%
::
:: parsing and formatting
::
:: +concatenate-cords: gate that joins two cords end to end
::
:: `cat` curried with block size 3 (bytes); being a two-argument
:: gate it can be chained, as in `:(concatenate-cords a b c)`
::
++ concatenate-cords (cury cat 3)
::
::  +resource-to-cord: render a resource as its entity, '/', its name
::
++  resource-to-cord
  |=  =resource:r
  ^-  @t
  %+  rap  3
  :~  (scot %p entity.resource)
      '/'
      (scot %tas name.resource)
  ==
::
::  +paths-to-resources: parse a list of paths into a set of resources
::
::    each path is parsed with `path-to-resource`; duplicates collapse.
::
++  paths-to-resources
  |=  paxs=(list path)
  ^-  (set resource:r)
  (silt (turn paxs path-to-resource))
::
::  +path-to-resource: parse a /~ship/name path into a resource
::
::    reads only the first two segments. crashes if the path has fewer
::    than two segments or if the first does not parse as a @p.
::
++  path-to-resource
  |=  pax=path
  ^-  resource:r
  ?>  ?=([@ @ *] pax)
  [(slav %p i.pax) i.t.pax]
::
::  +escape-characters-in-cord: make a cord safe to embed in a CSV row
::
::    commas are reserved as the column delimiter, so they are swapped
::    for semicolons first; the result is then escaped with `mesc`.
::
++  escape-characters-in-cord
  |=  =cord
  ^-  @t
  =/  no-commas=tape
    (replace-tape-commas-with-semicolons (trip cord))
  (crip (mesc no-commas))
::
::  +replace-tape-commas-with-semicolons: swap every ',' for ';'
::
::    makes a single pass over the tape; every other character is
::    kept unchanged and the length of the tape is preserved.
::
++  replace-tape-commas-with-semicolons
  |=  string=tape
  ^-  tape
  %+  turn  string
  |=  char=@tD
  ^-  @tD
  ?:(=(',' char) ';' char)
::
::  +contents-to-cord: render a list of contents as one cord
::
::    each content is rendered with `content-to-cord` and the pieces
::    are joined with single spaces; an empty list produces ''.
::
++  contents-to-cord
  |=  contents=(list content:p)
  ^-  @t
  =/  pieces=(list @t)  (turn contents content-to-cord)
  ?~  pieces
    ''
  (join-cords ' ' pieces)
::
::  +content-to-cord: render one piece of post content as a cord
::
::    free-form text (%text, %url and the expression of %code) is
::    passed through `escape-characters-in-cord`, so that commas and
::    control characters such as newlines cannot break the CSV row.
::    mentions and references are rendered from ships and resources.
::
++  content-to-cord
  |=  =content:p
  ^-  @t
  ?-  -.content
    %text       (escape-characters-in-cord text.content)
    %mention    (scot %p ship.content)
    %url        (escape-characters-in-cord url.content)
    ::  TODO: also print output?
    ::
    %code       (escape-characters-in-cord expression.content)
    %reference  (reference-content-to-cord reference.content)
  ==
::
::  +reference-content-to-cord: render a reference as a cord
::
::    a %group reference renders as its group resource; a %graph
::    reference renders as its group, then ': ', then the resource
::    of the referenced uid.
::
++  reference-content-to-cord
  |=  =reference:p
  ^-  @t
  ?-    -.reference
      %group
    (resource-to-cord group.reference)
  ::
      %graph
    %+  rap  3
    :~  (resource-to-cord group.reference)
        ': '
        (resource-to-cord resource.uid.reference)
    ==
  ==
::
::  +format-post-to-comma-separated-cord: metadata columns of a row
::
::    columns, in order: time sent, author, group, channel, channel
::    type. post content is not included here.
::
++  format-post-to-comma-separated-cord
  |=  [=post:gs =channel-info:c]
  ^-  @t
  =/  columns=(list @t)
    :~  (scot %da time-sent.post)
        (scot %p author.post)
        (resource-to-cord group.channel-info)
        (resource-to-cord channel.channel-info)
        (scot %tas channel-type.channel-info)
    ==
  (join-cords ',' columns)
::
::  +join-cords: concatenate cords with a delimiter between them
::
::    produces '' for an empty list. the delimiter is placed between
::    every pair of neighbouring elements, empty ones included, so a
::    leading '' keeps its delimiter and columns stay aligned.
::
++  join-cords
  |=  [delimiter=@t cords=(list @t)]
  ^-  @t
  ?~  cords
    ''
  ::  the first element goes in bare; each later element is
  ::  prefixed with the delimiter
  ::
  %+  rap  3
  :-  i.cords
  %+  turn  t.cords
  |=  cord=@t
  ^-  @t
  (cat 3 delimiter cord)
::
:: walking graphs
::
::  +walk-chat-graph: rows for the top-level nodes of a chat graph
::
::    nodes are taken from `~(val by graph)`, filtered to those sent
::    between `from` and `to`, and emitted in that order. deleted
::    posts and nodes with empty contents produce no row. when
::    `content` is true the rendered contents are appended as a
::    final column.
::
++  walk-chat-graph
  |=  [=graph:gs content=? =channel-info:c from=@da to=@da]
  ^-  wain
  =/  in-window=(list node:gs)
    %+  only-nodes-older-than  to
    %+  only-nodes-newer-than  from
    ~(val by graph)
  %+  murn  in-window
  |=  =node:gs
  ^-  (unit @t)
  ?-    -.post.node
  ::  deleted post: no row
  ::
      %|
    ~
  ::
      %&
    ::  structural node: no row
    ::
    ?~  contents.p.post.node
      ~
    =/  row=@t
      (format-post-to-comma-separated-cord p.post.node channel-info)
    ?.  content
      [~ row]
    :-  ~
    (join-cords ',' ~[row (contents-to-cord contents.p.post.node)])
  ==
::
:: +walk-nested-graph-for-most-recent-entries: rows for a nested graph
::
:: walks the time-filtered nodes of `graph`, recursing into a
:: node's children whenever they are a %graph. within one level,
:: only the last node seen that holds a live post with non-empty
:: contents yields a row; rows coming from child graphs are all
:: kept. when `content` is true the rendered contents are
:: appended to the row as a final column.
::
:: NOTE(review): since one row is kept per level, a graph whose
:: top-level nodes each carry content would export only the last
:: of them -- confirm this is intended for every module.
::
++ walk-nested-graph-for-most-recent-entries
|= [=graph:gs content=? =channel-info:c from=@da to=@da]
^- wain
=| out=wain
:: '' means no qualifying post has been seen yet at this level
::
=| most-recent-post-content=@t
=/ nodes
:: filter by time
::
%+ only-nodes-older-than to
%+ only-nodes-newer-than from
~(val by graph)
%- flop
|-
^- wain
?~ nodes
?: =('' most-recent-post-content)
:: don't return a cell: `['' ~]`
:: we want either an empty list `~`
:: or a list populated with actual entries
::
out
[most-recent-post-content out]
:: descend first: append every row produced by this node's children
::
=? out ?=(%graph -.children.i.nodes)
%+ weld out
%:
walk-nested-graph-for-most-recent-entries
p.children.i.nodes
content
channel-info
from
to
==
::
?- -.post.i.nodes
%|
:: do not keep deleted posts
::
$(nodes t.nodes)
%&
?~ contents.p.post.i.nodes
:: do not keep structural nodes
::
$(nodes t.nodes)
=/ post-no-content=@t (format-post-to-comma-separated-cord p.post.i.nodes channel-info)
:: overwrite rather than append: a later node replaces an earlier one
::
%= $
nodes t.nodes
most-recent-post-content
?- content
%| post-no-content
%&
%+ join-cords ','
~[post-no-content (contents-to-cord contents.p.post.i.nodes)]
==
==
==
::
:: filters
::
::  +filter-associations-by-group-resources: keep only associations
::  whose group is a member of `group-resources`
::
++  filter-associations-by-group-resources
  |=  [=associations:ms group-resources=(set resource:r)]
  ^-  associations:ms
  %-  ~(gas by *associations:ms)
  %+  skim  ~(tap by associations)
  |=  [=md-resource:ms =association:ms]
  ^-  ?
  (~(has in group-resources) group.association)
::  +only-nodes-older-than: nodes sent at or before `time`
::
::    convenience wrapper over `filter-nodes-by-timestamp` that picks
::    the comparator for the caller. `nodes` are passed through in the
::    order given; filtering stops at the first post sent after `time`.
::
++  only-nodes-older-than
  |=  [time=@da nodes=(list node:gs)]
  %^  filter-nodes-by-timestamp  nodes
    lte
  time
::
::  +only-nodes-newer-than: nodes sent at or after `time`
::
::    the list is reversed before filtering so the scan starts from
::    its far end, then reversed back, so the product keeps the order
::    of `nodes`.
::
++  only-nodes-newer-than
  |=  [time=@da nodes=(list node:gs)]
  =/  reversed=(list node:gs)  (flop nodes)
  (flop (filter-nodes-by-timestamp reversed gte time))
::
:: +filter-nodes-by-timestamp: leading run of nodes passing `comparator`
::
:: keeps nodes for as long as `(comparator time-sent time)` holds
:: and stops at the first live post for which it fails; deleted
:: posts are dropped without stopping the scan.
::
++ filter-nodes-by-timestamp
|= [nodes=(list node:gs) comparator=$-([@ @] ?) time=@da]
=| out=(list node:gs)
:: return `out` in same time-order as `nodes`
::
%- flop
|-
^- (list node:gs)
?~ nodes
out
?- -.post.i.nodes
%|
:: skip deleted posts
::
$(nodes t.nodes)
%&
?. (comparator time-sent.p.post.i.nodes time)
:: assume:
:: * time is monotonic
:: * first `%.n` we hit indicates nodes further on are `%.n`
:: (i.e. `nodes` must be ordered st. they start `%.y`,
:: e.g. if want all `nodes` older than given time,
:: `nodes` must start with oldest and comparator is `lte`)
::
out
:: accumulate in reverse; the `flop` above restores input order
::
$(nodes t.nodes, out [i.nodes out])
==
::
:: io
::
::  +note-write-csv-to-clay: build a clay %info note inserting a csv
::
::    crashes unless the last segment of `pax` is %csv (an empty path
::    crashes as well). the note names the %home desk and inserts
::    `file-content` at `pax` under the %csv mark.
::
++  note-write-csv-to-clay
  |=  [pax=path file-content=wain]
  =/  last-index=@ud  (dec (lent pax))
  =/  extension=@ta  (snag last-index pax)
  ?>  =(%csv extension)
  :*  %c
      %info
      %home
      %&
      [pax %ins %csv !>(file-content)]~
  ==
::
--

15
pkg/arvo/mar/csv.hoon Normal file
View File

@ -0,0 +1,15 @@
::  %csv mark: a csv file held as a list of lines
::
|_  csv=wain
::  +grab: convert from other marks to %csv
::
++  grab
  |%
  ::  split the body of a mime into lines
  ::
  ++  mime
    |=  [=mite =octs]
    (to-wain:format q.octs)
  ::  clam from %noun
  ::
  ++  noun  wain
  --
::  +grow: convert from %csv to other marks
::
++  grow
  |%
  ::  join the lines and tag the result as text/csv
  ::
  ++  mime
    [/text/csv (as-octs:mimes:html (of-wain:format csv))]
  --
::  revision control is delegated to the %mime mark
::
++  grad  %mime
--

9
pkg/arvo/sur/crunch.hoon Normal file
View File

@ -0,0 +1,9 @@
/- resource
::
|%
:: +channel-info: identifies the channel that exported rows belong to
::
:: group: resource of the group
:: channel: resource of the channel
:: channel-type: kind of channel, as a term
::
+$ channel-info
$: group=resource:resource
channel=resource:resource
channel-type=term
==
--