Merge pull request #197 from tloncorp/as/vitals

vitals: initial commit of connectivity check agent
This commit is contained in:
Hunter Miller 2023-06-21 14:24:02 -05:00 committed by GitHub
commit 0bbc4a4b70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 585 additions and 0 deletions

View File

@ -0,0 +1,47 @@
/- v=vitals
/+ default-agent
^- agent:gall
=>
|%
+$ card card:agent:gall
--
|_ =bowl:gall
+* this .
def ~(. (default-agent this %|) bowl)
++ on-init on-init:def
++ on-save on-save:def
++ on-load on-load:def
++ on-poke
|= [=mark =vase]
^- (quip card _this)
?+ mark (on-poke:def mark vase)
%sub
?> =(our.bowl src.bowl)
=+ !<(=ship vase)
:_ this
:~ :* %pass /updates
%agent [our.bowl %vitals]
%watch /status/(scot %p ship)
== ==
==
++ on-watch on-watch:def
++ on-leave on-leave:def
++ on-peek on-peek:def
++ on-agent
|= [=wire =sign:agent:gall]
^- (quip card _this)
?+ wire (on-agent:def wire sign)
[%updates ~]
?+ -.sign (on-agent:def wire sign)
%fact
?+ p.cage.sign (on-agent:def wire sign)
%vitals-result
=/ =result:v !<(result:v q.cage.sign)
~& result
`this
==
==
==
++ on-arvo on-arvo:def
++ on-fail on-fail:def
--

234
desk/app/vitals.hoon Normal file
View File

@ -0,0 +1,234 @@
:: This app provides a batteries-included view of your connectivity with any
:: ship. An interface should be able to get all relevant information directly
:: from this app and display it to the user. For this reason, it includes many
:: "pending" states and reasons for failure.
::
:: To access these statuses, subscribe to /ship/~sampel-palnet for each ship
:: you care about, and then send pokes of the form [%run-check ~sampel-palnet]
:: to kick off the connectivity check.
::
:: You can also scry to /ship/~sampel-palnet to get the latest information
:: (with timestamp), but you must poke to actively test the state. The special
:: scry paths /sponsor and /galaxy give immediate access to connectivity state
:: with our direct sponsor & galaxy, which will always be up-to-date (within
:: the last minute).
::
:: Internally, when we receive a %run-check poke, we start a thread to attempt
:: to contact that ship, and if it is taking too long, we will investigate and
:: report possible reasons. As the thread progresses, it will give updates on
:: its investigation by poking the app with `%update-status`, and these updates
:: will immediately go out to subscribers.
::
:: TODO:
:: V: Replace foreign sponsor ack/nack with remote scry
:: W: Remote scry foreign sponsor child connection from Ames directly
:: X: Add ping to Ames connection state and surface to %vitals
:: Y: Register ships for regularly scheduled connectivity checks
:: Z: Replace legacy subscription with SSS
::
/- v=vitals
/+ lib=vitals
/+ dbug, default-agent, verb
^- agent:gall
=>
|%
+$ card card:agent:gall
+$ versioned-state current-state
+$ current-state
$: %0
connections=(map ship result:v)
==
--
=| current-state
=* state -
=<
%+ verb |
%- agent:dbug
|_ =bowl:gall
+* this .
def ~(. (default-agent this %|) bowl)
cor ~(. +> [bowl ~])
++ on-init on-init:def
::
++ on-save
!>(state)
::
++ on-load
|= =vase
^- (quip card _this)
=^ cards state
abet:(load:cor vase)
[cards this]
::
++ on-poke
|= [=mark =vase]
^- (quip card _this)
=^ cards state
abet:(poke:cor mark vase)
[cards this]
++ on-watch
|= =path
^- (quip card _this)
=^ cards state
abet:(watch:cor path)
[cards this]
::
++ on-peek peek:cor
::
++ on-leave on-leave:def
::
++ on-fail on-fail:def
::
++ on-agent on-agent:def
::
++ on-arvo
|= [=wire sign=sign-arvo]
^- (quip card _this)
=^ cards state
abet:(arvo:cor wire sign)
[cards this]
--
|_ [=bowl:gall cards=(list card)]
+* our our.bowl
now now.bowl
src src.bowl
++ cor .
++ abet [(flop cards) state]
++ emit |=(=card cor(cards [card cards]))
++ emil |=(caz=(list card) cor(cards (welp (flop caz) cards)))
++ give |=(=gift:agent:gall (emit %give gift))
++ load
|= =vase
=/ loaded-state !<(versioned-state vase)
?- -.loaded-state
%0 cor(connections connections.loaded-state)
==
++ poke
|= [=mark =vase]
^+ cor
?+ mark ~|([%bad-mark mark] !!)
::
:: private pokes
::
%update-status
?> =(our src)
=+ !<(=update:v vase)
=. connections (~(put by connections) update)
%- give
:* %fact
~[/status/(scot %p -.update)]
%vitals-result
!>(+.update)
==
::
%run-check
?> =(our src)
=+ !<(=ship vase)
?> =(~ (find ~[our] (saxo:title our now ship)))
=/ stat=(unit result:v) (~(get by connections) ship)
:: XX: code duplicated because of annoying type issue
?~ stat
=. connections (~(put by connections) [ship now %pending %setting-up ~])
%- emit
:* %pass
/check-result/(scot %p ship)
%arvo
%k
%fard
%garden :: XX: %landscape?
%vitals-connection-check
%noun
!>(ship)
==
?: ?=(%pending -.status.u.stat)
cor
=. connections (~(put by connections) [ship now %pending %setting-up ~])
%- emit
:: [%pass [wire] %arvo %k %fard [beak or desk] [thread-name] [mark] [vase]]
:* %pass
/check-result/(scot %p ship)
%arvo
%k
%fard
%garden :: XX: %landscape?
%vitals-connection-check
%noun
!>(ship)
==
::
:: public pokes
::
%ship
=+ !<(=ship vase)
?< =(our ship)
?< =(~ (find ~[our] (saxo:title our now ship)))
?. ?=([%live *] (scry-qos:lib our now ship))
!!
cor
==
++ watch
|= =path
^+ cor
?+ path ~|([%evil-watch path] !!)
::
[%status @ta ~]
%- give
:* %fact
~[path]
%vitals-result
!> %+ fall
(~(get by connections) (slav %p i.t.path))
[*@da %complete %no-data ~]
==
==
++ peek
|= =path
^- (unit (unit cage))
?+ path [~ ~]
::
[%x %sponsor ~]
?< ?=(%czar (clan:title our))
%- some
%- some
:- %vitals-qos
!> (scry-qos:lib our now (sein:title our now our))
::
[%x %galaxy ~]
?< ?=(%czar (clan:title our))
%- some
%- some
:- %vitals-qos
!> (scry-qos:lib our now (rear (saxo:title our now our)))
::
[%x %ship @tas ~]
%- some
%- some
:- %vitals-result
!>
%+ fall (~(get by connections) (slav %p i.t.t.path))
[*@da %complete %no-data ~]
==
++ arvo
|= [=wire sign=sign-arvo]
^+ cor
?+ wire ~|([%bad-arvo-take wire] !!)
[%check-result @ta ~]
?> ?=(%khan -.sign)
?> ?=(%arow +<.sign)
=/ =ship (slav %p i.t.wire)
=/ =result:v
:- now
:- %complete
?- -.p.sign
%& !<(complete:v q.p.p.sign)
%| [%crash tang.p.p.sign]
==
=. connections (~(put by connections) [ship result])
%- give
:* %fact
~[/status/(scot %p ship)]
%vitals-result
!>(result)
==
==
--

View File

@ -7,4 +7,5 @@
%storage
%reel
%bait
%vitals
==

24
desk/lib/vitals.hoon Normal file
View File

@ -0,0 +1,24 @@
/- vitals
|%
++ simplify-qos
|= =ship-state:ames
^- qos:ames
?- -.ship-state
%alien [%dead *@da]
%known ?+ -.qos.ship-state qos.ship-state
%unborn [%dead last-contact.qos.ship-state]
== ==
++ scry-qos
|= [=ship =time peer=ship]
^- qos:ames
%- simplify-qos
.^ ship-state:ames
%ax
(scot %p ship)
%$
(scot %da time)
%peers
(scot %p peer)
~
==
--

19
desk/mar/vitals/qos.hoon Normal file
View File

@ -0,0 +1,19 @@
/- *vitals
=, format
|_ =qos:ames
++ grab
|%
++ noun qos
--
++ grow
|%
++ noun qos
++ json
%- frond:enjs
:- 'status'
?+ -.qos [%s -.qos]
%unborn !!
==
--
++ grad %noun
--

View File

@ -0,0 +1,39 @@
/- *vitals
=, format
|_ =result
++ grab
|%
++ noun result
--
++ grow
|%
++ noun result
++ json
%- pairs:enjs
:~
['timestamp' (time:enjs timestamp.result)]
::
:- 'status'
%- pairs:enjs
?- -.status.result
%pending
:- ['pending' [%s -.p.status.result]]
?+ -.p.status.result ~
%trying-sponsor ['ship' (ship:enjs ship.p.status.result)]~
==
::
%complete
:- ['complete' [%s -.p.status.result]]
?+ -.p.status.result ~
%no-our-planet ['last-contact' (time:enjs last-contact.p.status.result)]~
%no-our-galaxy ['last-contact' (time:enjs last-contact.p.status.result)]~
%no-sponsor-hit ['ship' (ship:enjs ship.p.status.result)]~
%no-sponsor-miss ['ship' (ship:enjs ship.p.status.result)]~
%no-their-galaxy ['last-contact' (time:enjs last-contact.p.status.result)]~
%crash ['crash' a+(turn tang.p.status.result tank:enjs)]~
==
==
==
--
++ grad %noun
--

44
desk/sur/vitals.hoon Normal file
View File

@ -0,0 +1,44 @@
|%
+| %constants
::
++ info-timeout ~s30
::
++ target-timeout ~s10
::
+| %types
::
+$ result
$: timestamp=@da
=status
==
::
+$ status
$% [%complete p=complete]
[%pending p=pending]
==
::
+$ complete
$% [%yes ~]
[%no-data ~] :: yet to test connectivity for ship
[%no-dns ~] :: can't even talk to example.com
[%no-our-planet last-contact=@da] :: can't reach our own planet (moon only)
[%no-our-galaxy last-contact=@da] :: can't reach our own galaxy
[%no-sponsor-hit =ship] :: their sponsor can reach their ship
[%no-sponsor-miss =ship] :: their sponsor can't reach their ship
[%no-their-galaxy last-contact=@da] :: can't reach their galaxy
[%crash =tang] :: check crashed
==
::
+$ pending
$% [%setting-up ~]
[%trying-dns ~]
[%trying-local ~]
[%trying-target ~]
[%trying-sponsor =ship]
==
::
+$ update
$: =ship
=result
==
--

View File

@ -0,0 +1,177 @@
:: XX: As an alternative implementation, we could perform these checks (mostly)
:: in parallel. In that case, we shouldn't return immediately when a check
:: completes, but instead record the results and then check them after some
:: timeout (e.g. 30s).
::
/- spider, vitals
/+ io=strandio, lib-vitals=vitals
=, strand=strand:spider
^- thread:spider
|= arg=vase
=/ m (strand ,vase)
^- form:m
=+ !<(target=ship arg)
;< our=@p bind:m get-our:io
|^
:: early exit; check if we have live path to target
;< now=@da bind:m get-time:io
;< tqos=qos:ames bind:m (get-qos target)
?: ?& ?=(%live -.tqos)
(gth last-contact.tqos (sub now info-timeout:vitals))
==
(post-result [%yes ~])
:: set pending to %trying-dns
:: XX: can we use the strand cards for these?
;< ~ bind:m (update-status [%trying-dns ~])
:: check if we can fetch example.com
;< ~ bind:m (send-request:io [%'GET' 'http://example.com' ~ ~])
;< =client-response:iris bind:m take-client-response:io
?. ?& ?=(%finished -.client-response)
=(200 status-code.response-header.client-response)
==
(post-result [%no-dns ~])
:: set pending to %trying-local
;< ~ bind:m (update-status [%trying-local ~])
:: check if we can contact our own galaxy
;< gqos=qos:ames bind:m (scry:io qos:ames ~[%gx %vitals %galaxy %vitals-qos])
?. ?=(%live -.gqos)
(post-result [%no-our-galaxy last-contact.gqos])
:: set pending to %trying-target
;< ~ bind:m (update-status [%trying-target ~])
:: check if we can contact target (with timeout)
;< chek=(unit) bind:m (check-online target target-timeout:vitals)
?: ?=([%$ %$] chek)
(post-result [%yes ~])
:: if we're a moon, check if we can contact our planet
::
:: NN: failing to contact our sponsor is only a failure condition for moons,
:: since currently only moons receive additional routing help from their
:: sponsors
:: NN: we do this after the initial target check because if we're a moon and
:: our planet is down, it's useful to talk to ships that still have live
:: wires (e.g. for troubleshooting); thus, by waiting to perform this
:: check, we don't report %no-our-planet for every connectivity check
:: when attempting to track down a live peer from whom to seek help
::
;< sqos=qos:ames
bind:m
=/ mm (strand ,qos:ames)
^- form:mm
?. ?=(%earl (clan:title our))
(pure:mm [%live *@da])
(scry:io qos:ames ~[%gx %vitals %sponsor %vitals-qos])
?. ?=(%live -.sqos)
(post-result [%no-our-planet last-contact.sqos])
:: early exit; if target is a galaxy, there's nothing more we can check
?: ?=(%czar (clan:title target))
(galaxy-down target)
:: check if target sponsors can reach target
;< saxo=(list ship) bind:m (scry:io (list ship) ~[%j %saxo (scot %p target)])
=/ sponsors
?~ saxo ~
t.saxo
|-
:: case impossible:
:: - early exit for target = galaxy
:: - base case is sponsor = galaxy
?~ sponsors !!
:: set pending to %trying-sponsor
;< ~ bind:m (update-status [%trying-sponsor i.sponsors])
:: ask sponsor if he has live wire to target
;< live=(unit ?) bind:m (ask-sponsor i.sponsors)
:: if timeout...
?~ live
:: ... and sponsor is galaxy ...
?: ?=(%czar (clan:title i.sponsors))
:: ... it's so over
(galaxy-down i.sponsors)
:: ... otherwise, check next sponsor
$(sponsors t.sponsors)
:: report whether sponsor can reach target
%- post-result
?: u.live
[%no-sponsor-hit i.sponsors]
[%no-sponsor-miss i.sponsors]
++ update-status
|= =pending:vitals
=/ m (strand ,~)
^- form:m
;< now=@da bind:m get-time:io
%+ poke-our:io
%vitals
:- %update-status
!>
^- update:vitals
[target now %pending pending]
++ get-qos
|= =ship
=/ m (strand ,qos:ames)
^- form:m
;< state=ship-state:ames
bind:m
(scry:io ship-state:ames ~[%ax %$ %peers (scot %p ship)])
(pure:m (simplify-qos:lib-vitals state))
++ galaxy-down
|= galaxy=ship
=/ m (strand ,vase)
^- form:m
;< =qos:ames bind:m (get-qos galaxy)
(post-result [%no-their-galaxy last-contact.qos])
++ post-result
|= =complete:vitals
=/ m (strand ,vase)
^- form:m
(pure:m !>(complete))
++ ask-sponsor
|= sponsor=ship
=/ m (strand ,(unit ?))
^- form:m
%- (handle-err ,?)
%+ (set-timeout:io ,?) target-timeout:vitals
:: XX: currently returns [~ |] if the sponsor doesn't have %vitals running
;< ~
bind:(strand ,?)
%- send-raw-card:io
:* %pass
/poke
%agent
[sponsor %vitals]
%poke
%ship
!>(target)
==
|= tin=strand-input:strand
?+ in.tin `[%skip ~]
~ `[%wait ~]
::
[~ %agent * %poke-ack *]
?. =(/poke wire.u.in.tin)
`[%skip ~]
?~ p.sign.u.in.tin
`[%done &]
`[%done |]
==
++ check-online
|= [who=ship lag=@dr]
=/ m (strand ,(unit))
^- form:m
%- (handle-err ,~)
%+ (set-timeout:io ,~) lag
=/ n (strand ,~)
;< ~ bind:n (poke:io [who %hood] %helm-hi !>(~))
(pure:n ~)
++ handle-err
|* computation-result=mold
=/ m (strand ,(unit computation-result))
=/ n (strand ,computation-result)
|= computation=form:n
^- form:m
|= tin=strand-input:strand
=* loop $
=/ c-res (computation tin)
?+ -.next.c-res c-res
%cont c-res(self.next ..loop(computation self.next.c-res))
%fail c-res(next [%done ~])
%done c-res(value.next (some value.next.c-res))
==
--