From 8c02057eb8be5dc12e001acde41793f9370f4cb8 Mon Sep 17 00:00:00 2001 From: Claudio Bley <103681498+cbley-da@users.noreply.github.com> Date: Tue, 28 Jun 2022 13:37:42 +0200 Subject: [PATCH] bazel: Retry cache operation for `IOException` "Operation timed out" (#14274) We have seen cache failures caused by: ``` 2022-06-13T19:02:13.0964410Z NOT RETRYING: java.io.IOException: Operation timed out ``` which later resulted in: ``` 2022-06-13T19:05:42.4633080Z WARNING: Reading from Remote Cache: 2022-06-13T19:05:42.4635340Z com.google.devtools.build.lib.remote.BulkTransferException: Operation timed out 2022-06-13T19:05:42.4636670Z at com.google.devtools.build.lib.remote.RemoteCache.waitForBulkTransfer(RemoteCache.java:291) 2022-06-13T19:05:42.4638600Z at com.google.devtools.build.lib.remote.RemoteCache.download(RemoteCache.java:466) 2022-06-13T19:05:42.4640330Z at com.google.devtools.build.lib.remote.RemoteExecutionService.downloadOutputs(RemoteExecutionService.java:383) 2022-06-13T19:05:42.4643000Z at com.google.devtools.build.lib.remote.RemoteSpawnCache.lookup(RemoteSpawnCache.java:120) 2022-06-13T19:05:42.4645110Z at com.google.devtools.build.lib.exec.AbstractSpawnStrategy.exec(AbstractSpawnStrategy.java:139) 2022-06-13T19:05:42.4647270Z at com.google.devtools.build.lib.exec.AbstractSpawnStrategy.exec(AbstractSpawnStrategy.java:106) 2022-06-13T19:05:42.4649070Z at com.google.devtools.build.lib.actions.SpawnStrategy.beginExecution(SpawnStrategy.java:47) 2022-06-13T19:05:42.4650750Z at com.google.devtools.build.lib.exec.SpawnStrategyResolver.beginExecution(SpawnStrategyResolver.java:65) 2022-06-13T19:05:42.4655610Z at com.google.devtools.build.lib.analysis.actions.SpawnAction.beginExecution(SpawnAction.java:331) 2022-06-13T19:05:42.4657770Z at com.google.devtools.build.lib.actions.Action.execute(Action.java:127) 2022-06-13T19:05:42.4667910Z at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$5.execute(SkyframeActionExecutor.java:855) 2022-06-13T19:05:42.4669320Z at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$ActionRunner.continueAction(SkyframeActionExecutor.java:1016) 2022-06-13T19:05:42.4670870Z at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$ActionRunner.run(SkyframeActionExecutor.java:975) 2022-06-13T19:05:42.4672520Z at com.google.devtools.build.lib.skyframe.ActionExecutionState.runStateMachine(ActionExecutionState.java:129) 2022-06-13T19:05:42.4673950Z at com.google.devtools.build.lib.skyframe.ActionExecutionState.getResultOrDependOnFuture(ActionExecutionState.java:81) 2022-06-13T19:05:42.4675420Z at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor.executeAction(SkyframeActionExecutor.java:472) 2022-06-13T19:05:42.4677050Z at com.google.devtools.build.lib.skyframe.ActionExecutionFunction.checkCacheAndExecuteIfNeeded(ActionExecutionFunction.java:834) 2022-06-13T19:05:42.4678230Z at com.google.devtools.build.lib.skyframe.ActionExecutionFunction.compute(ActionExecutionFunction.java:307) 2022-06-13T19:05:42.4679530Z at com.google.devtools.build.skyframe.AbstractParallelEvaluator$Evaluate.run(AbstractParallelEvaluator.java:477) 2022-06-13T19:05:42.4680390Z at com.google.devtools.build.lib.concurrent.AbstractQueueVisitor$WrappedRunnable.run(AbstractQueueVisitor.java:398) 2022-06-13T19:05:42.4681260Z at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) 2022-06-13T19:05:42.4682100Z at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) 2022-06-13T19:05:42.4683170Z at java.base/java.lang.Thread.run(Thread.java:829) 2022-06-13T19:05:42.4683650Z Suppressed: java.io.IOException: Operation timed out 2022-06-13T19:05:42.4684210Z at java.base/sun.nio.ch.FileDispatcherImpl.read0(Native Method) 2022-06-13T19:05:42.4685100Z at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39) 2022-06-13T19:05:42.4685990Z at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:276) 2022-06-13T19:05:42.4686710Z at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:233) 2022-06-13T19:05:42.4687740Z at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:223) 2022-06-13T19:05:42.4688740Z at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:356) 2022-06-13T19:05:42.4689620Z at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:253) 2022-06-13T19:05:42.4690290Z at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1133) 2022-06-13T19:05:42.4692060Z at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350) 2022-06-13T19:05:42.4693020Z at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:148) 2022-06-13T19:05:42.4693700Z at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714) 2022-06-13T19:05:42.4694350Z at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) 2022-06-13T19:05:42.4695020Z at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) 2022-06-13T19:05:42.4695740Z at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) 2022-06-13T19:05:42.4696750Z at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 2022-06-13T19:05:42.4698000Z at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 2022-06-13T19:05:42.4699170Z at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 2022-06-13T19:05:42.4700320Z ... 1 more ``` This was caused by a time out to the cache server while downloading cached artifacts and should have been retried. (see [1]) CHANGELOG_BEGIN CHANGELOG_END [1]: https://gist.github.com/cocreature/98481cd41029e73d1bc2a26a7661e07a --- nix/bazel-retry-cache.patch | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nix/bazel-retry-cache.patch b/nix/bazel-retry-cache.patch index 76337ecb8a7..15ca2ecda14 100644 --- a/nix/bazel-retry-cache.patch +++ b/nix/bazel-retry-cache.patch @@ -90,7 +90,7 @@ index 350e1afa51..db81481b60 100644 import java.util.HashSet; import java.util.List; import java.util.Map; -@@ -213,7 +216,31 @@ public final class RemoteModule extends BlazeModule { +@@ -213,7 +216,33 @@ public final class RemoteModule extends BlazeModule { remoteOptions, creds, Preconditions.checkNotNull(env.getWorkingDirectory(), "workingDirectory"), @@ -108,6 +108,8 @@ index 350e1afa51..db81481b60 100644 + String msg = e.getMessage().toLowerCase(); + if (msg.contains("connection reset by peer")) { + retry = true; ++ } else if (msg.contains("operation timed out")) { ++ retry = true; + } + } + if (retry) {