bazel: Retry cache operation for IOException "Operation timed out" (#14274)

We have seen cache failures caused by:
```
2022-06-13T19:02:13.0964410Z NOT RETRYING: java.io.IOException: Operation timed out
```
which later resulted in:
```
2022-06-13T19:05:42.4633080Z WARNING: Reading from Remote Cache:
2022-06-13T19:05:42.4635340Z com.google.devtools.build.lib.remote.BulkTransferException: Operation timed out
2022-06-13T19:05:42.4636670Z 	at com.google.devtools.build.lib.remote.RemoteCache.waitForBulkTransfer(RemoteCache.java:291)
2022-06-13T19:05:42.4638600Z 	at com.google.devtools.build.lib.remote.RemoteCache.download(RemoteCache.java:466)
2022-06-13T19:05:42.4640330Z 	at com.google.devtools.build.lib.remote.RemoteExecutionService.downloadOutputs(RemoteExecutionService.java:383)
2022-06-13T19:05:42.4643000Z 	at com.google.devtools.build.lib.remote.RemoteSpawnCache.lookup(RemoteSpawnCache.java:120)
2022-06-13T19:05:42.4645110Z 	at com.google.devtools.build.lib.exec.AbstractSpawnStrategy.exec(AbstractSpawnStrategy.java:139)
2022-06-13T19:05:42.4647270Z 	at com.google.devtools.build.lib.exec.AbstractSpawnStrategy.exec(AbstractSpawnStrategy.java:106)
2022-06-13T19:05:42.4649070Z 	at com.google.devtools.build.lib.actions.SpawnStrategy.beginExecution(SpawnStrategy.java:47)
2022-06-13T19:05:42.4650750Z 	at com.google.devtools.build.lib.exec.SpawnStrategyResolver.beginExecution(SpawnStrategyResolver.java:65)
2022-06-13T19:05:42.4655610Z 	at com.google.devtools.build.lib.analysis.actions.SpawnAction.beginExecution(SpawnAction.java:331)
2022-06-13T19:05:42.4657770Z 	at com.google.devtools.build.lib.actions.Action.execute(Action.java:127)
2022-06-13T19:05:42.4667910Z 	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$5.execute(SkyframeActionExecutor.java:855)
2022-06-13T19:05:42.4669320Z 	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$ActionRunner.continueAction(SkyframeActionExecutor.java:1016)
2022-06-13T19:05:42.4670870Z 	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor$ActionRunner.run(SkyframeActionExecutor.java:975)
2022-06-13T19:05:42.4672520Z 	at com.google.devtools.build.lib.skyframe.ActionExecutionState.runStateMachine(ActionExecutionState.java:129)
2022-06-13T19:05:42.4673950Z 	at com.google.devtools.build.lib.skyframe.ActionExecutionState.getResultOrDependOnFuture(ActionExecutionState.java:81)
2022-06-13T19:05:42.4675420Z 	at com.google.devtools.build.lib.skyframe.SkyframeActionExecutor.executeAction(SkyframeActionExecutor.java:472)
2022-06-13T19:05:42.4677050Z 	at com.google.devtools.build.lib.skyframe.ActionExecutionFunction.checkCacheAndExecuteIfNeeded(ActionExecutionFunction.java:834)
2022-06-13T19:05:42.4678230Z 	at com.google.devtools.build.lib.skyframe.ActionExecutionFunction.compute(ActionExecutionFunction.java:307)
2022-06-13T19:05:42.4679530Z 	at com.google.devtools.build.skyframe.AbstractParallelEvaluator$Evaluate.run(AbstractParallelEvaluator.java:477)
2022-06-13T19:05:42.4680390Z 	at com.google.devtools.build.lib.concurrent.AbstractQueueVisitor$WrappedRunnable.run(AbstractQueueVisitor.java:398)
2022-06-13T19:05:42.4681260Z 	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
2022-06-13T19:05:42.4682100Z 	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
2022-06-13T19:05:42.4683170Z 	at java.base/java.lang.Thread.run(Thread.java:829)
2022-06-13T19:05:42.4683650Z 	Suppressed: java.io.IOException: Operation timed out
2022-06-13T19:05:42.4684210Z 		at java.base/sun.nio.ch.FileDispatcherImpl.read0(Native Method)
2022-06-13T19:05:42.4685100Z 		at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
2022-06-13T19:05:42.4685990Z 		at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:276)
2022-06-13T19:05:42.4686710Z 		at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:233)
2022-06-13T19:05:42.4687740Z 		at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:223)
2022-06-13T19:05:42.4688740Z 		at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:356)
2022-06-13T19:05:42.4689620Z 		at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:253)
2022-06-13T19:05:42.4690290Z 		at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1133)
2022-06-13T19:05:42.4692060Z 		at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
2022-06-13T19:05:42.4693020Z 		at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:148)
2022-06-13T19:05:42.4693700Z 		at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714)
2022-06-13T19:05:42.4694350Z 		at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650)
2022-06-13T19:05:42.4695020Z 		at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576)
2022-06-13T19:05:42.4695740Z 		at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
2022-06-13T19:05:42.4696750Z 		at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
2022-06-13T19:05:42.4698000Z 		at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
2022-06-13T19:05:42.4699170Z 		at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
2022-06-13T19:05:42.4700320Z 		... 1 more
```

This was caused by a time out to the cache server while downloading cached
artifacts and should have been retried. (see [1])

CHANGELOG_BEGIN
CHANGELOG_END

[1]: https://gist.github.com/cocreature/98481cd41029e73d1bc2a26a7661e07a
This commit is contained in:
Claudio Bley 2022-06-28 13:37:42 +02:00 committed by GitHub
parent 32d6a4a2dc
commit 8c02057eb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -90,7 +90,7 @@ index 350e1afa51..db81481b60 100644
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -213,7 +216,31 @@ public final class RemoteModule extends BlazeModule {
@@ -213,7 +216,33 @@ public final class RemoteModule extends BlazeModule {
remoteOptions,
creds,
Preconditions.checkNotNull(env.getWorkingDirectory(), "workingDirectory"),
@ -108,6 +108,8 @@ index 350e1afa51..db81481b60 100644
+ String msg = e.getMessage().toLowerCase();
+ if (msg.contains("connection reset by peer")) {
+ retry = true;
+ } else if (msg.contains("operation timed out")) {
+ retry = true;
+ }
+ }
+ if (retry) {