From e266a525fc3fabab83d91d11304ced95ab4beff9 Mon Sep 17 00:00:00 2001 From: gyz-web <76725986+gyz-web@users.noreply.github.com> Date: Wed, 16 Apr 2025 21:21:24 +0800 Subject: [PATCH] HDFS-17769. Allows client to actively retry to Active NameNode when the Observer NameNode is too far behind client state id. --- .../server/namenode/GlobalStateIdContext.java | 7 +++--- .../server/namenode/ha/TestObserverNode.java | 25 +++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GlobalStateIdContext.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GlobalStateIdContext.java index 7d613594efd64..85397a273c62d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GlobalStateIdContext.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GlobalStateIdContext.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hdfs.server.namenode.ha.ObserverReadProxyProvider; import org.apache.hadoop.hdfs.server.namenode.ha.ReadOnly; import org.apache.hadoop.ipc.AlignmentContext; +import org.apache.hadoop.ipc.ObserverRetryOnActiveException; import org.apache.hadoop.ipc.RetriableException; import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcRequestHeaderProto; @@ -156,9 +157,9 @@ public long receiveRequestState(RpcRequestHeaderProto header, ESTIMATED_TRANSACTIONS_PER_SECOND * TimeUnit.MILLISECONDS.toSeconds(clientWaitTime) * ESTIMATED_SERVER_TIME_MULTIPLIER) { - throw new RetriableException( - "Observer Node is too far behind: serverStateId = " - + serverStateId + " clientStateId = " + clientStateId); + throw new ObserverRetryOnActiveException("Retrying to Active NameNode, Observer Node is too" + + " far behind: serverStateId = " + serverStateId + + " clientStateId = " + clientStateId); } return clientStateId; 
} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestObserverNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestObserverNode.java index 55d17d3bb27c9..b27beb585dc87 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestObserverNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestObserverNode.java @@ -518,6 +518,31 @@ public void testObserverRetryActiveException() throws Exception { assertTrue(thrownRetryException); } + /** + * Test that, when the server stateId is too far behind the + * client stateId, the request is retried directly on the + * Active NameNode, instead of constantly trying again. + */ + @Test + public void testObserverRetryActiveExceptionWhenStateIdTooStale() throws Exception { + dfs.mkdir(testPath, FsPermission.getDefault()); + assertSentTo(0); + + // Set a large stateId on the client, so the server stateId is too far behind + // the client stateId and the request will be retried on the active. + long realStateId = HATestUtil.setACStateId(dfs, 1000000); + FileStatus fileStatus = dfs.getFileStatus(testPath); + assertNotNull(fileStatus); + assertSentTo(0); + + // StateId restored to normal, so the request is processed by the observer. + HATestUtil.setACStateId(dfs, realStateId); + FileStatus fileStatus2= dfs.getFileStatus(testPath); + assertNotNull(fileStatus2); + assertSentTo(2); + + } + /** * Test that for open call, if access time update is required, * the open call should go to active, instead of observer.