Skip to content

Commit

Permalink
tests: reproduce raft panic
Browse files Browse the repository at this point in the history
Signed-off-by: Clement <[email protected]>
  • Loading branch information
clement2026 committed Sep 14, 2024
1 parent 981061a commit 8947d9a
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 5 deletions.
15 changes: 10 additions & 5 deletions server/etcdserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,7 @@ func (s *EtcdServer) applyAll(ep *etcdProgress, apply *toApply) {
<-apply.notifyc

s.triggerSnapshot(ep)
s.maybeCompactRaftLog(ep.appliedi)
select {
// snapshot requested via send()
case m := <-s.r.msgSnapC:
Expand Down Expand Up @@ -2169,6 +2170,10 @@ func (s *EtcdServer) snapshot(snapi uint64, confState raftpb.ConfState) {
"saved snapshot",
zap.Uint64("snapshot-index", snap.Metadata.Index),
)
}

func (s *EtcdServer) maybeCompactRaftLog(appliedi uint64) {
lg := s.Logger()

// When sending a snapshot, etcd will pause compaction.
// After receives a snapshot, the slow follower needs to get all the entries right after
Expand All @@ -2180,13 +2185,13 @@ func (s *EtcdServer) snapshot(snapi uint64, confState raftpb.ConfState) {
return
}

// keep some in memory log entries for slow followers.
compacti := uint64(1)
if snapi > s.Cfg.SnapshotCatchUpEntries {
compacti = snapi - s.Cfg.SnapshotCatchUpEntries
if appliedi <= s.Cfg.SnapshotCatchUpEntries {
return
}

err = s.r.raftStorage.Compact(compacti)
compacti := appliedi - s.Cfg.SnapshotCatchUpEntries

err := s.r.raftStorage.Compact(compacti)
if err != nil {
// the compaction was done asynchronously with the progress of raft.
// raft log might already been compact.
Expand Down
102 changes: 102 additions & 0 deletions tests/integration/reproduce_raft_panic_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Copyright 2024 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package integration

import (
"context"
"testing"
"time"

"github.com/stretchr/testify/assert"

"go.etcd.io/etcd/tests/v3/framework/integration"
)

func TestMustPanic(t *testing.T) {
integration.BeforeTest(t)

clus := integration.NewCluster(t, &integration.ClusterConfig{
Size: 3,
SnapshotCount: 1000,
SnapshotCatchUpEntries: 100,
})
defer clus.Terminate(t)

// inject partition between [member-0] and [member-1, member 2]
clus.Members[0].InjectPartition(t, clus.Members[1:]...)

// wait for leader in [member-1, member 2]
lead := clus.WaitMembersForLeader(t, clus.Members[1:]) + 1
t.Logf("elected lead: %v", clus.Members[lead].Server.MemberID())
time.Sleep(2 * time.Second)

// send 500 put request to [member-1, member 2], resulting at least 400 compaction in raft log
for i := 0; i < 500; i++ {
_, err := clus.Client(1).Put(context.TODO(), "foo", "bar")
if err != nil {
return
}
}

expectMemberLog(t, clus.Members[lead], 5*time.Second, "compacted Raft logs", 400)
expectMemberLog(t, clus.Members[lead], 5*time.Second, "\"compact-index\": 400", 1)

// member-0 rejoins the cluster. Since its appliedIndex is very low (less than 10),
// the leader decides to send a snapshot to member-0.
clus.Members[0].RecoverPartition(t, clus.Members[1:]...)

// Wait for the leader to panic with `panic("need non-empty snapshot")`
time.Sleep(5 * time.Second)
}

func TestMustNotPanic(t *testing.T) {
integration.BeforeTest(t)

clus := integration.NewCluster(t, &integration.ClusterConfig{
Size: 3,
SnapshotCount: 1000,
SnapshotCatchUpEntries: 100,
})
defer clus.Terminate(t)

// inject partition between [member-0] and [member-1, member 2]
clus.Members[0].InjectPartition(t, clus.Members[1:]...)

// wait for leader in [member-1, member 2]
lead := clus.WaitMembersForLeader(t, clus.Members[1:]) + 1
t.Logf("elected lead: %v", clus.Members[lead].Server.MemberID())
time.Sleep(2 * time.Second)

// send 80 put request to [member-1, member 2], no compaction in raft log
for i := 0; i < 80; i++ {
_, err := clus.Client(1).Put(context.TODO(), "foo", "bar")
if err != nil {
return
}
}

// leader should not compact raft logs
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second)
defer cancel()
_, err := clus.Members[lead].LogObserver.Expect(ctx, "compacted Raft logs", 1)
assert.ErrorIs(t, err, context.DeadlineExceeded)

// member-0 rejoins the cluster. Since its appliedIndex is within the leader raft log's range,
// no snapshot should be sent
clus.Members[0].RecoverPartition(t, clus.Members[1:]...)

// No errors should occur during this wait
time.Sleep(5 * time.Second)
}

0 comments on commit 8947d9a

Please sign in to comment.